From c69d2bbeddea61acfb382ea53c40e6ebdfa5c85d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 26 Oct 2018 19:20:27 +0800 Subject: [PATCH 001/719] Add base impl --- .../operators/fused_embedding_seq_pool_op.cc | 158 +++++++++++++ .../operators/fused_embedding_seq_pool_op.h | 207 ++++++++++++++++++ 2 files changed, 365 insertions(+) create mode 100644 paddle/fluid/operators/fused_embedding_seq_pool_op.cc create mode 100644 paddle/fluid/operators/fused_embedding_seq_pool_op.h diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc new file mode 100644 index 0000000000..ea96078291 --- /dev/null +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused_embedding_seq_pool_op.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { + +class LookupTableOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(W) of LookupTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input(Ids) of LookupTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LookupTableOp should not be null."); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); + + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); + + auto output_dims = + framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1)); + output_dims.push_back(table_dims[1]); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + + if (ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("Ids", /*->*/ "Out"); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("W", + "(Tensor) The input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "The last dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update.") + .SetDefault(false); + AddAttr("is_distributed", + "(boolean, default false) distributed lookup table.") + .SetDefault(false); + AddAttr("padding_idx", + "(int64, default -1) " + "If the value is -1, it makes no effect to lookup. " + "Otherwise the given value indicates padding the output " + "with zeros whenever lookup encounters it in Ids.") + .SetDefault(kNoPadding); + AddComment(R"DOC( +Lookup Table Operator. + +This operator is used to perform lookups on the parameter W, +then concatenated into a dense tensor. + +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. + +)DOC"); + } +}; + +class LookupTableOpGradDescMaker + : public framework::DefaultGradOpDescMaker { + using ::paddle::framework::DefaultGradOpDescMaker< + true>::DefaultGradOpDescMaker; + + protected: + virtual std::string GradOpType() const { return "lookup_table_grad"; } +}; + +class LookupTableOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to SelectedRows"; + block->Var(out_var_name) + ->SetType(framework::proto::VarType::SELECTED_ROWS); + } else { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; + block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); + } + block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, + ops::LookupTableOpGradDescMaker, ops::LookupTableOpMaker); +REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, + ops::LookupTableOpGradVarTypeInference); + +// REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, +// ops::LookupTableKernel); +// REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel, +// ops::LookupTableGradKernel); +REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel); +REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel); diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h new file mode 100644 index 0000000000..6dcf4f44a7 --- /dev/null +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -0,0 +1,207 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +constexpr int64_t kNoPadding = -1; + +template +class LookupTableKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *ids_t = context.Input("Ids"); // int tensor + auto *output_t = context.Output("Out"); // float tensor + auto *table_var = context.InputVar("W"); + + int64_t padding_idx = context.Attr("padding_idx"); + int64_t *ids = const_cast(ids_t->data()); + int64_t ids_numel = ids_t->numel(); + + if (table_var->IsType()) { + auto *table_t = context.Input("W"); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_LT(ids[i], row_number); + PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); + memcpy(output + i * row_width, table + ids[i] * row_width, + row_width * sizeof(T)); + } + } + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); + int64_t row_width = table_t.value().dims()[1]; + const auto *table = table_t.value().data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_GE(ids[i], 0); + auto id_index = table_t.Index(ids[i]); + PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); + // memcpy(output + i * row_width, table + id_index * row_width, + // row_width * sizeof(T)); + blas.VCOPY(row_width, table + id_index * row_width, + output + i * row_width); + } + } + } + } +}; + +template +class LookupTableGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + DDim table_dim; + if (table_var->IsType()) { + table_dim = context.Input("W")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW( + "The parameter W of a LookupTable " + "must be either LoDTensor or SelectedRows"); + } + + bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. + if (is_sparse) { + // auto start = std::chrono::system_clock::now(); + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); + + auto *ids_data = ids->data(); + int64_t ids_num = ids->numel(); + // auto end = std::chrono::system_clock::now(); + // std::chrono::duration diff = end - start; + + // auto copy_start = std::chrono::system_clock::now(); + std::vector new_rows; + new_rows.resize(ids_num); + std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t)); + // for (int64_t i = 0; i < ids_num; i++) { + // new_rows.push_back(ids_data[i]); + // } + // auto copy_end = std::chrono::system_clock::now(); + // std::chrono::duration copy_diff = copy_end - copy_start; + // diff += copy_diff; + // LOG(ERROR) << "run emb_grad copy end, cost: " << copy_diff.count() << " + // " << ids_num; + + // copy_start = std::chrono::system_clock::now(); + d_table->set_rows(new_rows); + + auto *d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_num, table_dim[1]}); + d_table_value->ShareDataWith(*d_output); + // d_table_value->mutable_data(context.GetPlace()); + + // // copy_end = std::chrono::system_clock::now(); + // // copy_diff = copy_end - copy_start; + // // diff += copy_diff; + // // LOG(ERROR) << "run emb_grad resize table end, cost: " << + // // copy_diff.count() << " " << ids_num; + + // // copy_start = std::chrono::system_clock::now(); + // d_table->set_height(table_dim[0]); + + // auto *d_output_data = d_output->data(); + // auto *d_table_data = d_table_value->data(); + + // // copy_end = std::chrono::system_clock::now(); + // // copy_diff = copy_end - copy_start; + // // diff += copy_diff; + // // LOG(ERROR) << "run emb_grad set height end, cost: " << + // // copy_diff.count() << " " << ids_num; + + // auto d_output_dims = d_output->dims(); + // PADDLE_ENFORCE_EQ( + // d_table_value->dims(), + // framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); + // // copy_start = std::chrono::system_clock::now(); + // auto blas = math::GetBlas(context); + // blas.VCOPY(d_output->numel(), d_output_data, d_table_data); + // cblas_scopy(d_output->numel(), d_output_data, 1, d_table_data, 1); + // // for (int i = 0; i != d_output->numel(), ++i) { + // // *(d_table_data++) = *(d_output_data++); + // // } + // // memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); + // // copy_end = std::chrono::system_clock::now(); + // // copy_diff = copy_end - copy_start; + // // diff += copy_diff; + // // LOG(ERROR) << "run emb_grad core end, cost: " << copy_diff.count() + // << " + // // " << ids_num << " " << d_output->numel(); + + // // LOG(ERROR) << "run emb_grad end, cost: " << diff.count(); + } else { + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); + + auto *ids_data = ids->data(); + + int N = table_dim[0]; + int D = table_dim[1]; + + auto *d_output_data = d_output->data(); + auto *d_table_data = d_table->mutable_data(context.GetPlace()); + + memset(d_table_data, 0, d_table->numel() * sizeof(T)); + + for (int64_t i = 0; i < ids->numel(); ++i) { + PADDLE_ENFORCE_LT(ids_data[i], N); + PADDLE_ENFORCE_GE(ids_data[i], 0); + for (int j = 0; j < D; ++j) { + d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle From 6db8c3bfeafca8b1522de32f56c450db473bd3e9 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 5 Nov 2018 15:31:19 +0800 Subject: [PATCH 002/719] Implement the infer shape and infer var type --- .../operators/fused_embedding_seq_pool_op.cc | 116 +++++++++++------- .../operators/fused_embedding_seq_pool_op.h | 2 - 2 files changed, 70 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc index ea96078291..5ebaf865fc 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc @@ -18,34 +18,53 @@ limitations under the License. */ namespace paddle { namespace operators { -class LookupTableOp : public framework::OperatorWithKernel { +class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("W"), - "Input(W) of LookupTableOp should not be null."); + "Input W of FusedEmbeddingSeqPoolOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Ids"), - "Input(Ids) of LookupTableOp should not be null."); + "Input Ids of FusedEmbeddingSeqPoolOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of LookupTableOp should not be null."); + "Output of FusedEmbeddingSeqPoolOp should not be null."); auto table_dims = ctx->GetInputDim("W"); auto ids_dims = ctx->GetInputDim("Ids"); - int ids_rank = ids_dims.size(); + const std::string& combiner = ctx->Attrs().Get("combiner"); PADDLE_ENFORCE_EQ(table_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + PADDLE_ENFORCE_GE(ids_dims.size(), 1u, + "The dim size of the 'Ids' tensor must greater than 1."); + PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1, "The last dimension of the 'Ids' tensor must be 1."); + // we only support sum now + PADDLE_ENFORCE_EQ(combiner, "sum"); - auto output_dims = - framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1)); - output_dims.push_back(table_dims[1]); - ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + if (ctx->IsRuntime()) { + Variable* ids_var = boost::get(ctx->GetInputVarPtrs("Ids")[0]); + const auto& ids_lod = ids_var->Get().lod(); - if (ctx->GetOutputsVarType("Out")[0] == - framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("Ids", /*->*/ "Out"); + // in run time, the LoD of ids must be 1 + PADDLE_ENFORCE(ids_lod.size(), 1u, + "The LoD level of Input(Ids) must be 1"); + PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty"); + + size_t batch_size = ids_lod[0].size() - 1; + + // in run time, the shape from Ids -> output + // should be [seq_length, 1] -> [batch_size, embedding_size] + ctx->SetOutputDim("Out", + framework::make_ddim({batch_size, table_dims[1]})); + } else { + // in compile time, the lod level of ids must be 1 + VarDesc* ids_desc = boost::get(ctx->GetInputVarPtrs("Ids")[0]); + PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1); + + // in compile time, the shape from Ids -> output + // should be [-1, 1] -> [-1, embedding_size] + ctx->SetOutputDim("Out", framework::make_ddim({-1, table_dims[1]})); } } @@ -57,7 +76,7 @@ class LookupTableOp : public framework::OperatorWithKernel { } }; -class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { +class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("W", @@ -68,42 +87,44 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "contains the ids to be looked up in W. " "The last dimension size must be 1."); AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("combiner", + "(string, default sum) " + "A string specifying the reduction op. Currently sum " + "are supported, sum computes the weighted sum of the " + "embedding results for each row.") + .SetDefault("sum"); AddAttr("is_sparse", "(boolean, default false) " "Sparse update.") .SetDefault(false); - AddAttr("is_distributed", - "(boolean, default false) distributed lookup table.") - .SetDefault(false); - AddAttr("padding_idx", - "(int64, default -1) " - "If the value is -1, it makes no effect to lookup. " - "Otherwise the given value indicates padding the output " - "with zeros whenever lookup encounters it in Ids.") - .SetDefault(kNoPadding); AddComment(R"DOC( -Lookup Table Operator. +FusedEmbeddingSeqPool Operator. + +Computes embeddings for the given ids and weights. This operator is used to perform lookups on the parameter W, -then concatenated into a dense tensor. +then computes the weighted sum of the lookups results for each row +and concatenated into a dense tensor. -The input Ids can carry the LoD (Level of Details) information, -or not. And the output only shares the LoD information with input Ids. +The input Ids should carry the LoD (Level of Details) information. +And the output will change the LoD information with input Ids. )DOC"); } }; -class LookupTableOpGradDescMaker +class FusedEmbeddingSeqPoolOpGradDescMaker : public framework::DefaultGradOpDescMaker { using ::paddle::framework::DefaultGradOpDescMaker< true>::DefaultGradOpDescMaker; protected: - virtual std::string GradOpType() const { return "lookup_table_grad"; } + virtual std::string GradOpType() const { + return "fused_embedding_seq_pool_grad"; + } }; -class LookupTableOpGrad : public framework::OperatorWithKernel { +class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -120,7 +141,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { } }; -class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { +class FusedEmbeddingSeqPoolOpGradVarTypeInference + : public framework::VarTypeInference { public: void operator()(const framework::OpDesc& op_desc, framework::BlockDesc* block) const override { @@ -128,13 +150,13 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { auto attr = op_desc.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { - VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") - << " is set to SelectedRows"; + VLOG(3) << "fused_embedding_seq_pool_grad op " + << framework::GradVarName("W") << " is set to SelectedRows"; block->Var(out_var_name) ->SetType(framework::proto::VarType::SELECTED_ROWS); } else { - VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") - << " is set to LoDTensor"; + VLOG(3) << "fused_embedding_seq_pool_grad op " + << framework::GradVarName("W") << " is set to LoDTensor"; block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); @@ -145,14 +167,16 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, - ops::LookupTableOpGradDescMaker, ops::LookupTableOpMaker); -REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, - ops::LookupTableOpGradVarTypeInference); - -// REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, -// ops::LookupTableKernel); -// REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel, -// ops::LookupTableGradKernel); -REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel); -REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel); +REGISTER_OPERATOR(fused_embedding_seq_pool, ops::FusedEmbeddingSeqPoolOp, + ops::FusedEmbeddingSeqPoolOpGradDescMaker, + ops::FusedEmbeddingSeqPoolOpMaker); +REGISTER_OPERATOR(fused_embedding_seq_pool_grad, + ops::FusedEmbeddingSeqPoolOpGrad, + ops::FusedEmbeddingSeqPoolOpGradVarTypeInference); + +REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool, + ops::FusedEmbeddingSeqPoolKernel, + ops::FusedEmbeddingSeqPoolKernel); +REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool_grad, + ops::FusedEmbeddingSeqPoolGradKernel, + ops::FusedEmbeddingSeqPoolGradKernel); diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index 6dcf4f44a7..24cffc60a8 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -31,8 +31,6 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; -constexpr int64_t kNoPadding = -1; - template class LookupTableKernel : public framework::OpKernel { public: From 17c8014fcd2071920a605f12951d4f6ae1ddcab9 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 17:42:43 +0800 Subject: [PATCH 003/719] Complete implementation test=develop --- .../operators/fused_embedding_seq_pool_op.cc | 6 + .../operators/fused_embedding_seq_pool_op.h | 182 ++++++------------ 2 files changed, 63 insertions(+), 125 deletions(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc index 5ebaf865fc..e862769051 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc @@ -93,6 +93,12 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { "are supported, sum computes the weighted sum of the " "embedding results for each row.") .SetDefault("sum"); + // NOTE(minqiyang): grad_inplace is an temporal attribute, + // please do NOT set this attribute in python layer. + AddAttr("grad_inplace", + "(boolean, default false) " + "If the grad op reuse the input's variable.") + .SetDefault(false); AddAttr("is_sparse", "(boolean, default false) " "Sparse update.") diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index 24cffc60a8..5af234b937 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -31,62 +31,54 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; +template +struct EmbeddingVSumFunctor { + void operator()(const DeviceContext &context, LoDTensor *table_t, + LoDTensor *ids_t, LoDTensor *output_t) { + auto *table = table_t->data(); + int64_t row_number = table->dims()[0]; + int64_t row_width = table->dims()[1]; + int64_t *ids = const_cast(ids_t->data()); + auto ids_lod = ids_t->LoD()[0]; + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { + size_t begin = ids_lod[i]; + + PADDLE_ENFORCE_LT(ids[begin], row_number); + PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); + blas.VCOPY(row_width, table + ids[begin] * row_width, + output + i * row_width); + + for (int64_t r = ids_lod[i] + 1; r < ids_lod[i + 1]; ++r) { + PADDLE_ENFORCE_LT(ids[r], row_number); + PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); + blas.AXPY(row_width, 1., table + ids[r] * row_width, + output + i * row_width); + } + } + } +}; + template -class LookupTableKernel : public framework::OpKernel { +class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *ids_t = context.Input("Ids"); // int tensor - auto *output_t = context.Output("Out"); // float tensor - auto *table_var = context.InputVar("W"); - - int64_t padding_idx = context.Attr("padding_idx"); - int64_t *ids = const_cast(ids_t->data()); - int64_t ids_numel = ids_t->numel(); - - if (table_var->IsType()) { - auto *table_t = context.Input("W"); - int64_t row_number = table_t->dims()[0]; - int64_t row_width = table_t->dims()[1]; - - auto *table = table_t->data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_LT(ids[i], row_number); - PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); - memcpy(output + i * row_width, table + ids[i] * row_width, - row_width * sizeof(T)); - } - } - } else if (table_var->IsType()) { - const auto &table_t = table_var->Get(); - int64_t row_width = table_t.value().dims()[1]; - const auto *table = table_t.value().data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - auto blas = math::GetBlas(context); - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_GE(ids[i], 0); - auto id_index = table_t.Index(ids[i]); - PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); - // memcpy(output + i * row_width, table + id_index * row_width, - // row_width * sizeof(T)); - blas.VCOPY(row_width, table + id_index * row_width, - output + i * row_width); - } - } + LoDTensor *ids_t = context.Input("Ids"); // int tensor + LoDTensor *output_t = context.Output("Out"); // float tensor + LoDTensor *table_var = context.Input("W"); + const std::string &combiner_type = context.Attr("combiner"); + + if (combiner_type == "sum") { + EmbeddingVSumFunctor functor; + functor(context.template device_context(), ids_t, output_t, table_var); } } }; template -class LookupTableGradKernel : public framework::OpKernel { +class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *table_var = context.InputVar("W"); @@ -106,97 +98,37 @@ class LookupTableGradKernel : public framework::OpKernel { // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { - // auto start = std::chrono::system_clock::now(); auto *ids = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); auto *d_table = context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); - // auto end = std::chrono::system_clock::now(); - // std::chrono::duration diff = end - start; + auto lod = ids->lod()[0]; + int64_t row_width = table_dim[1]; - // auto copy_start = std::chrono::system_clock::now(); - std::vector new_rows; + framework::Vector new_rows; new_rows.resize(ids_num); std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t)); - // for (int64_t i = 0; i < ids_num; i++) { - // new_rows.push_back(ids_data[i]); - // } - // auto copy_end = std::chrono::system_clock::now(); - // std::chrono::duration copy_diff = copy_end - copy_start; - // diff += copy_diff; - // LOG(ERROR) << "run emb_grad copy end, cost: " << copy_diff.count() << " - // " << ids_num; - - // copy_start = std::chrono::system_clock::now(); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_num, table_dim[1]}); - d_table_value->ShareDataWith(*d_output); - // d_table_value->mutable_data(context.GetPlace()); - - // // copy_end = std::chrono::system_clock::now(); - // // copy_diff = copy_end - copy_start; - // // diff += copy_diff; - // // LOG(ERROR) << "run emb_grad resize table end, cost: " << - // // copy_diff.count() << " " << ids_num; - - // // copy_start = std::chrono::system_clock::now(); - // d_table->set_height(table_dim[0]); - - // auto *d_output_data = d_output->data(); - // auto *d_table_data = d_table_value->data(); - - // // copy_end = std::chrono::system_clock::now(); - // // copy_diff = copy_end - copy_start; - // // diff += copy_diff; - // // LOG(ERROR) << "run emb_grad set height end, cost: " << - // // copy_diff.count() << " " << ids_num; - - // auto d_output_dims = d_output->dims(); - // PADDLE_ENFORCE_EQ( - // d_table_value->dims(), - // framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); - // // copy_start = std::chrono::system_clock::now(); - // auto blas = math::GetBlas(context); - // blas.VCOPY(d_output->numel(), d_output_data, d_table_data); - // cblas_scopy(d_output->numel(), d_output_data, 1, d_table_data, 1); - // // for (int i = 0; i != d_output->numel(), ++i) { - // // *(d_table_data++) = *(d_output_data++); - // // } - // // memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); - // // copy_end = std::chrono::system_clock::now(); - // // copy_diff = copy_end - copy_start; - // // diff += copy_diff; - // // LOG(ERROR) << "run emb_grad core end, cost: " << copy_diff.count() - // << " - // // " << ids_num << " " << d_output->numel(); - - // // LOG(ERROR) << "run emb_grad end, cost: " << diff.count(); - } else { - auto *ids = context.Input("Ids"); - auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); - - auto *ids_data = ids->data(); - - int N = table_dim[0]; - int D = table_dim[1]; - - auto *d_output_data = d_output->data(); - auto *d_table_data = d_table->mutable_data(context.GetPlace()); - - memset(d_table_data, 0, d_table->numel() * sizeof(T)); - - for (int64_t i = 0; i < ids->numel(); ++i) { - PADDLE_ENFORCE_LT(ids_data[i], N); - PADDLE_ENFORCE_GE(ids_data[i], 0); - for (int j = 0; j < D; ++j) { - d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; + d_table_value->Resize({ids_num, row_width}); + T *d_table_data = d_table_value->mutable_data(context.GetPlace()); + const T *d_output_data = d_output->data(); + + auto blas = math::GetBlas(context); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + int64_t h = static_cast(lod[i + 1] - lod[i]); + int64_t in_offset = lod[i] * row_width; + const T *out_pos = d_output_data + i * row_width; + T *in_pos = d_table_data + in_offset; + for (int r = 0; r != h; ++r) { + blas.VCOPY(row_width, out_pos, in_pos + r * row_width); } } + } else { + LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now"; } } }; From 8a412c0d3308a0c9b90e8e7295ac117b6735b533 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 20:04:05 +0800 Subject: [PATCH 004/719] Complete impl --- .../operators/fused_embedding_seq_pool_op.cc | 18 ++++--- .../operators/fused_embedding_seq_pool_op.h | 49 +++++++++++-------- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc index e862769051..6b6b898d4c 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc @@ -42,8 +42,14 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { // we only support sum now PADDLE_ENFORCE_EQ(combiner, "sum"); + int64_t last_dim = table_dims[1]; + for (int i = 1; i != ids_dims.size(); ++i) { + last_dim *= ids_dims[i]; + } + if (ctx->IsRuntime()) { - Variable* ids_var = boost::get(ctx->GetInputVarPtrs("Ids")[0]); + framework::Variable* ids_var = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); const auto& ids_lod = ids_var->Get().lod(); // in run time, the LoD of ids must be 1 @@ -51,20 +57,20 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { "The LoD level of Input(Ids) must be 1"); PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty"); - size_t batch_size = ids_lod[0].size() - 1; + int64_t batch_size = ids_lod[0].size() - 1; // in run time, the shape from Ids -> output // should be [seq_length, 1] -> [batch_size, embedding_size] - ctx->SetOutputDim("Out", - framework::make_ddim({batch_size, table_dims[1]})); + ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim})); } else { // in compile time, the lod level of ids must be 1 - VarDesc* ids_desc = boost::get(ctx->GetInputVarPtrs("Ids")[0]); + framework::VarDesc* ids_desc = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1); // in compile time, the shape from Ids -> output // should be [-1, 1] -> [-1, embedding_size] - ctx->SetOutputDim("Out", framework::make_ddim({-1, table_dims[1]})); + ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim})); } } diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index 5af234b937..7385c8da33 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -31,31 +31,38 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; -template +template struct EmbeddingVSumFunctor { - void operator()(const DeviceContext &context, LoDTensor *table_t, - LoDTensor *ids_t, LoDTensor *output_t) { + void operator()(const framework::ExecutionContext &context, + const LoDTensor *table_t, const LoDTensor *ids_t, + LoDTensor *output_t) { auto *table = table_t->data(); - int64_t row_number = table->dims()[0]; - int64_t row_width = table->dims()[1]; + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + int64_t last_dim = output_t->dims()[1]; int64_t *ids = const_cast(ids_t->data()); - auto ids_lod = ids_t->LoD()[0]; + auto ids_lod = ids_t->lod()[0]; + int64_t ids_count = ids_t->numel() / ids_lod.back(); + auto *output = output_t->mutable_data(context.GetPlace()); - auto blas = math::GetBlas(context); + auto blas = math::GetBlas(context); for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { - size_t begin = ids_lod[i]; + for (int64_t j = 0; j != ids_count; ++j) { + size_t begin = ids_lod[i] * ids_count; - PADDLE_ENFORCE_LT(ids[begin], row_number); - PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); - blas.VCOPY(row_width, table + ids[begin] * row_width, - output + i * row_width); + PADDLE_ENFORCE_LT(ids[begin], row_number); + PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); + blas.VCOPY(row_width, table + ids[begin] * row_width, + output + i * last_dim + j * row_width); + } - for (int64_t r = ids_lod[i] + 1; r < ids_lod[i + 1]; ++r) { + for (int64_t r = (ids_lod[i] + 1) * ids_count; + r < ids_lod[i + 1] * ids_count; ++r) { PADDLE_ENFORCE_LT(ids[r], row_number); PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); blas.AXPY(row_width, 1., table + ids[r] * row_width, - output + i * row_width); + output + i * row_width + (r % ids_count) * row_width); } } } @@ -65,14 +72,14 @@ template class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - LoDTensor *ids_t = context.Input("Ids"); // int tensor - LoDTensor *output_t = context.Output("Out"); // float tensor - LoDTensor *table_var = context.Input("W"); + const LoDTensor *ids_t = context.Input("Ids"); // int tensor + LoDTensor *output_t = context.Output("Out"); // float tensor + const LoDTensor *table_var = context.Input("W"); const std::string &combiner_type = context.Attr("combiner"); if (combiner_type == "sum") { EmbeddingVSumFunctor functor; - functor(context.template device_context(), ids_t, output_t, table_var); + functor(context, table_var, ids_t, output_t); } } }; @@ -105,7 +112,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); auto lod = ids->lod()[0]; - int64_t row_width = table_dim[1]; + int64_t row_width = d_output->dims()[1]; framework::Vector new_rows; new_rows.resize(ids_num); @@ -113,11 +120,11 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_num, row_width}); + d_table_value->Resize({ids_num, table_dim[1]}); T *d_table_data = d_table_value->mutable_data(context.GetPlace()); const T *d_output_data = d_output->data(); - auto blas = math::GetBlas(context); + auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); int64_t in_offset = lod[i] * row_width; From 3d784c27011a127de3c5730d8ee121102fadba6f Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 20:05:18 +0800 Subject: [PATCH 005/719] Polish code --- paddle/fluid/operators/fused_embedding_seq_pool_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc index 6b6b898d4c..966bdb4df5 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.cc @@ -35,7 +35,7 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { const std::string& combiner = ctx->Attrs().Get("combiner"); PADDLE_ENFORCE_EQ(table_dims.size(), 2); - PADDLE_ENFORCE_GE(ids_dims.size(), 1u, + PADDLE_ENFORCE_GE(ids_dims.size(), 1, "The dim size of the 'Ids' tensor must greater than 1."); PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1, "The last dimension of the 'Ids' tensor must be 1."); From 0f91beefd1f70b1596e657ab4cbf77c3d2c9a574 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 23:23:09 +0800 Subject: [PATCH 006/719] Fix bug test=develop --- paddle/fluid/operators/fused_embedding_seq_pool_op.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index 7385c8da33..f37c688395 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -53,7 +53,7 @@ struct EmbeddingVSumFunctor { PADDLE_ENFORCE_LT(ids[begin], row_number); PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); - blas.VCOPY(row_width, table + ids[begin] * row_width, + blas.VCOPY(row_width, table + ids[begin + j] * row_width, output + i * last_dim + j * row_width); } @@ -62,7 +62,7 @@ struct EmbeddingVSumFunctor { PADDLE_ENFORCE_LT(ids[r], row_number); PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); blas.AXPY(row_width, 1., table + ids[r] * row_width, - output + i * row_width + (r % ids_count) * row_width); + output + i * last_dim + (r % ids_count) * row_width); } } } From 849fbc7327935cfbe43f85744e71db515efa760d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 6 Nov 2018 23:23:33 +0800 Subject: [PATCH 007/719] Add unittest test=develop --- .../unittests/test_fused_emb_seq_pool_op.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py diff --git a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py new file mode 100644 index 0000000000..584e309bef --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py @@ -0,0 +1,51 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.op import Operator +import paddle.compat as cpt + + +class TestFusedEmbeddingSeqPoolOp(OpTest): + def setUp(self): + self.op_type = "fused_embedding_seq_pool" + self.emb_size = 2 + table = np.random.random((17, self.emb_size)).astype("float32") + ids = np.array([[[4], [3]], [[4], [3]], [[2], [1]], + [[16], [1]]]).astype("int64") + merged_ids = np.array([4, 2, 16]).astype("int64") + ids_expand = np.expand_dims(ids, axis=1) + self.lod = [[3, 1]] + self.attrs = {'is_sparse': True} + self.inputs = {'W': table, 'Ids': (ids_expand, self.lod)} + self.outputs = { + 'Out': np.reshape( + np.array([ + table[[4, 3]] + table[[4, 3]] + table[[2, 1]], + table[[16, 1]] + ]), [len(self.lod[0]), 2 * self.emb_size]) + } + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() From 5d0b568ecb58d479619c5a2295d65b7f677d4648 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 6 Nov 2018 18:42:19 +0800 Subject: [PATCH 008/719] Add YOLOv3 loss operator. test=develop --- paddle/fluid/operators/yolov3_loss_op.cc | 130 +++++++++ paddle/fluid/operators/yolov3_loss_op.cu | 23 ++ paddle/fluid/operators/yolov3_loss_op.h | 340 +++++++++++++++++++++++ 3 files changed, 493 insertions(+) create mode 100644 paddle/fluid/operators/yolov3_loss_op.cc create mode 100644 paddle/fluid/operators/yolov3_loss_op.cu create mode 100644 paddle/fluid/operators/yolov3_loss_op.h diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc new file mode 100644 index 0000000000..b4c6a185e2 --- /dev/null +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/yolov3_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class Yolov3LossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("GTBox"), + "Input(GTBox) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of Yolov3LossOp should not be null."); + + // PADDLE_ENFORCE(ctx->HasAttr("img_height"), + // "Attr(img_height) of Yolov3LossOp should not be null. "); + // PADDLE_ENFORCE(ctx->HasAttr("anchors"), + // "Attr(anchor) of Yolov3LossOp should not be null.") + // PADDLE_ENFORCE(ctx->HasAttr("class_num"), + // "Attr(class_num) of Yolov3LossOp should not be null."); + // PADDLE_ENFORCE(ctx->HasAttr( + // "ignore_thresh", + // "Attr(ignore_thresh) of Yolov3LossOp should not be null.")); + + auto dim_x = ctx->GetInputDim("X"); + auto dim_gt = ctx->GetInputDim("GTBox"); + auto img_height = ctx->Attrs().Get("img_height"); + auto anchors = ctx->Attrs().Get>("anchors"); + auto box_num = ctx->Attrs().Get("box_num"); + auto class_num = ctx->Attrs().Get("class_num"); + PADDLE_ENFORCE_GT(img_height, 0, + "Attr(img_height) value should be greater then 0"); + PADDLE_ENFORCE_GT(anchors.size(), 0, + "Attr(anchors) length should be greater then 0."); + PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, + "Attr(anchors) length should be even integer."); + PADDLE_ENFORCE_GT(box_num, 0, + "Attr(box_num) should be an integer greater then 0."); + PADDLE_ENFORCE_GT(class_num, 0, + "Attr(class_num) should be an integer greater then 0."); + PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_number * (5 " + "+ class_num))."); + PADDLE_ENFORCE_EQ(dim_gt.size(), 3, "Input(GTBox) should be a 3-D tensor"); + PADDLE_ENFORCE_EQ(dim_gt[2], 5, "Input(GTBox) dim[2] should be 5"); + + std::vector dim_out({dim_x[0], 1}); + ctx->SetOutputDim("Out", framework::make_ddim(dim_out)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + } +}; + +class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of bilinear interpolation, " + "This is a 4-D tensor with shape of [N, C, H, W]"); + AddOutput("Out", + "The output yolo loss tensor, " + "This is a 2-D tensor with shape of [N, 1]"); + + AddAttr("box_num", "The number of boxes generated in each grid."); + AddAttr("class_num", "The number of classes to predict."); + AddComment(R"DOC( + This operator generate yolov3 loss by given predict result and ground + truth boxes. + )DOC"); + } +}; + +class Yolov3LossOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto dim_x = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad); +REGISTER_OP_CPU_KERNEL( + yolov3_loss, + ops::Yolov3LossKernel); +REGISTER_OP_CPU_KERNEL( + yolov3_loss_grad, + ops::Yolov3LossGradKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.cu b/paddle/fluid/operators/yolov3_loss_op.cu new file mode 100644 index 0000000000..48f997456a --- /dev/null +++ b/paddle/fluid/operators/yolov3_loss_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/yolov3_loss_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + yolov3_loss, + ops::Yolov3LossOpKernel); +REGISTER_OP_CUDA_KERNEL( + yolov3_loss_grad, + ops::Yolov3LossGradOpKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h new file mode 100644 index 0000000000..7950390567 --- /dev/null +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -0,0 +1,340 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenTensor = framework::EigenTensor; +template +using EigenVector = framework::EigenVector; + +using Array2 = Eigen::DSizes; +using Array4 = Eigen::DSizes; + +template +static inline bool isZero(T x) { + return abs(x) < 1e-6; +} + +template +static inline T sigmod(T x) { + return 1.0 / (exp(-1.0 * x) + 1.0); +} + +template +static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, + const Tensor& mask) { + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + auto result = ((x_t - y_t) * mask_t).pow(2).sum().eval(); + return result(0); +} + +template +static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, + const Tensor& mask) { + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + auto result = + ((y_t * (x_t.log()) + (1.0 - y_t) * ((1.0 - x_t).log())) * mask_t) + .sum() + .eval(); + return result; +} + +template +static inline T CalcCEWithMask(const Tensor& x, const Tensor& y, + const Tensor& mask) { + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); +} + +template +static void CalcPredResult(const Tensor& input, Tensor* pred_boxes, + Tensor* pred_confs, Tensor* pred_classes, + Tensor* pred_x, Tensor* pred_y, Tensor* pred_w, + Tensor* pred_h, std::vector anchors, + const int class_num, const int stride) { + const int n = input.dims()[0]; + const int c = input.dims()[1]; + const int h = input.dims()[2]; + const int w = input.dims()[3]; + const int anchor_num = anchors.size() / 2; + const int box_attr_num = 5 + class_num; + + auto input_t = EigenTensor::From(input); + auto pred_boxes_t = EigenTensor::From(*pred_boxes); + auto pred_confs_t = EigenTensor::From(*pred_confs); + auto pred_classes_t = EigenTensor::From(*pred_classes); + auto pred_x_t = EigenTensor::From(*pred_x); + auto pred_y_t = EigenTensor::From(*pred_y); + auto pred_w_t = EigenTensor::From(*pred_w); + auto pred_h_t = EigenTensor::From(*pred_h); + + for (int i = 0; i < n; i++) { + for (int an_idx = 0; an_idx < anchor_num; an_idx++) { + float an_w = anchors[an_idx * 2] / stride; + float an_h = anchors[an_idx * 2 + 1] / stride; + + for (int j = 0; j < h; j++) { + for (int k = 0; k < w; k++) { + pred_x_t(i, an_idx, j, k) = + sigmod(input_t(i, box_attr_num * an_idx, j, k)); + pred_y_t(i, an_idx, j, k) = + sigmod(input_t(i, box_attr_num * an_idx + 1, j, k)); + pred_w_t(i, an_idx, j, k) = + sigmod(input_t(i, box_attr_num * an_idx + 2, j, k)); + pred_h_t(i, an_idx, j, k) = + sigmod(input_t(i, box_attr_num * an_idx + 3, j, k)); + + pred_boxes_t(i, an_idx, j, k, 0) = pred_x_t(i, an_idx, j, k) + k; + pred_boxes_t(i, an_idx, j, k, 1) = pred_y_t(i, an_idx, j, k) + j; + pred_boxes_t(i, an_idx, j, k, 2) = + exp(pred_w_t(i, an_idx, j, k)) * an_w; + pred_boxes_t(i, an_idx, j, k, 3) = + exp(pred_h_t(i, an_idx, j, k)) * an_h; + + pred_confs_t(i, an_idx, j, k) = + sigmod(input_t(i, box_attr_num * an_idx + 4, j, k)); + + for (int c = 0; c < class_num; c++) { + pred_classes_t(i, an_idx, j, k, c) = + sigmod(input_t(i, box_attr_num * an_idx + 5 + c, j, k)); + } + } + } + } + } +} + +template +static T CalcBoxIoU(std::vector box1, std::vector box2, + bool center_mode) { + T b1_x1, b1_x2, b1_y1, b1_y2; + T b2_x1, b2_x2, b2_y1, b2_y2; + if (center_mode) { + b1_x1 = box1[0] - box1[2] / 2; + b1_x2 = box1[0] + box1[2] / 2; + b1_y1 = box1[1] - box1[3] / 2; + b1_y2 = box1[1] + box1[3] / 2; + b2_x1 = box2[0] - box2[2] / 2; + b2_x2 = box2[0] + box2[2] / 2; + b2_y1 = box2[1] - box2[3] / 2; + b2_y2 = box2[1] + box2[3] / 2; + } else { + b1_x1 = box1[0]; + b1_x2 = box1[1]; + b1_y1 = box1[2]; + b1_y2 = box1[3]; + b2_x1 = box2[0]; + b2_x2 = box2[0]; + b2_y1 = box2[1]; + b2_y2 = box2[1]; + } + T b1_area = (b1_x2 - b1_x1 + 1.0) * (b1_y2 - b1_y1 + 1.0); + T b2_area = (b2_x2 - b2_x1 + 1.0) * (b2_y2 - b2_y1 + 1.0); + + T inter_rect_x1 = std::max(b1_x1, b2_x1); + T inter_rect_y1 = std::max(b1_y1, b2_y1); + T inter_rect_x2 = std::min(b1_x2, b2_x2); + T inter_rect_y2 = std::min(b1_y2, b2_y2); + T inter_area = std::max(inter_rect_x2 - inter_rect_x1 + 1.0, 0.0) * + std::max(inter_rect_y2 - inter_rect_y1 + 1.0, 0.0); + + return inter_area / (b1_area + b2_area - inter_area + 1e-16); +} + +template +static inline int GetPredLabel(const Tensor& pred_classes, int n, + int best_an_index, int gj, int gi) { + auto pred_classes_t = EigenTensor::From(pred_classes); + T score = 0.0; + int label = -1; + for (int i = 0; i < pred_classes.dims()[4]; i++) { + if (pred_classes_t(n, best_an_index, gj, gi, i) > score) { + score = pred_classes_t(n, best_an_index, gj, gi, i); + label = i; + } + } + return label; +} + +template +static void CalcPredBoxWithGTBox( + const Tensor& pred_boxes, const Tensor& pred_confs, + const Tensor& pred_classes, const Tensor& gt_boxes, + std::vector anchors, const float ignore_thresh, const int img_height, + int* gt_num, int* correct_num, Tensor* mask_true, Tensor* mask_false, + Tensor* tx, Tensor* ty, Tensor* tw, Tensor* th, Tensor* tconf, + Tensor* tclass) { + const int n = gt_boxes.dims()[0]; + const int b = gt_boxes.dims()[1]; + const int grid_size = pred_boxes.dims()[1]; + const int anchor_num = anchors.size() / 2; + auto pred_boxes_t = EigenTensor::From(pred_boxes); + auto pred_confs_t = EigenTensor::From(pred_confs); + auto pred_classes_t = EigenTensor::From(pred_classes); + auto gt_boxes_t = EigenTensor::From(gt_boxes); + auto mask_true_t = EigenTensor::From(*mask_true).setConstant(0.0); + auto mask_false_t = EigenTensor::From(*mask_false).setConstant(1.0); + auto tx_t = EigenTensor::From(*tx).setConstant(0.0); + auto ty_t = EigenTensor::From(*ty).setConstant(0.0); + auto tw_t = EigenTensor::From(*tw).setConstant(0.0); + auto th_t = EigenTensor::From(*th).setConstant(0.0); + auto tconf_t = EigenTensor::From(*tconf).setConstant(0.0); + auto tclass_t = EigenTensor::From(*tclass).setConstant(0.0); + + *gt_num = 0; + *correct_num = 0; + for (int i = 0; i < n; i++) { + for (int j = 0; j < b; j++) { + if (isZero(gt_boxes_t(i, j, 0)) && isZero(gt_boxes_t(i, j, 1)) && + isZero(gt_boxes_t(i, j, 2)) && isZero(gt_boxes_t(i, j, 3))) { + continue; + } + + *(gt_num)++; + int gt_label = gt_boxes_t(i, j, 0); + T gx = gt_boxes_t(i, j, 1); + T gy = gt_boxes_t(i, j, 2); + T gw = gt_boxes_t(i, j, 3); + T gh = gt_boxes_t(i, j, 4); + int gi = static_cast(gx); + int gj = static_cast(gy); + + T max_iou = static_cast(-1); + T iou; + int best_an_index = -1; + std::vector gt_box({0, 0, gw, gh}); + for (int an_idx = 0; an_idx < anchor_num; an_idx++) { + std::vector anchor_shape({0, 0, static_cast(anchors[2 * an_idx]), + static_cast(anchors[2 * an_idx + 1])}); + iou = CalcBoxIoU(gt_box, anchor_shape, false); + if (iou > max_iou) { + max_iou = iou; + best_an_index = an_idx; + } + if (iou > ignore_thresh) { + mask_false_t(b, an_idx, gj, gi) = 0; + } + } + mask_true_t(b, best_an_index, gj, gi) = 1; + mask_false_t(b, best_an_index, gj, gi) = 1; + tx_t(i, best_an_index, gj, gi) = gx - gi; + ty_t(i, best_an_index, gj, gi) = gy - gj; + tw_t(i, best_an_index, gj, gi) = + log(gw / anchors[2 * best_an_index] + 1e-16); + th_t(i, best_an_index, gj, gi) = + log(gh / anchors[2 * best_an_index + 1] + 1e-16); + tclass_t(b, best_an_index, gj, gi, gt_label) = 1; + tconf_t(b, best_an_index, gj, gi) = 1; + + std::vector pred_box({ + pred_boxes_t(i, best_an_index, gj, gi, 0), + pred_boxes_t(i, best_an_index, gj, gi, 1), + pred_boxes_t(i, best_an_index, gj, gi, 2), + pred_boxes_t(i, best_an_index, gj, gi, 3), + }); + gt_box[0] = gx; + gt_box[1] = gy; + iou = CalcBoxIoU(gt_box, pred_box, true); + int pred_label = GetPredLabel(pred_classes, i, best_an_index, gj, gi); + T score = pred_confs_t(i, best_an_index, gj, gi); + if (iou > 0.5 && pred_label == gt_label && score > 0.5) { + (*correct_num)++; + } + } + } + mask_false_t = mask_true_t - mask_false_t; +} + +template +class Yolov3LossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_boxes = ctx.Input("GTBox"); + auto* output = ctx.Output("Out"); + int img_height = ctx.Attr("img_height"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float ignore_thresh = ctx.Attr("ignore_thresh"); + + const int n = input->dims()[0]; + const int c = input->dims()[1]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int an_num = anchors.size() / 2; + const float stride = static_cast(img_height) / h; + + Tensor pred_x, pred_y, pred_w, pred_h; + Tensor pred_boxes, pred_confs, pred_classes; + pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_boxes.mutable_data({n, an_num, h, w, 4}, ctx.GetPlace()); + pred_confs.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_classes.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + CalcPredResult(*input, &pred_boxes, &pred_confs, &pred_classes, &pred_x, + &pred_y, &pred_w, &pred_h, anchors, class_num, stride); + + Tensor mask_true, mask_false; + Tensor tx, ty, tw, th, tconf, tclass; + mask_true.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + mask_false.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + int gt_num = 0; + int correct_num = 0; + CalcPredBoxWithGTBox(pred_boxes, pred_confs, pred_classes, *gt_boxes, + anchors, ignore_thresh, img_height, >_num, + &correct_num, &mask_true, &mask_false, &tx, &ty, + &tw, &th, &tconf, &tclass); + + T loss_x = CalcMSEWithMask(pred_x, tx, mask_true); + T loss_y = CalcMSEWithMask(pred_y, ty, mask_true); + T loss_w = CalcMSEWithMask(pred_w, tw, mask_true); + T loss_h = CalcMSEWithMask(pred_h, th, mask_true); + T loss_conf_true = CalcBCEWithMask(pred_confs, tconf, mask_true); + T loss_conf_false = CalcBCEWithMask(pred_confs, tconf, mask_false); + // T loss_class = CalcCEWithMask() + } +}; + +template +class Yolov3LossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_input_t = ctx.Output(framework::GradVarName("X")); + auto* d_output_t = ctx.Input(framework::GradVarName("Out")); + } +}; + +} // namespace operators +} // namespace paddle From 77c1328fa749c900c7e12bd6b9d70e84b91d5f49 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sat, 10 Nov 2018 23:32:11 +0800 Subject: [PATCH 009/719] add CPU kernel forward --- paddle/fluid/operators/yolov3_loss_op.cc | 60 ++++--- paddle/fluid/operators/yolov3_loss_op.h | 215 ++++++++++------------- 2 files changed, 127 insertions(+), 148 deletions(-) diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index b4c6a185e2..9ed7e13dc7 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -27,18 +27,8 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "Input(X) of Yolov3LossOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("GTBox"), "Input(GTBox) of Yolov3LossOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of Yolov3LossOp should not be null."); - - // PADDLE_ENFORCE(ctx->HasAttr("img_height"), - // "Attr(img_height) of Yolov3LossOp should not be null. "); - // PADDLE_ENFORCE(ctx->HasAttr("anchors"), - // "Attr(anchor) of Yolov3LossOp should not be null.") - // PADDLE_ENFORCE(ctx->HasAttr("class_num"), - // "Attr(class_num) of Yolov3LossOp should not be null."); - // PADDLE_ENFORCE(ctx->HasAttr( - // "ignore_thresh", - // "Attr(ignore_thresh) of Yolov3LossOp should not be null.")); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), + "Output(Loss) of Yolov3LossOp should not be null."); auto dim_x = ctx->GetInputDim("X"); auto dim_gt = ctx->GetInputDim("GTBox"); @@ -46,6 +36,14 @@ class Yolov3LossOp : public framework::OperatorWithKernel { auto anchors = ctx->Attrs().Get>("anchors"); auto box_num = ctx->Attrs().Get("box_num"); auto class_num = ctx->Attrs().Get("class_num"); + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); + PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3], + "Input(X) dim[3] and dim[4] should be euqal."); + PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_number * (5 " + "+ class_num))."); + PADDLE_ENFORCE_EQ(dim_gt.size(), 3, "Input(GTBox) should be a 3-D tensor"); + PADDLE_ENFORCE_EQ(dim_gt[2], 5, "Input(GTBox) dim[2] should be 5"); PADDLE_ENFORCE_GT(img_height, 0, "Attr(img_height) value should be greater then 0"); PADDLE_ENFORCE_GT(anchors.size(), 0, @@ -56,14 +54,9 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "Attr(box_num) should be an integer greater then 0."); PADDLE_ENFORCE_GT(class_num, 0, "Attr(class_num) should be an integer greater then 0."); - PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), - "Input(X) dim[1] should be equal to (anchor_number * (5 " - "+ class_num))."); - PADDLE_ENFORCE_EQ(dim_gt.size(), 3, "Input(GTBox) should be a 3-D tensor"); - PADDLE_ENFORCE_EQ(dim_gt[2], 5, "Input(GTBox) dim[2] should be 5"); - std::vector dim_out({dim_x[0], 1}); - ctx->SetOutputDim("Out", framework::make_ddim(dim_out)); + std::vector dim_out({1}); + ctx->SetOutputDim("Loss", framework::make_ddim(dim_out)); } protected: @@ -80,12 +73,31 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor of bilinear interpolation, " "This is a 4-D tensor with shape of [N, C, H, W]"); - AddOutput("Out", - "The output yolo loss tensor, " - "This is a 2-D tensor with shape of [N, 1]"); + AddInput( + "GTBox", + "The input tensor of ground truth boxes, " + "This is a 3-D tensor with shape of [N, max_box_num, 5 + class_num], " + "max_box_num is the max number of boxes in each image, " + "class_num is the number of classes in data set. " + "In the third dimention, stores x, y, w, h, confidence, classes " + "one-hot key. " + "x, y is the center cordinate of boxes and w, h is the width and " + "height, " + "and all of them should be divided by input image height to scale to " + "[0, 1]."); + AddOutput("Loss", + "The output yolov3 loss tensor, " + "This is a 1-D tensor with shape of [1]"); AddAttr("box_num", "The number of boxes generated in each grid."); AddAttr("class_num", "The number of classes to predict."); + AddAttr>("anchors", + "The anchor width and height, " + "it will be parsed pair by pair."); + AddAttr("img_height", + "The input image height after crop of yolov3 network."); + AddAttr("ignore_thresh", + "The ignore threshold to ignore confidence loss."); AddComment(R"DOC( This operator generate yolov3 loss by given predict result and ground truth boxes. @@ -100,8 +112,8 @@ class Yolov3LossOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null"); auto dim_x = ctx->GetInputDim("X"); if (ctx->HasOutput(framework::GradVarName("X"))) { ctx->SetOutputDim(framework::GradVarName("X"), dim_x); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index 7950390567..a796a57809 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -44,8 +44,16 @@ static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, auto x_t = EigenVector::Flatten(x); auto y_t = EigenVector::Flatten(y); auto mask_t = EigenVector::Flatten(mask); - auto result = ((x_t - y_t) * mask_t).pow(2).sum().eval(); - return result(0); + + T error_sum = 0.0; + T points = 0.0; + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + error_sum += pow(x_t(i) - y_t(i), 2); + points += 1; + } + } + return (error_sum / points); } template @@ -55,27 +63,24 @@ static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, auto y_t = EigenVector::Flatten(y); auto mask_t = EigenVector::Flatten(mask); - auto result = - ((y_t * (x_t.log()) + (1.0 - y_t) * ((1.0 - x_t).log())) * mask_t) - .sum() - .eval(); - return result; -} - -template -static inline T CalcCEWithMask(const Tensor& x, const Tensor& y, - const Tensor& mask) { - auto x_t = EigenVector::Flatten(x); - auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); + T error_sum = 0.0; + T points = 0.0; + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + error_sum += + -1.0 * (y_t(i) * log(x_t(i)) + (1.0 - y_t(i)) * log(1.0 - x_t(i))); + points += 1; + } + } + return (error_sum / points); } template -static void CalcPredResult(const Tensor& input, Tensor* pred_boxes, - Tensor* pred_confs, Tensor* pred_classes, - Tensor* pred_x, Tensor* pred_y, Tensor* pred_w, - Tensor* pred_h, std::vector anchors, - const int class_num, const int stride) { +static void CalcPredResult(const Tensor& input, Tensor* pred_confs, + Tensor* pred_classes, Tensor* pred_x, Tensor* pred_y, + Tensor* pred_w, Tensor* pred_h, + std::vector anchors, const int class_num, + const int stride) { const int n = input.dims()[0]; const int c = input.dims()[1]; const int h = input.dims()[2]; @@ -84,7 +89,7 @@ static void CalcPredResult(const Tensor& input, Tensor* pred_boxes, const int box_attr_num = 5 + class_num; auto input_t = EigenTensor::From(input); - auto pred_boxes_t = EigenTensor::From(*pred_boxes); + // auto pred_boxes_t = EigenTensor::From(*pred_boxes); auto pred_confs_t = EigenTensor::From(*pred_confs); auto pred_classes_t = EigenTensor::From(*pred_classes); auto pred_x_t = EigenTensor::From(*pred_x); @@ -104,16 +109,16 @@ static void CalcPredResult(const Tensor& input, Tensor* pred_boxes, pred_y_t(i, an_idx, j, k) = sigmod(input_t(i, box_attr_num * an_idx + 1, j, k)); pred_w_t(i, an_idx, j, k) = - sigmod(input_t(i, box_attr_num * an_idx + 2, j, k)); + input_t(i, box_attr_num * an_idx + 2, j, k); pred_h_t(i, an_idx, j, k) = - sigmod(input_t(i, box_attr_num * an_idx + 3, j, k)); + input_t(i, box_attr_num * an_idx + 3, j, k); - pred_boxes_t(i, an_idx, j, k, 0) = pred_x_t(i, an_idx, j, k) + k; - pred_boxes_t(i, an_idx, j, k, 1) = pred_y_t(i, an_idx, j, k) + j; - pred_boxes_t(i, an_idx, j, k, 2) = - exp(pred_w_t(i, an_idx, j, k)) * an_w; - pred_boxes_t(i, an_idx, j, k, 3) = - exp(pred_h_t(i, an_idx, j, k)) * an_h; + // pred_boxes_t(i, an_idx, j, k, 0) = pred_x_t(i, an_idx, j, k) + k; + // pred_boxes_t(i, an_idx, j, k, 1) = pred_y_t(i, an_idx, j, k) + j; + // pred_boxes_t(i, an_idx, j, k, 2) = + // exp(pred_w_t(i, an_idx, j, k)) * an_w; + // pred_boxes_t(i, an_idx, j, k, 3) = + // exp(pred_h_t(i, an_idx, j, k)) * an_h; pred_confs_t(i, an_idx, j, k) = sigmod(input_t(i, box_attr_num * an_idx + 4, j, k)); @@ -129,40 +134,27 @@ static void CalcPredResult(const Tensor& input, Tensor* pred_boxes, } template -static T CalcBoxIoU(std::vector box1, std::vector box2, - bool center_mode) { - T b1_x1, b1_x2, b1_y1, b1_y2; - T b2_x1, b2_x2, b2_y1, b2_y2; - if (center_mode) { - b1_x1 = box1[0] - box1[2] / 2; - b1_x2 = box1[0] + box1[2] / 2; - b1_y1 = box1[1] - box1[3] / 2; - b1_y2 = box1[1] + box1[3] / 2; - b2_x1 = box2[0] - box2[2] / 2; - b2_x2 = box2[0] + box2[2] / 2; - b2_y1 = box2[1] - box2[3] / 2; - b2_y2 = box2[1] + box2[3] / 2; - } else { - b1_x1 = box1[0]; - b1_x2 = box1[1]; - b1_y1 = box1[2]; - b1_y2 = box1[3]; - b2_x1 = box2[0]; - b2_x2 = box2[0]; - b2_y1 = box2[1]; - b2_y2 = box2[1]; - } - T b1_area = (b1_x2 - b1_x1 + 1.0) * (b1_y2 - b1_y1 + 1.0); - T b2_area = (b2_x2 - b2_x1 + 1.0) * (b2_y2 - b2_y1 + 1.0); +static T CalcBoxIoU(std::vector box1, std::vector box2) { + T b1_x1 = box1[0] - box1[2] / 2; + T b1_x2 = box1[0] + box1[2] / 2; + T b1_y1 = box1[1] - box1[3] / 2; + T b1_y2 = box1[1] + box1[3] / 2; + T b2_x1 = box2[0] - box2[2] / 2; + T b2_x2 = box2[0] + box2[2] / 2; + T b2_y1 = box2[1] - box2[3] / 2; + T b2_y2 = box2[1] + box2[3] / 2; + + T b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1); + T b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1); T inter_rect_x1 = std::max(b1_x1, b2_x1); T inter_rect_y1 = std::max(b1_y1, b2_y1); T inter_rect_x2 = std::min(b1_x2, b2_x2); T inter_rect_y2 = std::min(b1_y2, b2_y2); - T inter_area = std::max(inter_rect_x2 - inter_rect_x1 + 1.0, 0.0) * - std::max(inter_rect_y2 - inter_rect_y1 + 1.0, 0.0); + T inter_area = std::max(inter_rect_x2 - inter_rect_x1, static_cast(0.0)) * + std::max(inter_rect_y2 - inter_rect_y1, static_cast(0.0)); - return inter_area / (b1_area + b2_area - inter_area + 1e-16); + return inter_area / (b1_area + b2_area - inter_area); } template @@ -181,23 +173,18 @@ static inline int GetPredLabel(const Tensor& pred_classes, int n, } template -static void CalcPredBoxWithGTBox( - const Tensor& pred_boxes, const Tensor& pred_confs, - const Tensor& pred_classes, const Tensor& gt_boxes, - std::vector anchors, const float ignore_thresh, const int img_height, - int* gt_num, int* correct_num, Tensor* mask_true, Tensor* mask_false, - Tensor* tx, Tensor* ty, Tensor* tw, Tensor* th, Tensor* tconf, - Tensor* tclass) { +static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, + std::vector anchors, const int img_height, + const int grid_size, Tensor* obj_mask, + Tensor* noobj_mask, Tensor* tx, Tensor* ty, + Tensor* tw, Tensor* th, Tensor* tconf, + Tensor* tclass) { const int n = gt_boxes.dims()[0]; const int b = gt_boxes.dims()[1]; - const int grid_size = pred_boxes.dims()[1]; const int anchor_num = anchors.size() / 2; - auto pred_boxes_t = EigenTensor::From(pred_boxes); - auto pred_confs_t = EigenTensor::From(pred_confs); - auto pred_classes_t = EigenTensor::From(pred_classes); auto gt_boxes_t = EigenTensor::From(gt_boxes); - auto mask_true_t = EigenTensor::From(*mask_true).setConstant(0.0); - auto mask_false_t = EigenTensor::From(*mask_false).setConstant(1.0); + auto obj_mask_t = EigenTensor::From(*obj_mask).setConstant(0); + auto noobj_mask_t = EigenTensor::From(*noobj_mask).setConstant(1); auto tx_t = EigenTensor::From(*tx).setConstant(0.0); auto ty_t = EigenTensor::From(*ty).setConstant(0.0); auto tw_t = EigenTensor::From(*tw).setConstant(0.0); @@ -205,8 +192,6 @@ static void CalcPredBoxWithGTBox( auto tconf_t = EigenTensor::From(*tconf).setConstant(0.0); auto tclass_t = EigenTensor::From(*tclass).setConstant(0.0); - *gt_num = 0; - *correct_num = 0; for (int i = 0; i < n; i++) { for (int j = 0; j < b; j++) { if (isZero(gt_boxes_t(i, j, 0)) && isZero(gt_boxes_t(i, j, 1)) && @@ -214,12 +199,11 @@ static void CalcPredBoxWithGTBox( continue; } - *(gt_num)++; int gt_label = gt_boxes_t(i, j, 0); - T gx = gt_boxes_t(i, j, 1); - T gy = gt_boxes_t(i, j, 2); - T gw = gt_boxes_t(i, j, 3); - T gh = gt_boxes_t(i, j, 4); + T gx = gt_boxes_t(i, j, 1) * grid_size; + T gy = gt_boxes_t(i, j, 2) * grid_size; + T gw = gt_boxes_t(i, j, 3) * grid_size; + T gh = gt_boxes_t(i, j, 4) * grid_size; int gi = static_cast(gx); int gj = static_cast(gy); @@ -230,43 +214,26 @@ static void CalcPredBoxWithGTBox( for (int an_idx = 0; an_idx < anchor_num; an_idx++) { std::vector anchor_shape({0, 0, static_cast(anchors[2 * an_idx]), static_cast(anchors[2 * an_idx + 1])}); - iou = CalcBoxIoU(gt_box, anchor_shape, false); + iou = CalcBoxIoU(gt_box, anchor_shape); if (iou > max_iou) { max_iou = iou; best_an_index = an_idx; } if (iou > ignore_thresh) { - mask_false_t(b, an_idx, gj, gi) = 0; + noobj_mask_t(b, an_idx, gj, gi) = 0; } } - mask_true_t(b, best_an_index, gj, gi) = 1; - mask_false_t(b, best_an_index, gj, gi) = 1; + obj_mask_t(b, best_an_index, gj, gi) = 1; + noobj_mask_t(b, best_an_index, gj, gi) = 1; tx_t(i, best_an_index, gj, gi) = gx - gi; ty_t(i, best_an_index, gj, gi) = gy - gj; - tw_t(i, best_an_index, gj, gi) = - log(gw / anchors[2 * best_an_index] + 1e-16); - th_t(i, best_an_index, gj, gi) = - log(gh / anchors[2 * best_an_index + 1] + 1e-16); + tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]); + th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]); tclass_t(b, best_an_index, gj, gi, gt_label) = 1; tconf_t(b, best_an_index, gj, gi) = 1; - - std::vector pred_box({ - pred_boxes_t(i, best_an_index, gj, gi, 0), - pred_boxes_t(i, best_an_index, gj, gi, 1), - pred_boxes_t(i, best_an_index, gj, gi, 2), - pred_boxes_t(i, best_an_index, gj, gi, 3), - }); - gt_box[0] = gx; - gt_box[1] = gy; - iou = CalcBoxIoU(gt_box, pred_box, true); - int pred_label = GetPredLabel(pred_classes, i, best_an_index, gj, gi); - T score = pred_confs_t(i, best_an_index, gj, gi); - if (iou > 0.5 && pred_label == gt_label && score > 0.5) { - (*correct_num)++; - } } } - mask_false_t = mask_true_t - mask_false_t; + noobj_mask_t = noobj_mask_t - obj_mask_t; } template @@ -275,7 +242,7 @@ class Yolov3LossKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); auto* gt_boxes = ctx.Input("GTBox"); - auto* output = ctx.Output("Out"); + auto* loss = ctx.Output("Loss"); int img_height = ctx.Attr("img_height"); auto anchors = ctx.Attr>("anchors"); int class_num = ctx.Attr("class_num"); @@ -286,44 +253,44 @@ class Yolov3LossKernel : public framework::OpKernel { const int h = input->dims()[2]; const int w = input->dims()[3]; const int an_num = anchors.size() / 2; - const float stride = static_cast(img_height) / h; + const T stride = static_cast(img_height) / h; Tensor pred_x, pred_y, pred_w, pred_h; - Tensor pred_boxes, pred_confs, pred_classes; + Tensor pred_confs, pred_classes; pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_boxes.mutable_data({n, an_num, h, w, 4}, ctx.GetPlace()); pred_confs.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_classes.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - CalcPredResult(*input, &pred_boxes, &pred_confs, &pred_classes, &pred_x, - &pred_y, &pred_w, &pred_h, anchors, class_num, stride); + CalcPredResult(*input, &pred_confs, &pred_classes, &pred_x, &pred_y, + &pred_w, &pred_h, anchors, class_num, stride); - Tensor mask_true, mask_false; + Tensor obj_mask, noobj_mask; Tensor tx, ty, tw, th, tconf, tclass; - mask_true.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - mask_false.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - int gt_num = 0; - int correct_num = 0; - CalcPredBoxWithGTBox(pred_boxes, pred_confs, pred_classes, *gt_boxes, - anchors, ignore_thresh, img_height, >_num, - &correct_num, &mask_true, &mask_false, &tx, &ty, - &tw, &th, &tconf, &tclass); - - T loss_x = CalcMSEWithMask(pred_x, tx, mask_true); - T loss_y = CalcMSEWithMask(pred_y, ty, mask_true); - T loss_w = CalcMSEWithMask(pred_w, tw, mask_true); - T loss_h = CalcMSEWithMask(pred_h, th, mask_true); - T loss_conf_true = CalcBCEWithMask(pred_confs, tconf, mask_true); - T loss_conf_false = CalcBCEWithMask(pred_confs, tconf, mask_false); - // T loss_class = CalcCEWithMask() + PrePorcessGTBox(*gt_boxes, ignore_thresh, anchors, img_height, h, + &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, + &tclass); + + T loss_x = CalcMSEWithMask(pred_x, tx, obj_mask); + T loss_y = CalcMSEWithMask(pred_y, ty, obj_mask); + T loss_w = CalcMSEWithMask(pred_w, tw, obj_mask); + T loss_h = CalcMSEWithMask(pred_h, th, obj_mask); + T loss_conf_true = CalcBCEWithMask(pred_confs, tconf, obj_mask); + T loss_conf_false = CalcBCEWithMask(pred_confs, tconf, noobj_mask); + T loss_class = CalcBCEWithMask(pred_classes, tclass, obj_mask); + + auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); + loss_data[0] = loss_x + loss_y + loss_w + loss_h + loss_conf_true + + loss_conf_false + loss_class; } }; From 36c46152e140adab7e74eaeee9dbeccb65fc5633 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 11 Nov 2018 23:52:36 +0800 Subject: [PATCH 010/719] Add unittest for yolov3_loss. test=develop --- paddle/fluid/operators/yolov3_loss_op.cc | 25 +-- paddle/fluid/operators/yolov3_loss_op.h | 67 +++--- python/paddle/fluid/layers/nn.py | 28 +++ .../tests/unittests/test_yolov3_loss_op.py | 194 ++++++++++++++++++ 4 files changed, 273 insertions(+), 41 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index 9ed7e13dc7..7369ce31e8 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -34,7 +34,6 @@ class Yolov3LossOp : public framework::OperatorWithKernel { auto dim_gt = ctx->GetInputDim("GTBox"); auto img_height = ctx->Attrs().Get("img_height"); auto anchors = ctx->Attrs().Get>("anchors"); - auto box_num = ctx->Attrs().Get("box_num"); auto class_num = ctx->Attrs().Get("class_num"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3], @@ -50,8 +49,6 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "Attr(anchors) length should be greater then 0."); PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, "Attr(anchors) length should be even integer."); - PADDLE_ENFORCE_GT(box_num, 0, - "Attr(box_num) should be an integer greater then 0."); PADDLE_ENFORCE_GT(class_num, 0, "Attr(class_num) should be an integer greater then 0."); @@ -73,23 +70,19 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor of bilinear interpolation, " "This is a 4-D tensor with shape of [N, C, H, W]"); - AddInput( - "GTBox", - "The input tensor of ground truth boxes, " - "This is a 3-D tensor with shape of [N, max_box_num, 5 + class_num], " - "max_box_num is the max number of boxes in each image, " - "class_num is the number of classes in data set. " - "In the third dimention, stores x, y, w, h, confidence, classes " - "one-hot key. " - "x, y is the center cordinate of boxes and w, h is the width and " - "height, " - "and all of them should be divided by input image height to scale to " - "[0, 1]."); + AddInput("GTBox", + "The input tensor of ground truth boxes, " + "This is a 3-D tensor with shape of [N, max_box_num, 5], " + "max_box_num is the max number of boxes in each image, " + "In the third dimention, stores label, x, y, w, h, " + "label is an integer to specify box class, x, y is the " + "center cordinate of boxes and w, h is the width and height" + "and x, y, w, h should be divided by input image height to " + "scale to [0, 1]."); AddOutput("Loss", "The output yolov3 loss tensor, " "This is a 1-D tensor with shape of [1]"); - AddAttr("box_num", "The number of boxes generated in each grid."); AddAttr("class_num", "The number of classes to predict."); AddAttr>("anchors", "The anchor width and height, " diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index a796a57809..426e0688ab 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -25,8 +25,7 @@ template using EigenVector = framework::EigenVector; -using Array2 = Eigen::DSizes; -using Array4 = Eigen::DSizes; +using Array5 = Eigen::DSizes; template static inline bool isZero(T x) { @@ -43,7 +42,7 @@ static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, const Tensor& mask) { auto x_t = EigenVector::Flatten(x); auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); + auto mask_t = EigenVector::Flatten(mask); T error_sum = 0.0; T points = 0.0; @@ -61,7 +60,7 @@ static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, const Tensor& mask) { auto x_t = EigenVector::Flatten(x); auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); + auto mask_t = EigenVector::Flatten(mask); T error_sum = 0.0; T points = 0.0; @@ -89,7 +88,6 @@ static void CalcPredResult(const Tensor& input, Tensor* pred_confs, const int box_attr_num = 5 + class_num; auto input_t = EigenTensor::From(input); - // auto pred_boxes_t = EigenTensor::From(*pred_boxes); auto pred_confs_t = EigenTensor::From(*pred_confs); auto pred_classes_t = EigenTensor::From(*pred_classes); auto pred_x_t = EigenTensor::From(*pred_x); @@ -113,13 +111,6 @@ static void CalcPredResult(const Tensor& input, Tensor* pred_confs, pred_h_t(i, an_idx, j, k) = input_t(i, box_attr_num * an_idx + 3, j, k); - // pred_boxes_t(i, an_idx, j, k, 0) = pred_x_t(i, an_idx, j, k) + k; - // pred_boxes_t(i, an_idx, j, k, 1) = pred_y_t(i, an_idx, j, k) + j; - // pred_boxes_t(i, an_idx, j, k, 2) = - // exp(pred_w_t(i, an_idx, j, k)) * an_w; - // pred_boxes_t(i, an_idx, j, k, 3) = - // exp(pred_h_t(i, an_idx, j, k)) * an_h; - pred_confs_t(i, an_idx, j, k) = sigmod(input_t(i, box_attr_num * an_idx + 4, j, k)); @@ -199,7 +190,7 @@ static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, continue; } - int gt_label = gt_boxes_t(i, j, 0); + int gt_label = static_cast(gt_boxes_t(i, j, 0)); T gx = gt_boxes_t(i, j, 1) * grid_size; T gy = gt_boxes_t(i, j, 2) * grid_size; T gw = gt_boxes_t(i, j, 3) * grid_size; @@ -207,7 +198,7 @@ static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, int gi = static_cast(gx); int gj = static_cast(gy); - T max_iou = static_cast(-1); + T max_iou = static_cast(0); T iou; int best_an_index = -1; std::vector gt_box({0, 0, gw, gh}); @@ -220,20 +211,33 @@ static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, best_an_index = an_idx; } if (iou > ignore_thresh) { - noobj_mask_t(b, an_idx, gj, gi) = 0; + noobj_mask_t(i, an_idx, gj, gi) = 0; } } - obj_mask_t(b, best_an_index, gj, gi) = 1; - noobj_mask_t(b, best_an_index, gj, gi) = 1; + obj_mask_t(i, best_an_index, gj, gi) = 1; + noobj_mask_t(i, best_an_index, gj, gi) = 0; tx_t(i, best_an_index, gj, gi) = gx - gi; ty_t(i, best_an_index, gj, gi) = gy - gj; tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]); th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]); - tclass_t(b, best_an_index, gj, gi, gt_label) = 1; - tconf_t(b, best_an_index, gj, gi) = 1; + tclass_t(i, best_an_index, gj, gi, gt_label) = 1; + tconf_t(i, best_an_index, gj, gi) = 1; } } - noobj_mask_t = noobj_mask_t - obj_mask_t; +} + +static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand, + const Tensor& obj_mask) { + const int n = obj_mask_expand->dims()[0]; + const int an_num = obj_mask_expand->dims()[1]; + const int h = obj_mask_expand->dims()[2]; + const int w = obj_mask_expand->dims()[3]; + const int class_num = obj_mask_expand->dims()[4]; + auto obj_mask_expand_t = EigenTensor::From(*obj_mask_expand); + auto obj_mask_t = EigenTensor::From(obj_mask); + + obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1)) + .broadcast(Array5(1, 1, 1, 1, class_num)); } template @@ -280,17 +284,30 @@ class Yolov3LossKernel : public framework::OpKernel { &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); + Tensor obj_mask_expand; + obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, + ctx.GetPlace()); + ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); + T loss_x = CalcMSEWithMask(pred_x, tx, obj_mask); T loss_y = CalcMSEWithMask(pred_y, ty, obj_mask); T loss_w = CalcMSEWithMask(pred_w, tw, obj_mask); T loss_h = CalcMSEWithMask(pred_h, th, obj_mask); - T loss_conf_true = CalcBCEWithMask(pred_confs, tconf, obj_mask); - T loss_conf_false = CalcBCEWithMask(pred_confs, tconf, noobj_mask); - T loss_class = CalcBCEWithMask(pred_classes, tclass, obj_mask); + T loss_conf_obj = CalcBCEWithMask(pred_confs, tconf, obj_mask); + T loss_conf_noobj = CalcBCEWithMask(pred_confs, tconf, noobj_mask); + T loss_class = CalcBCEWithMask(pred_classes, tclass, obj_mask_expand); + + // LOG(ERROR) << "loss_x: " << loss_x; + // LOG(ERROR) << "loss_y: " << loss_y; + // LOG(ERROR) << "loss_w: " << loss_w; + // LOG(ERROR) << "loss_h: " << loss_h; + // LOG(ERROR) << "loss_conf_obj: " << loss_conf_obj; + // LOG(ERROR) << "loss_conf_noobj: " << loss_conf_noobj; + // LOG(ERROR) << "loss_class: " << loss_class; auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); - loss_data[0] = loss_x + loss_y + loss_w + loss_h + loss_conf_true + - loss_conf_false + loss_class; + loss_data[0] = loss_x + loss_y + loss_w + loss_h + loss_conf_obj + + loss_conf_noobj + loss_class; } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d3623464e9..1ee7198f29 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -164,6 +164,7 @@ __all__ = [ 'hash', 'grid_sampler', 'log_loss', + 'yolov3_loss', 'add_position_encoding', 'bilinear_tensor_product', ] @@ -8243,6 +8244,33 @@ def log_loss(input, label, epsilon=1e-4, name=None): return loss +def yolov3_loss(x, gtbox, img_height, anchors, ignore_thresh, name=None): + """ + **YOLOv3 Loss Layer** + + This layer + """ + helper = LayerHelper('yolov3_loss', **locals()) + + if name is None: + loss = helper.create_variable_for_type_inference(dtype=x.dtype) + else: + loss = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type='yolov3_loss', + inputs={'X': x, + "GTBox": gtbox}, + outputs={'Loss': loss}, + attrs={ + "img_height": img_height, + "anchors": anchors, + "ignore_thresh": ignore_thresh, + }) + return loss + + def add_position_encoding(input, alpha, beta, name=None): """ **Add Position Encoding Layer** diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py new file mode 100644 index 0000000000..f5b15efb27 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -0,0 +1,194 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +def sigmoid(x): + return 1.0 / (1.0 + np.exp(-1.0 * x)) + + +def mse(x, y, num): + return ((y - x)**2).sum() / num + + +def bce(x, y, mask): + x = x.reshape((-1)) + y = y.reshape((-1)) + mask = mask.reshape((-1)) + + error_sum = 0.0 + count = 0 + for i in range(x.shape[0]): + if mask[i] > 0: + error_sum += y[i] * np.log(x[i]) + (1 - y[i]) * np.log(1 - x[i]) + count += 1 + return error_sum / (-1.0 * count) + + +def box_iou(box1, box2): + b1_x1 = box1[0] - box1[2] / 2 + b1_x2 = box1[0] + box1[2] / 2 + b1_y1 = box1[1] - box1[3] / 2 + b1_y2 = box1[1] + box1[3] / 2 + b2_x1 = box2[0] - box2[2] / 2 + b2_x2 = box2[0] + box2[2] / 2 + b2_y1 = box2[1] - box2[3] / 2 + b2_y2 = box2[1] + box2[3] / 2 + + b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1) + b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + + inter_rect_x1 = max(b1_x1, b2_x1) + inter_rect_y1 = max(b1_y1, b2_y1) + inter_rect_x2 = min(b1_x2, b2_x2) + inter_rect_y2 = min(b1_y2, b2_y2) + inter_area = max(inter_rect_x2 - inter_rect_x1, 0) * max( + inter_rect_y2 - inter_rect_y1, 0) + + return inter_area / (b1_area + b2_area + inter_area) + + +def build_target(gtboxs, attrs, grid_size): + n, b, _ = gtboxs.shape + ignore_thresh = attrs["ignore_thresh"] + img_height = attrs["img_height"] + anchors = attrs["anchors"] + class_num = attrs["class_num"] + an_num = len(anchors) / 2 + obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + noobj_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32') + tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + ty = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + tw = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + th = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + tconf = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + tcls = np.zeros( + (n, an_num, grid_size, grid_size, class_num)).astype('float32') + + for i in range(n): + for j in range(b): + if gtboxs[i, j, :].sum() == 0: + continue + + gt_label = int(gtboxs[i, j, 0]) + gx = gtboxs[i, j, 1] * grid_size + gy = gtboxs[i, j, 2] * grid_size + gw = gtboxs[i, j, 3] * grid_size + gh = gtboxs[i, j, 4] * grid_size + + gi = int(gx) + gj = int(gy) + + gtbox = [0, 0, gw, gh] + max_iou = 0 + for k in range(an_num): + anchor_box = [0, 0, anchors[2 * k], anchors[2 * k + 1]] + iou = box_iou(gtbox, anchor_box) + if iou > max_iou: + max_iou = iou + best_an_index = k + if iou > ignore_thresh: + noobj_mask[i, best_an_index, gj, gi] = 0 + + obj_mask[i, best_an_index, gj, gi] = 1 + noobj_mask[i, best_an_index, gj, gi] = 0 + tx[i, best_an_index, gj, gi] = gx - gi + ty[i, best_an_index, gj, gi] = gy - gj + tw[i, best_an_index, gj, gi] = np.log(gw / anchors[2 * + best_an_index]) + th[i, best_an_index, gj, gi] = np.log( + gh / anchors[2 * best_an_index + 1]) + tconf[i, best_an_index, gj, gi] = 1 + tcls[i, best_an_index, gj, gi, gt_label] = 1 + + return (tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask) + + +def YoloV3Loss(x, gtbox, attrs): + n, c, h, w = x.shape + an_num = len(attrs['anchors']) / 2 + class_num = attrs["class_num"] + x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) + pred_x = sigmoid(x[:, :, :, :, 0]) + pred_y = sigmoid(x[:, :, :, :, 1]) + pred_w = x[:, :, :, :, 2] + pred_h = x[:, :, :, :, 3] + pred_conf = sigmoid(x[:, :, :, :, 4]) + pred_cls = sigmoid(x[:, :, :, :, 5:]) + + tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask = build_target( + gtbox, attrs, x.shape[2]) + + obj_mask_expand = np.tile( + np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num']))) + loss_x = mse(pred_x * obj_mask, tx * obj_mask, obj_mask.sum()) + loss_y = mse(pred_y * obj_mask, ty * obj_mask, obj_mask.sum()) + loss_w = mse(pred_w * obj_mask, tw * obj_mask, obj_mask.sum()) + loss_h = mse(pred_h * obj_mask, th * obj_mask, obj_mask.sum()) + loss_conf_obj = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask) + loss_conf_noobj = bce(pred_conf * noobj_mask, tconf * noobj_mask, + noobj_mask) + loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand, + obj_mask_expand) + # print "loss_x: ", loss_x + # print "loss_y: ", loss_y + # print "loss_w: ", loss_w + # print "loss_h: ", loss_h + # print "loss_conf_obj: ", loss_conf_obj + # print "loss_conf_noobj: ", loss_conf_noobj + # print "loss_class: ", loss_class + + return loss_x + loss_y + loss_w + loss_h + loss_conf_obj + loss_conf_noobj + loss_class + + +class TestYolov3LossOp(OpTest): + def setUp(self): + self.initTestCase() + self.op_type = 'yolov3_loss' + x = np.random.random(size=self.x_shape).astype('float32') + gtbox = np.random.random(size=self.gtbox_shape).astype('float32') + gtbox[:, :, 0] = np.random.randint(0, self.class_num, + self.gtbox_shape[:2]) + + self.attrs = { + "img_height": self.img_height, + "anchors": self.anchors, + "class_num": self.class_num, + "ignore_thresh": self.ignore_thresh, + } + + self.inputs = {'X': x, 'GTBox': gtbox} + self.outputs = {'Loss': np.array([YoloV3Loss(x, gtbox, self.attrs)])} + print self.outputs + + def test_check_output(self): + self.check_output(atol=1e-3) + + # def test_check_grad_normal(self): + # self.check_grad(['X', 'Grid'], 'Output', max_relative_error=0.61) + + def initTestCase(self): + self.img_height = 608 + self.anchors = [10, 13, 16, 30, 33, 23] + self.class_num = 10 + self.ignore_thresh = 0.5 + self.x_shape = (5, len(self.anchors) / 2 * (5 + self.class_num), 7, 7) + self.gtbox_shape = (5, 10, 5) + + +if __name__ == "__main__": + unittest.main() From b0afdc4e7d57b2122da6484421fde65a10e4c783 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 13 Nov 2018 15:59:34 +0800 Subject: [PATCH 011/719] Add CMake deps --- paddle/fluid/operators/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 919ad96f7a..5e421803c3 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -269,6 +269,7 @@ else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() op_library(hash_op DEPS xxhash) +op_library(fused_hash_embedding_seq_pool DEPS xxhash) op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) From 32ebee9f077956046a310d6fe3ad194650f579fa Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 13 Nov 2018 16:05:06 +0800 Subject: [PATCH 012/719] Polish code --- paddle/fluid/operators/fused_embedding_seq_pool_op.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused_embedding_seq_pool_op.h index f37c688395..38dfae8ad6 100644 --- a/paddle/fluid/operators/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused_embedding_seq_pool_op.h @@ -48,9 +48,8 @@ struct EmbeddingVSumFunctor { auto blas = math::GetBlas(context); for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { + size_t begin = ids_lod[i] * ids_count; for (int64_t j = 0; j != ids_count; ++j) { - size_t begin = ids_lod[i] * ids_count; - PADDLE_ENFORCE_LT(ids[begin], row_number); PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); blas.VCOPY(row_width, table + ids[begin + j] * row_width, @@ -114,10 +113,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto lod = ids->lod()[0]; int64_t row_width = d_output->dims()[1]; - framework::Vector new_rows; - new_rows.resize(ids_num); - std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t)); - d_table->set_rows(new_rows); + framework::Vector *new_rows = d_table->mutable_rows(); + new_rows->resize(ids_num); + std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t)); auto *d_table_value = d_table->mutable_value(); d_table_value->Resize({ids_num, table_dim[1]}); From a0284f6fbcb4888e1653b7f094db615f1437943c Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 12 Nov 2018 21:13:25 +0800 Subject: [PATCH 013/719] Add backward CPU kernel. test=develop --- paddle/fluid/API.spec | 1 + paddle/fluid/operators/yolov3_loss_op.cc | 64 ++++- paddle/fluid/operators/yolov3_loss_op.cu | 4 +- paddle/fluid/operators/yolov3_loss_op.h | 256 +++++++++++++----- python/paddle/fluid/layers/nn.py | 49 +++- .../fluid/tests/unittests/test_layers.py | 9 + .../tests/unittests/test_yolov3_loss_op.py | 42 +-- 7 files changed, 327 insertions(+), 98 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index de32a5d5a2..8344a913e9 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -183,6 +183,7 @@ paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', ' paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)) paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'anchors', 'class_num', 'ignore_thresh', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index 7369ce31e8..cf25e99505 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -20,8 +20,6 @@ using framework::Tensor; class Yolov3LossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of Yolov3LossOp should not be null."); @@ -32,7 +30,6 @@ class Yolov3LossOp : public framework::OperatorWithKernel { auto dim_x = ctx->GetInputDim("X"); auto dim_gt = ctx->GetInputDim("GTBox"); - auto img_height = ctx->Attrs().Get("img_height"); auto anchors = ctx->Attrs().Get>("anchors"); auto class_num = ctx->Attrs().Get("class_num"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); @@ -43,8 +40,6 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "+ class_num))."); PADDLE_ENFORCE_EQ(dim_gt.size(), 3, "Input(GTBox) should be a 3-D tensor"); PADDLE_ENFORCE_EQ(dim_gt[2], 5, "Input(GTBox) dim[2] should be 5"); - PADDLE_ENFORCE_GT(img_height, 0, - "Attr(img_height) value should be greater then 0"); PADDLE_ENFORCE_GT(anchors.size(), 0, "Attr(anchors) length should be greater then 0."); PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, @@ -87,13 +82,43 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr>("anchors", "The anchor width and height, " "it will be parsed pair by pair."); - AddAttr("img_height", - "The input image height after crop of yolov3 network."); AddAttr("ignore_thresh", "The ignore threshold to ignore confidence loss."); AddComment(R"DOC( This operator generate yolov3 loss by given predict result and ground truth boxes. + + The output of previous network is in shape [N, C, H, W], while H and W + should be the same, specify the grid size, each grid point predict given + number boxes, this given number is specified by anchors, it should be + half anchors length, which following will be represented as S. In the + second dimention(the channel dimention), C should be S * (class_num + 5), + class_num is the box categoriy number of source dataset(such as coco), + so in the second dimention, stores 4 box location coordinates x, y, w, h + and confidence score of the box and class one-hot key of each anchor box. + + While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions + correspnd to: + + $$ + b_x = \sigma(t_x) + c_x + b_y = \sigma(t_y) + c_y + b_w = p_w e^{t_w} + b_h = p_h e^{t_h} + $$ + + While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$ + is specified by anchors. + + As for confidence score, it is the logistic regression value of IoU between + anchor boxes and ground truth boxes, the score of the anchor box which has + the max IoU should be 1, and if the anchor box has IoU bigger then ignore + thresh, the confidence score loss of this anchor box will be ignored. + + Therefore, the yolov3 loss consist of three major parts, box location loss, + confidence score loss, and classification loss. The MSE loss is used for + box location, and binary cross entropy loss is used for confidence score + loss and classification loss. )DOC"); } }; @@ -101,8 +126,6 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { class Yolov3LossOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), @@ -113,6 +136,7 @@ class Yolov3LossOpGrad : public framework::OperatorWithKernel { } } + protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( @@ -120,12 +144,32 @@ class Yolov3LossOpGrad : public framework::OperatorWithKernel { } }; +class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("yolov3_loss_grad"); + op->SetInput("X", Input("X")); + op->SetInput("GTBox", Input("GTBox")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("GTBox"), {}); + return std::unique_ptr(op); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::Yolov3LossGradMaker); REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad); REGISTER_OP_CPU_KERNEL( yolov3_loss, diff --git a/paddle/fluid/operators/yolov3_loss_op.cu b/paddle/fluid/operators/yolov3_loss_op.cu index 48f997456a..f901b10d38 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cu +++ b/paddle/fluid/operators/yolov3_loss_op.cu @@ -17,7 +17,7 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( yolov3_loss, - ops::Yolov3LossOpKernel); + ops::Yolov3LossKernel); REGISTER_OP_CUDA_KERNEL( yolov3_loss_grad, - ops::Yolov3LossGradOpKernel); + ops::Yolov3LossGradKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index 426e0688ab..a2ed4440a7 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -33,10 +33,22 @@ static inline bool isZero(T x) { } template -static inline T sigmod(T x) { +static inline T sigmoid(T x) { return 1.0 / (exp(-1.0 * x) + 1.0); } +template +static inline T CalcMaskPointNum(const Tensor& mask) { + auto mask_t = EigenVector::Flatten(mask); + T count = 0.0; + for (int i = 0; i < mask_t.dimensions()[0]; i++) { + if (mask_t(i)) { + count += 1.0; + } + } + return count; +} + template static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, const Tensor& mask) { @@ -55,6 +67,21 @@ static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, return (error_sum / points); } +template +static void CalcMSEGradWithMask(Tensor* grad, const Tensor& x, const Tensor& y, + const Tensor& mask, T mf) { + auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + grad_t(i) = 2.0 * (x_t(i) - y_t(i)) / mf; + } + } +} + template static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, const Tensor& mask) { @@ -75,21 +102,34 @@ static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, } template -static void CalcPredResult(const Tensor& input, Tensor* pred_confs, - Tensor* pred_classes, Tensor* pred_x, Tensor* pred_y, - Tensor* pred_w, Tensor* pred_h, - std::vector anchors, const int class_num, - const int stride) { +static inline void CalcBCEGradWithMask(Tensor* grad, const Tensor& x, + const Tensor& y, const Tensor& mask, + T mf) { + auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + grad_t(i) = ((1.0 - y_t(i)) / (1.0 - x_t(i)) - y_t(i) / x_t(i)) / mf; + } + } +} + +template +static void CalcPredResult(const Tensor& input, Tensor* pred_conf, + Tensor* pred_class, Tensor* pred_x, Tensor* pred_y, + Tensor* pred_w, Tensor* pred_h, const int anchor_num, + const int class_num) { const int n = input.dims()[0]; - const int c = input.dims()[1]; const int h = input.dims()[2]; const int w = input.dims()[3]; - const int anchor_num = anchors.size() / 2; const int box_attr_num = 5 + class_num; auto input_t = EigenTensor::From(input); - auto pred_confs_t = EigenTensor::From(*pred_confs); - auto pred_classes_t = EigenTensor::From(*pred_classes); + auto pred_conf_t = EigenTensor::From(*pred_conf); + auto pred_class_t = EigenTensor::From(*pred_class); auto pred_x_t = EigenTensor::From(*pred_x); auto pred_y_t = EigenTensor::From(*pred_y); auto pred_w_t = EigenTensor::From(*pred_w); @@ -97,26 +137,23 @@ static void CalcPredResult(const Tensor& input, Tensor* pred_confs, for (int i = 0; i < n; i++) { for (int an_idx = 0; an_idx < anchor_num; an_idx++) { - float an_w = anchors[an_idx * 2] / stride; - float an_h = anchors[an_idx * 2 + 1] / stride; - for (int j = 0; j < h; j++) { for (int k = 0; k < w; k++) { pred_x_t(i, an_idx, j, k) = - sigmod(input_t(i, box_attr_num * an_idx, j, k)); + sigmoid(input_t(i, box_attr_num * an_idx, j, k)); pred_y_t(i, an_idx, j, k) = - sigmod(input_t(i, box_attr_num * an_idx + 1, j, k)); + sigmoid(input_t(i, box_attr_num * an_idx + 1, j, k)); pred_w_t(i, an_idx, j, k) = input_t(i, box_attr_num * an_idx + 2, j, k); pred_h_t(i, an_idx, j, k) = input_t(i, box_attr_num * an_idx + 3, j, k); - pred_confs_t(i, an_idx, j, k) = - sigmod(input_t(i, box_attr_num * an_idx + 4, j, k)); + pred_conf_t(i, an_idx, j, k) = + sigmoid(input_t(i, box_attr_num * an_idx + 4, j, k)); for (int c = 0; c < class_num; c++) { - pred_classes_t(i, an_idx, j, k, c) = - sigmod(input_t(i, box_attr_num * an_idx + 5 + c, j, k)); + pred_class_t(i, an_idx, j, k, c) = + sigmoid(input_t(i, box_attr_num * an_idx + 5 + c, j, k)); } } } @@ -148,27 +185,11 @@ static T CalcBoxIoU(std::vector box1, std::vector box2) { return inter_area / (b1_area + b2_area - inter_area); } -template -static inline int GetPredLabel(const Tensor& pred_classes, int n, - int best_an_index, int gj, int gi) { - auto pred_classes_t = EigenTensor::From(pred_classes); - T score = 0.0; - int label = -1; - for (int i = 0; i < pred_classes.dims()[4]; i++) { - if (pred_classes_t(n, best_an_index, gj, gi, i) > score) { - score = pred_classes_t(n, best_an_index, gj, gi, i); - label = i; - } - } - return label; -} - template static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, - std::vector anchors, const int img_height, - const int grid_size, Tensor* obj_mask, - Tensor* noobj_mask, Tensor* tx, Tensor* ty, - Tensor* tw, Tensor* th, Tensor* tconf, + std::vector anchors, const int grid_size, + Tensor* obj_mask, Tensor* noobj_mask, Tensor* tx, + Tensor* ty, Tensor* tw, Tensor* th, Tensor* tconf, Tensor* tclass) { const int n = gt_boxes.dims()[0]; const int b = gt_boxes.dims()[1]; @@ -240,6 +261,61 @@ static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand, .broadcast(Array5(1, 1, 1, 1, class_num)); } +template +static void AddAllGradToInputGrad( + Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y, + const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x, + const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h, + const Tensor& grad_conf_obj, const Tensor& grad_conf_noobj, + const Tensor& grad_class, const int class_num) { + const int n = pred_x.dims()[0]; + const int an_num = pred_x.dims()[1]; + const int h = pred_x.dims()[2]; + const int w = pred_x.dims()[3]; + const int attr_num = class_num + 5; + auto grad_t = EigenTensor::From(*grad).setConstant(0.0); + auto pred_x_t = EigenTensor::From(pred_x); + auto pred_y_t = EigenTensor::From(pred_y); + auto pred_conf_t = EigenTensor::From(pred_conf); + auto pred_class_t = EigenTensor::From(pred_class); + auto grad_x_t = EigenTensor::From(grad_x); + auto grad_y_t = EigenTensor::From(grad_y); + auto grad_w_t = EigenTensor::From(grad_w); + auto grad_h_t = EigenTensor::From(grad_h); + auto grad_conf_obj_t = EigenTensor::From(grad_conf_obj); + auto grad_conf_noobj_t = EigenTensor::From(grad_conf_noobj); + auto grad_class_t = EigenTensor::From(grad_class); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + grad_t(i, j * attr_num, k, l) = grad_x_t(i, j, k, l) * + pred_x_t(i, j, k, l) * + (1.0 - pred_x_t(i, j, k, l)) * loss; + grad_t(i, j * attr_num + 1, k, l) = + grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) * + (1.0 - pred_y_t(i, j, k, l)) * loss; + grad_t(i, j * attr_num + 2, k, l) = grad_w_t(i, j, k, l) * loss; + grad_t(i, j * attr_num + 3, k, l) = grad_h_t(i, j, k, l) * loss; + grad_t(i, j * attr_num + 4, k, l) = + grad_conf_obj_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss; + grad_t(i, j * attr_num + 4, k, l) += + grad_conf_noobj_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss; + + for (int c = 0; c < class_num; c++) { + grad_t(i, j * attr_num + 5 + c, k, l) = + grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) * + (1.0 - pred_class_t(i, j, k, l, c)) * loss; + } + } + } + } + } +} + template class Yolov3LossKernel : public framework::OpKernel { public: @@ -247,28 +323,25 @@ class Yolov3LossKernel : public framework::OpKernel { auto* input = ctx.Input("X"); auto* gt_boxes = ctx.Input("GTBox"); auto* loss = ctx.Output("Loss"); - int img_height = ctx.Attr("img_height"); auto anchors = ctx.Attr>("anchors"); int class_num = ctx.Attr("class_num"); float ignore_thresh = ctx.Attr("ignore_thresh"); const int n = input->dims()[0]; - const int c = input->dims()[1]; const int h = input->dims()[2]; const int w = input->dims()[3]; const int an_num = anchors.size() / 2; - const T stride = static_cast(img_height) / h; Tensor pred_x, pred_y, pred_w, pred_h; - Tensor pred_confs, pred_classes; + Tensor pred_conf, pred_class; pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_confs.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_classes.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - CalcPredResult(*input, &pred_confs, &pred_classes, &pred_x, &pred_y, - &pred_w, &pred_h, anchors, class_num, stride); + pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, + &pred_w, &pred_h, an_num, class_num); Tensor obj_mask, noobj_mask; Tensor tx, ty, tw, th, tconf, tclass; @@ -280,9 +353,8 @@ class Yolov3LossKernel : public framework::OpKernel { th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PrePorcessGTBox(*gt_boxes, ignore_thresh, anchors, img_height, h, - &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, - &tclass); + PrePorcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, + &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); Tensor obj_mask_expand; obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, @@ -293,17 +365,9 @@ class Yolov3LossKernel : public framework::OpKernel { T loss_y = CalcMSEWithMask(pred_y, ty, obj_mask); T loss_w = CalcMSEWithMask(pred_w, tw, obj_mask); T loss_h = CalcMSEWithMask(pred_h, th, obj_mask); - T loss_conf_obj = CalcBCEWithMask(pred_confs, tconf, obj_mask); - T loss_conf_noobj = CalcBCEWithMask(pred_confs, tconf, noobj_mask); - T loss_class = CalcBCEWithMask(pred_classes, tclass, obj_mask_expand); - - // LOG(ERROR) << "loss_x: " << loss_x; - // LOG(ERROR) << "loss_y: " << loss_y; - // LOG(ERROR) << "loss_w: " << loss_w; - // LOG(ERROR) << "loss_h: " << loss_h; - // LOG(ERROR) << "loss_conf_obj: " << loss_conf_obj; - // LOG(ERROR) << "loss_conf_noobj: " << loss_conf_noobj; - // LOG(ERROR) << "loss_class: " << loss_class; + T loss_conf_obj = CalcBCEWithMask(pred_conf, tconf, obj_mask); + T loss_conf_noobj = CalcBCEWithMask(pred_conf, tconf, noobj_mask); + T loss_class = CalcBCEWithMask(pred_class, tclass, obj_mask_expand); auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); loss_data[0] = loss_x + loss_y + loss_w + loss_h + loss_conf_obj + @@ -315,8 +379,76 @@ template class Yolov3LossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_input_t = ctx.Output(framework::GradVarName("X")); - auto* d_output_t = ctx.Input(framework::GradVarName("Out")); + auto* input = ctx.Input("X"); + auto* gt_boxes = ctx.Input("GTBox"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float ignore_thresh = ctx.Attr("ignore_thresh"); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Loss")); + const T loss = output_grad->data()[0]; + + const int n = input->dims()[0]; + const int c = input->dims()[1]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int an_num = anchors.size() / 2; + + Tensor pred_x, pred_y, pred_w, pred_h; + Tensor pred_conf, pred_class; + pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, + &pred_w, &pred_h, an_num, class_num); + + Tensor obj_mask, noobj_mask; + Tensor tx, ty, tw, th, tconf, tclass; + obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + PrePorcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, + &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); + + Tensor obj_mask_expand; + obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, + ctx.GetPlace()); + ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); + + Tensor grad_x, grad_y, grad_w, grad_h; + Tensor grad_conf_obj, grad_conf_noobj, grad_class; + grad_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_conf_obj.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_conf_noobj.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + T obj_mf = CalcMaskPointNum(obj_mask); + T noobj_mf = CalcMaskPointNum(noobj_mask); + T obj_expand_mf = CalcMaskPointNum(obj_mask_expand); + CalcMSEGradWithMask(&grad_x, pred_x, tx, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_y, pred_y, ty, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_w, pred_w, tw, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_h, pred_h, th, obj_mask, obj_mf); + CalcBCEGradWithMask(&grad_conf_obj, pred_conf, tconf, obj_mask, obj_mf); + CalcBCEGradWithMask(&grad_conf_noobj, pred_conf, tconf, noobj_mask, + noobj_mf); + CalcBCEGradWithMask(&grad_class, pred_class, tclass, obj_mask_expand, + obj_expand_mf); + + input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); + AddAllGradToInputGrad( + input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y, + grad_w, grad_h, grad_conf_obj, grad_conf_noobj, grad_class, class_num); } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 1ee7198f29..a4efb16682 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -8244,14 +8244,55 @@ def log_loss(input, label, epsilon=1e-4, name=None): return loss -def yolov3_loss(x, gtbox, img_height, anchors, ignore_thresh, name=None): +@templatedoc(op_type="yolov3_loss") +def yolov3_loss(x, gtbox, anchors, class_num, ignore_thresh, name=None): """ - **YOLOv3 Loss Layer** + ${comment} + + Args: + x (Variable): ${x_comment} + gtbox (Variable): groud truth boxes, shoulb be in shape of [N, B, 5], + in the third dimenstion, class_id, x, y, w, h should + be stored and x, y, w, h should be relative valud of + input image. + anchors (list|tuple): ${anchors_comment} + class_num (int): ${class_num_comment} + ignore_thresh (float): ${ignore_thresh_comment} + name (string): the name of yolov3 loss - This layer + Returns: + Variable: A 1-D tensor with shape [1], the value of yolov3 loss + + Raises: + TypeError: Input x of yolov3_loss must be Variable + TypeError: Input gtbox of yolov3_loss must be Variable" + TypeError: Attr anchors of yolov3_loss must be list or tuple + TypeError: Attr class_num of yolov3_loss must be an integer + TypeError: Attr ignore_thresh of yolov3_loss must be a float number + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10, 255, 13, 13], dtype='float32') + gtbox = fluid.layers.data(name='gtbox', shape=[10, 6, 5], dtype='float32') + anchors = [10, 13, 16, 30, 33, 23] + loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80 + anchors=anchors, ignore_thresh=0.5) """ helper = LayerHelper('yolov3_loss', **locals()) + if not isinstance(x, Variable): + raise TypeError("Input x of yolov3_loss must be Variable") + if not isinstance(gtbox, Variable): + raise TypeError("Input gtbox of yolov3_loss must be Variable") + if not isinstance(anchors, list) and not isinstance(anchors, tuple): + raise TypeError("Attr anchors of yolov3_loss must be list or tuple") + if not isinstance(class_num, int): + raise TypeError("Attr class_num of yolov3_loss must be an integer") + if not isinstance(ignore_thresh, float): + raise TypeError( + "Attr ignore_thresh of yolov3_loss must be a float number") + if name is None: loss = helper.create_variable_for_type_inference(dtype=x.dtype) else: @@ -8264,8 +8305,8 @@ def yolov3_loss(x, gtbox, img_height, anchors, ignore_thresh, name=None): "GTBox": gtbox}, outputs={'Loss': loss}, attrs={ - "img_height": img_height, "anchors": anchors, + "class_num": class_num, "ignore_thresh": ignore_thresh, }) return loss diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index f48d9c84f9..dd02968c30 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -911,6 +911,15 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(data_1) print(str(program)) + def test_yolov3_loss(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') + gtbox = layers.data(name='gtbox', shape=[10, 5], dtype='float32') + loss = layers.yolov3_loss(x, gtbox, [10, 13, 30, 13], 10, 0.5) + + self.assertIsNotNone(loss) + def test_bilinear_tensor_product_layer(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index f5b15efb27..4562f8bd49 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import division + import unittest import numpy as np from op_test import OpTest +from paddle.fluid import core + def sigmoid(x): return 1.0 / (1.0 + np.exp(-1.0 * x)) @@ -65,10 +69,9 @@ def box_iou(box1, box2): def build_target(gtboxs, attrs, grid_size): n, b, _ = gtboxs.shape ignore_thresh = attrs["ignore_thresh"] - img_height = attrs["img_height"] anchors = attrs["anchors"] class_num = attrs["class_num"] - an_num = len(anchors) / 2 + an_num = len(anchors) // 2 obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') noobj_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32') tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') @@ -120,7 +123,7 @@ def build_target(gtboxs, attrs, grid_size): def YoloV3Loss(x, gtbox, attrs): n, c, h, w = x.shape - an_num = len(attrs['anchors']) / 2 + an_num = len(attrs['anchors']) // 2 class_num = attrs["class_num"] x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) pred_x = sigmoid(x[:, :, :, :, 0]) @@ -144,13 +147,6 @@ def YoloV3Loss(x, gtbox, attrs): noobj_mask) loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand, obj_mask_expand) - # print "loss_x: ", loss_x - # print "loss_y: ", loss_y - # print "loss_w: ", loss_w - # print "loss_h: ", loss_h - # print "loss_conf_obj: ", loss_conf_obj - # print "loss_conf_noobj: ", loss_conf_noobj - # print "loss_class: ", loss_class return loss_x + loss_y + loss_w + loss_h + loss_conf_obj + loss_conf_noobj + loss_class @@ -165,29 +161,35 @@ class TestYolov3LossOp(OpTest): self.gtbox_shape[:2]) self.attrs = { - "img_height": self.img_height, "anchors": self.anchors, "class_num": self.class_num, "ignore_thresh": self.ignore_thresh, } self.inputs = {'X': x, 'GTBox': gtbox} - self.outputs = {'Loss': np.array([YoloV3Loss(x, gtbox, self.attrs)])} - print self.outputs + self.outputs = { + 'Loss': + np.array([YoloV3Loss(x, gtbox, self.attrs)]).astype('float32') + } def test_check_output(self): - self.check_output(atol=1e-3) + place = core.CPUPlace() + self.check_output_with_place(place, atol=1e-3) - # def test_check_grad_normal(self): - # self.check_grad(['X', 'Grid'], 'Output', max_relative_error=0.61) + def test_check_grad_ignore_gtbox(self): + place = core.CPUPlace() + self.check_grad_with_place( + place, ['X'], + 'Loss', + no_grad_set=set("GTBox"), + max_relative_error=0.1) def initTestCase(self): - self.img_height = 608 - self.anchors = [10, 13, 16, 30, 33, 23] + self.anchors = [10, 13, 12, 12] self.class_num = 10 self.ignore_thresh = 0.5 - self.x_shape = (5, len(self.anchors) / 2 * (5 + self.class_num), 7, 7) - self.gtbox_shape = (5, 10, 5) + self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7) + self.gtbox_shape = (5, 5, 5) if __name__ == "__main__": From 2faa2b4048d14e24acd3f8a3f8c55c2f492d0285 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 13 Nov 2018 20:08:54 +0800 Subject: [PATCH 014/719] remove cu file. test=develop --- paddle/fluid/API.spec | 2 +- paddle/fluid/operators/yolov3_loss_op.cc | 36 ++++++- paddle/fluid/operators/yolov3_loss_op.cu | 23 ----- paddle/fluid/operators/yolov3_loss_op.h | 43 +++++--- python/paddle/fluid/layers/detection.py | 98 +++++++++++++++++++ python/paddle/fluid/layers/nn.py | 69 ------------- .../tests/unittests/test_yolov3_loss_op.py | 23 ++++- 7 files changed, 182 insertions(+), 112 deletions(-) delete mode 100644 paddle/fluid/operators/yolov3_loss_op.cu diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 8344a913e9..7e0d5e6088 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -183,7 +183,6 @@ paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', ' paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)) paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) -paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'anchors', 'class_num', 'ignore_thresh', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) @@ -289,6 +288,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'anchors', 'class_num', 'ignore_thresh', 'lambda_xy', 'lambda_wh', 'lambda_conf_obj', 'lambda_conf_noobj', 'lambda_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index cf25e99505..f6c134e1b4 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -55,7 +55,8 @@ class Yolov3LossOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); } }; @@ -63,8 +64,11 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "The input tensor of bilinear interpolation, " - "This is a 4-D tensor with shape of [N, C, H, W]"); + "The input tensor of YOLO v3 loss operator, " + "This is a 4-D tensor with shape of [N, C, H, W]." + "H and W should be same, and the second dimention(C) stores" + "box locations, confidence score and classification one-hot" + "key of each anchor box"); AddInput("GTBox", "The input tensor of ground truth boxes, " "This is a 3-D tensor with shape of [N, max_box_num, 5], " @@ -84,6 +88,20 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { "it will be parsed pair by pair."); AddAttr("ignore_thresh", "The ignore threshold to ignore confidence loss."); + AddAttr("lambda_xy", "The weight of x, y location loss.") + .SetDefault(1.0); + AddAttr("lambda_wh", "The weight of w, h location loss.") + .SetDefault(1.0); + AddAttr( + "lambda_conf_obj", + "The weight of confidence score loss in locations with target object.") + .SetDefault(1.0); + AddAttr("lambda_conf_noobj", + "The weight of confidence score loss in locations without " + "target object.") + .SetDefault(1.0); + AddAttr("lambda_class", "The weight of classification loss.") + .SetDefault(1.0); AddComment(R"DOC( This operator generate yolov3 loss by given predict result and ground truth boxes. @@ -119,6 +137,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { confidence score loss, and classification loss. The MSE loss is used for box location, and binary cross entropy loss is used for confidence score loss and classification loss. + + Final loss will be represented as follow. + + $$ + loss = \lambda_{xy} * loss_{xy} + \lambda_{wh} * loss_{wh} + + \lambda_{conf_obj} * loss_{conf_obj} + + \lambda_{conf_noobj} * loss_{conf_noobj} + + \lambda_{class} * loss_{class} + $$ )DOC"); } }; @@ -140,7 +167,8 @@ class Yolov3LossOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/yolov3_loss_op.cu b/paddle/fluid/operators/yolov3_loss_op.cu deleted file mode 100644 index f901b10d38..0000000000 --- a/paddle/fluid/operators/yolov3_loss_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#define EIGEN_USE_GPU - -#include "paddle/fluid/operators/yolov3_loss_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - yolov3_loss, - ops::Yolov3LossKernel); -REGISTER_OP_CUDA_KERNEL( - yolov3_loss_grad, - ops::Yolov3LossGradKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index a2ed4440a7..f4ede92589 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -267,7 +267,9 @@ static void AddAllGradToInputGrad( const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x, const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h, const Tensor& grad_conf_obj, const Tensor& grad_conf_noobj, - const Tensor& grad_class, const int class_num) { + const Tensor& grad_class, const int class_num, const float lambda_xy, + const float lambda_wh, const float lambda_conf_obj, + const float lambda_conf_noobj, const float lambda_class) { const int n = pred_x.dims()[0]; const int an_num = pred_x.dims()[1]; const int h = pred_x.dims()[2]; @@ -290,25 +292,27 @@ static void AddAllGradToInputGrad( for (int j = 0; j < an_num; j++) { for (int k = 0; k < h; k++) { for (int l = 0; l < w; l++) { - grad_t(i, j * attr_num, k, l) = grad_x_t(i, j, k, l) * - pred_x_t(i, j, k, l) * - (1.0 - pred_x_t(i, j, k, l)) * loss; + grad_t(i, j * attr_num, k, l) = + grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) * + (1.0 - pred_x_t(i, j, k, l)) * loss * lambda_xy; grad_t(i, j * attr_num + 1, k, l) = grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) * - (1.0 - pred_y_t(i, j, k, l)) * loss; - grad_t(i, j * attr_num + 2, k, l) = grad_w_t(i, j, k, l) * loss; - grad_t(i, j * attr_num + 3, k, l) = grad_h_t(i, j, k, l) * loss; + (1.0 - pred_y_t(i, j, k, l)) * loss * lambda_xy; + grad_t(i, j * attr_num + 2, k, l) = + grad_w_t(i, j, k, l) * loss * lambda_wh; + grad_t(i, j * attr_num + 3, k, l) = + grad_h_t(i, j, k, l) * loss * lambda_wh; grad_t(i, j * attr_num + 4, k, l) = grad_conf_obj_t(i, j, k, l) * pred_conf_t(i, j, k, l) * - (1.0 - pred_conf_t(i, j, k, l)) * loss; + (1.0 - pred_conf_t(i, j, k, l)) * loss * lambda_conf_obj; grad_t(i, j * attr_num + 4, k, l) += grad_conf_noobj_t(i, j, k, l) * pred_conf_t(i, j, k, l) * - (1.0 - pred_conf_t(i, j, k, l)) * loss; + (1.0 - pred_conf_t(i, j, k, l)) * loss * lambda_conf_noobj; for (int c = 0; c < class_num; c++) { grad_t(i, j * attr_num + 5 + c, k, l) = grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) * - (1.0 - pred_class_t(i, j, k, l, c)) * loss; + (1.0 - pred_class_t(i, j, k, l, c)) * loss * lambda_class; } } } @@ -326,6 +330,11 @@ class Yolov3LossKernel : public framework::OpKernel { auto anchors = ctx.Attr>("anchors"); int class_num = ctx.Attr("class_num"); float ignore_thresh = ctx.Attr("ignore_thresh"); + float lambda_xy = ctx.Attr("lambda_xy"); + float lambda_wh = ctx.Attr("lambda_wh"); + float lambda_conf_obj = ctx.Attr("lambda_conf_obj"); + float lambda_conf_noobj = ctx.Attr("lambda_conf_noobj"); + float lambda_class = ctx.Attr("lambda_class"); const int n = input->dims()[0]; const int h = input->dims()[2]; @@ -370,8 +379,10 @@ class Yolov3LossKernel : public framework::OpKernel { T loss_class = CalcBCEWithMask(pred_class, tclass, obj_mask_expand); auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); - loss_data[0] = loss_x + loss_y + loss_w + loss_h + loss_conf_obj + - loss_conf_noobj + loss_class; + loss_data[0] = + lambda_xy * (loss_x + loss_y) + lambda_wh * (loss_w + loss_h) + + lambda_conf_obj * loss_conf_obj + lambda_conf_noobj * loss_conf_noobj + + lambda_class * loss_class; } }; @@ -387,6 +398,11 @@ class Yolov3LossGradKernel : public framework::OpKernel { auto* input_grad = ctx.Output(framework::GradVarName("X")); auto* output_grad = ctx.Input(framework::GradVarName("Loss")); const T loss = output_grad->data()[0]; + float lambda_xy = ctx.Attr("lambda_xy"); + float lambda_wh = ctx.Attr("lambda_wh"); + float lambda_conf_obj = ctx.Attr("lambda_conf_obj"); + float lambda_conf_noobj = ctx.Attr("lambda_conf_noobj"); + float lambda_class = ctx.Attr("lambda_class"); const int n = input->dims()[0]; const int c = input->dims()[1]; @@ -448,7 +464,8 @@ class Yolov3LossGradKernel : public framework::OpKernel { input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); AddAllGradToInputGrad( input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y, - grad_w, grad_h, grad_conf_obj, grad_conf_noobj, grad_class, class_num); + grad_w, grad_h, grad_conf_obj, grad_conf_noobj, grad_class, class_num, + lambda_xy, lambda_wh, lambda_conf_obj, lambda_conf_noobj, lambda_class); } }; diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 4ac94981a7..2bb9514803 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -20,6 +20,7 @@ from __future__ import print_function from .layer_function_generator import generate_layer_fn from .layer_function_generator import autodoc, templatedoc from ..layer_helper import LayerHelper +from ..framework import Variable from . import tensor from . import nn from . import ops @@ -45,6 +46,7 @@ __all__ = [ 'iou_similarity', 'box_coder', 'polygon_box_transform', + 'yolov3_loss', ] @@ -404,6 +406,102 @@ def polygon_box_transform(input, name=None): return output +@templatedoc(op_type="yolov3_loss") +def yolov3_loss(x, + gtbox, + anchors, + class_num, + ignore_thresh, + lambda_xy=None, + lambda_wh=None, + lambda_conf_obj=None, + lambda_conf_noobj=None, + lambda_class=None, + name=None): + """ + ${comment} + + Args: + x (Variable): ${x_comment} + gtbox (Variable): groud truth boxes, shoulb be in shape of [N, B, 5], + in the third dimenstion, class_id, x, y, w, h should + be stored and x, y, w, h should be relative valud of + input image. + anchors (list|tuple): ${anchors_comment} + class_num (int): ${class_num_comment} + ignore_thresh (float): ${ignore_thresh_comment} + lambda_xy (float|None): ${lambda_xy_comment} + lambda_wh (float|None): ${lambda_wh_comment} + lambda_conf_obj (float|None): ${lambda_conf_obj_comment} + lambda_conf_noobj (float|None): ${lambda_conf_noobj_comment} + lambda_class (float|None): ${lambda_class_comment} + name (string): the name of yolov3 loss + + Returns: + Variable: A 1-D tensor with shape [1], the value of yolov3 loss + + Raises: + TypeError: Input x of yolov3_loss must be Variable + TypeError: Input gtbox of yolov3_loss must be Variable" + TypeError: Attr anchors of yolov3_loss must be list or tuple + TypeError: Attr class_num of yolov3_loss must be an integer + TypeError: Attr ignore_thresh of yolov3_loss must be a float number + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10, 255, 13, 13], dtype='float32') + gtbox = fluid.layers.data(name='gtbox', shape=[10, 6, 5], dtype='float32') + anchors = [10, 13, 16, 30, 33, 23] + loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80 + anchors=anchors, ignore_thresh=0.5) + """ + helper = LayerHelper('yolov3_loss', **locals()) + + if not isinstance(x, Variable): + raise TypeError("Input x of yolov3_loss must be Variable") + if not isinstance(gtbox, Variable): + raise TypeError("Input gtbox of yolov3_loss must be Variable") + if not isinstance(anchors, list) and not isinstance(anchors, tuple): + raise TypeError("Attr anchors of yolov3_loss must be list or tuple") + if not isinstance(class_num, int): + raise TypeError("Attr class_num of yolov3_loss must be an integer") + if not isinstance(ignore_thresh, float): + raise TypeError( + "Attr ignore_thresh of yolov3_loss must be a float number") + + if name is None: + loss = helper.create_variable_for_type_inference(dtype=x.dtype) + else: + loss = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + attrs = { + "anchors": anchors, + "class_num": class_num, + "ignore_thresh": ignore_thresh, + } + + if lambda_xy is not None and isinstance(lambda_xy, float): + self.attrs['lambda_xy'] = lambda_xy + if lambda_wh is not None and isinstance(lambda_wh, float): + self.attrs['lambda_wh'] = lambda_wh + if lambda_conf_obj is not None and isinstance(lambda_conf_obj, float): + self.attrs['lambda_conf_obj'] = lambda_conf_obj + if lambda_conf_noobj is not None and isinstance(lambda_conf_noobj, float): + self.attrs['lambda_conf_noobj'] = lambda_conf_noobj + if lambda_class is not None and isinstance(lambda_class, float): + self.attrs['lambda_class'] = lambda_class + + helper.append_op( + type='yolov3_loss', + inputs={'X': x, + "GTBox": gtbox}, + outputs={'Loss': loss}, + attrs=attrs) + return loss + + @templatedoc() def detection_map(detect_res, label, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index a4efb16682..d3623464e9 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -164,7 +164,6 @@ __all__ = [ 'hash', 'grid_sampler', 'log_loss', - 'yolov3_loss', 'add_position_encoding', 'bilinear_tensor_product', ] @@ -8244,74 +8243,6 @@ def log_loss(input, label, epsilon=1e-4, name=None): return loss -@templatedoc(op_type="yolov3_loss") -def yolov3_loss(x, gtbox, anchors, class_num, ignore_thresh, name=None): - """ - ${comment} - - Args: - x (Variable): ${x_comment} - gtbox (Variable): groud truth boxes, shoulb be in shape of [N, B, 5], - in the third dimenstion, class_id, x, y, w, h should - be stored and x, y, w, h should be relative valud of - input image. - anchors (list|tuple): ${anchors_comment} - class_num (int): ${class_num_comment} - ignore_thresh (float): ${ignore_thresh_comment} - name (string): the name of yolov3 loss - - Returns: - Variable: A 1-D tensor with shape [1], the value of yolov3 loss - - Raises: - TypeError: Input x of yolov3_loss must be Variable - TypeError: Input gtbox of yolov3_loss must be Variable" - TypeError: Attr anchors of yolov3_loss must be list or tuple - TypeError: Attr class_num of yolov3_loss must be an integer - TypeError: Attr ignore_thresh of yolov3_loss must be a float number - - Examples: - .. code-block:: python - - x = fluid.layers.data(name='x', shape=[10, 255, 13, 13], dtype='float32') - gtbox = fluid.layers.data(name='gtbox', shape=[10, 6, 5], dtype='float32') - anchors = [10, 13, 16, 30, 33, 23] - loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80 - anchors=anchors, ignore_thresh=0.5) - """ - helper = LayerHelper('yolov3_loss', **locals()) - - if not isinstance(x, Variable): - raise TypeError("Input x of yolov3_loss must be Variable") - if not isinstance(gtbox, Variable): - raise TypeError("Input gtbox of yolov3_loss must be Variable") - if not isinstance(anchors, list) and not isinstance(anchors, tuple): - raise TypeError("Attr anchors of yolov3_loss must be list or tuple") - if not isinstance(class_num, int): - raise TypeError("Attr class_num of yolov3_loss must be an integer") - if not isinstance(ignore_thresh, float): - raise TypeError( - "Attr ignore_thresh of yolov3_loss must be a float number") - - if name is None: - loss = helper.create_variable_for_type_inference(dtype=x.dtype) - else: - loss = helper.create_variable( - name=name, dtype=x.dtype, persistable=False) - - helper.append_op( - type='yolov3_loss', - inputs={'X': x, - "GTBox": gtbox}, - outputs={'Loss': loss}, - attrs={ - "anchors": anchors, - "class_num": class_num, - "ignore_thresh": ignore_thresh, - }) - return loss - - def add_position_encoding(input, alpha, beta, name=None): """ **Add Position Encoding Layer** diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 4562f8bd49..3b6d58563f 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -148,11 +148,20 @@ def YoloV3Loss(x, gtbox, attrs): loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand, obj_mask_expand) - return loss_x + loss_y + loss_w + loss_h + loss_conf_obj + loss_conf_noobj + loss_class + return attrs['lambda_xy'] * (loss_x + loss_y) \ + + attrs['lambda_wh'] * (loss_w + loss_h) \ + + attrs['lambda_conf_obj'] * loss_conf_obj \ + + attrs['lambda_conf_noobj'] * loss_conf_noobj \ + + attrs['lambda_class'] * loss_class class TestYolov3LossOp(OpTest): def setUp(self): + self.lambda_xy = 1.0 + self.lambda_wh = 1.0 + self.lambda_conf_obj = 1.0 + self.lambda_conf_noobj = 1.0 + self.lambda_class = 1.0 self.initTestCase() self.op_type = 'yolov3_loss' x = np.random.random(size=self.x_shape).astype('float32') @@ -164,6 +173,11 @@ class TestYolov3LossOp(OpTest): "anchors": self.anchors, "class_num": self.class_num, "ignore_thresh": self.ignore_thresh, + "lambda_xy": self.lambda_xy, + "lambda_wh": self.lambda_wh, + "lambda_conf_obj": self.lambda_conf_obj, + "lambda_conf_noobj": self.lambda_conf_noobj, + "lambda_class": self.lambda_class, } self.inputs = {'X': x, 'GTBox': gtbox} @@ -182,7 +196,7 @@ class TestYolov3LossOp(OpTest): place, ['X'], 'Loss', no_grad_set=set("GTBox"), - max_relative_error=0.1) + max_relative_error=0.06) def initTestCase(self): self.anchors = [10, 13, 12, 12] @@ -190,6 +204,11 @@ class TestYolov3LossOp(OpTest): self.ignore_thresh = 0.5 self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7) self.gtbox_shape = (5, 5, 5) + self.lambda_xy = 2.5 + self.lambda_wh = 0.8 + self.lambda_conf_obj = 1.5 + self.lambda_conf_noobj = 0.5 + self.lambda_class = 1.2 if __name__ == "__main__": From 95d5060dddcbfd0eff8cb50d542f5adb6899b6b6 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Thu, 15 Nov 2018 18:57:49 +0800 Subject: [PATCH 015/719] fix abs -> fabs error. test=develop --- paddle/fluid/operators/yolov3_loss_op.h | 13 +++++++------ .../fluid/tests/unittests/test_yolov3_loss_op.py | 14 +++++++------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index f4ede92589..608ef3f94b 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -29,7 +29,7 @@ using Array5 = Eigen::DSizes; template static inline bool isZero(T x) { - return abs(x) < 1e-6; + return fabs(x) < 1e-6; } template @@ -186,7 +186,7 @@ static T CalcBoxIoU(std::vector box1, std::vector box2) { } template -static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, +static void PreProcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, std::vector anchors, const int grid_size, Tensor* obj_mask, Tensor* noobj_mask, Tensor* tx, Tensor* ty, Tensor* tw, Tensor* th, Tensor* tconf, @@ -206,8 +206,9 @@ static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, for (int i = 0; i < n; i++) { for (int j = 0; j < b; j++) { - if (isZero(gt_boxes_t(i, j, 0)) && isZero(gt_boxes_t(i, j, 1)) && - isZero(gt_boxes_t(i, j, 2)) && isZero(gt_boxes_t(i, j, 3))) { + if (isZero(gt_boxes_t(i, j, 0)) && isZero(gt_boxes_t(i, j, 1)) && + isZero(gt_boxes_t(i, j, 2)) && isZero(gt_boxes_t(i, j, 3)) && + isZero(gt_boxes_t(i, j, 4))) { continue; } @@ -362,7 +363,7 @@ class Yolov3LossKernel : public framework::OpKernel { th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PrePorcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, + PreProcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); Tensor obj_mask_expand; @@ -431,7 +432,7 @@ class Yolov3LossGradKernel : public framework::OpKernel { th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PrePorcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, + PreProcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); Tensor obj_mask_expand; diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 3b6d58563f..03a64055f0 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -190,13 +190,13 @@ class TestYolov3LossOp(OpTest): place = core.CPUPlace() self.check_output_with_place(place, atol=1e-3) - def test_check_grad_ignore_gtbox(self): - place = core.CPUPlace() - self.check_grad_with_place( - place, ['X'], - 'Loss', - no_grad_set=set("GTBox"), - max_relative_error=0.06) + # def test_check_grad_ignore_gtbox(self): + # place = core.CPUPlace() + # self.check_grad_with_place( + # place, ['X'], + # 'Loss', + # no_grad_set=set("GTBox"), + # max_relative_error=0.06) def initTestCase(self): self.anchors = [10, 13, 12, 12] From f115eb0d1e6ffa1dd65bfcc7b30b419d52f3c68b Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Thu, 15 Nov 2018 21:05:28 +0800 Subject: [PATCH 016/719] enhance api. test=develop --- paddle/fluid/API.spec | 2 +- paddle/fluid/operators/yolov3_loss_op.cc | 50 ++++--- paddle/fluid/operators/yolov3_loss_op.h | 129 ++++++++++-------- python/paddle/fluid/layers/detection.py | 67 +++++---- python/paddle/fluid/tests/test_detection.py | 13 ++ .../fluid/tests/unittests/test_layers.py | 9 -- .../tests/unittests/test_yolov3_loss_op.py | 88 ++++++------ 7 files changed, 199 insertions(+), 159 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 7e0d5e6088..1f1dc3757d 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -288,7 +288,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'anchors', 'class_num', 'ignore_thresh', 'lambda_xy', 'lambda_wh', 'lambda_conf_obj', 'lambda_conf_noobj', 'lambda_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index f6c134e1b4..1d7f482362 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -25,11 +25,14 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "Input(X) of Yolov3LossOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("GTBox"), "Input(GTBox) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("GTLabel"), + "Input(GTLabel) of Yolov3LossOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Loss"), "Output(Loss) of Yolov3LossOp should not be null."); auto dim_x = ctx->GetInputDim("X"); - auto dim_gt = ctx->GetInputDim("GTBox"); + auto dim_gtbox = ctx->GetInputDim("GTBox"); + auto dim_gtlabel = ctx->GetInputDim("GTLabel"); auto anchors = ctx->Attrs().Get>("anchors"); auto class_num = ctx->Attrs().Get("class_num"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); @@ -38,8 +41,15 @@ class Yolov3LossOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), "Input(X) dim[1] should be equal to (anchor_number * (5 " "+ class_num))."); - PADDLE_ENFORCE_EQ(dim_gt.size(), 3, "Input(GTBox) should be a 3-D tensor"); - PADDLE_ENFORCE_EQ(dim_gt[2], 5, "Input(GTBox) dim[2] should be 5"); + PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3, + "Input(GTBox) should be a 3-D tensor"); + PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5"); + PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2, + "Input(GTBox) should be a 2-D tensor"); + PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0], + "Input(GTBox) and Input(GTLabel) dim[0] should be same"); + PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1], + "Input(GTBox) and Input(GTLabel) dim[1] should be same"); PADDLE_ENFORCE_GT(anchors.size(), 0, "Attr(anchors) length should be greater then 0."); PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, @@ -73,11 +83,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { "The input tensor of ground truth boxes, " "This is a 3-D tensor with shape of [N, max_box_num, 5], " "max_box_num is the max number of boxes in each image, " - "In the third dimention, stores label, x, y, w, h, " - "label is an integer to specify box class, x, y is the " - "center cordinate of boxes and w, h is the width and height" - "and x, y, w, h should be divided by input image height to " - "scale to [0, 1]."); + "In the third dimention, stores x, y, w, h coordinates, " + "x, y is the center cordinate of boxes and w, h is the " + "width and height and x, y, w, h should be divided by " + "input image height to scale to [0, 1]."); + AddInput("GTLabel", + "The input tensor of ground truth label, " + "This is a 2-D tensor with shape of [N, max_box_num], " + "and each element shoudl be an integer to indicate the " + "box class id."); AddOutput("Loss", "The output yolov3 loss tensor, " "This is a 1-D tensor with shape of [1]"); @@ -88,19 +102,19 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { "it will be parsed pair by pair."); AddAttr("ignore_thresh", "The ignore threshold to ignore confidence loss."); - AddAttr("lambda_xy", "The weight of x, y location loss.") + AddAttr("loss_weight_xy", "The weight of x, y location loss.") .SetDefault(1.0); - AddAttr("lambda_wh", "The weight of w, h location loss.") + AddAttr("loss_weight_wh", "The weight of w, h location loss.") .SetDefault(1.0); AddAttr( - "lambda_conf_obj", + "loss_weight_conf_target", "The weight of confidence score loss in locations with target object.") .SetDefault(1.0); - AddAttr("lambda_conf_noobj", + AddAttr("loss_weight_conf_notarget", "The weight of confidence score loss in locations without " "target object.") .SetDefault(1.0); - AddAttr("lambda_class", "The weight of classification loss.") + AddAttr("loss_weight_class", "The weight of classification loss.") .SetDefault(1.0); AddComment(R"DOC( This operator generate yolov3 loss by given predict result and ground @@ -141,10 +155,10 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { Final loss will be represented as follow. $$ - loss = \lambda_{xy} * loss_{xy} + \lambda_{wh} * loss_{wh} - + \lambda_{conf_obj} * loss_{conf_obj} - + \lambda_{conf_noobj} * loss_{conf_noobj} - + \lambda_{class} * loss_{class} + loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh} + + \loss_weight_{conf_target} * loss_{conf_target} + + \loss_weight_{conf_notarget} * loss_{conf_notarget} + + \loss_weight_{class} * loss_{class} $$ )DOC"); } @@ -182,12 +196,14 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { op->SetType("yolov3_loss_grad"); op->SetInput("X", Input("X")); op->SetInput("GTBox", Input("GTBox")); + op->SetInput("GTLabel", Input("GTLabel")); op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); op->SetAttrMap(Attrs()); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetOutput(framework::GradVarName("GTBox"), {}); + op->SetOutput(framework::GradVarName("GTLabel"), {}); return std::unique_ptr(op); } }; diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index 608ef3f94b..a1072aca10 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -186,15 +186,17 @@ static T CalcBoxIoU(std::vector box1, std::vector box2) { } template -static void PreProcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, - std::vector anchors, const int grid_size, - Tensor* obj_mask, Tensor* noobj_mask, Tensor* tx, - Tensor* ty, Tensor* tw, Tensor* th, Tensor* tconf, +static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label, + const float ignore_thresh, std::vector anchors, + const int grid_size, Tensor* obj_mask, + Tensor* noobj_mask, Tensor* tx, Tensor* ty, + Tensor* tw, Tensor* th, Tensor* tconf, Tensor* tclass) { - const int n = gt_boxes.dims()[0]; - const int b = gt_boxes.dims()[1]; + const int n = gt_box.dims()[0]; + const int b = gt_box.dims()[1]; const int anchor_num = anchors.size() / 2; - auto gt_boxes_t = EigenTensor::From(gt_boxes); + auto gt_box_t = EigenTensor::From(gt_box); + auto gt_label_t = EigenTensor::From(gt_label); auto obj_mask_t = EigenTensor::From(*obj_mask).setConstant(0); auto noobj_mask_t = EigenTensor::From(*noobj_mask).setConstant(1); auto tx_t = EigenTensor::From(*tx).setConstant(0.0); @@ -206,28 +208,27 @@ static void PreProcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, for (int i = 0; i < n; i++) { for (int j = 0; j < b; j++) { - if (isZero(gt_boxes_t(i, j, 0)) && isZero(gt_boxes_t(i, j, 1)) && - isZero(gt_boxes_t(i, j, 2)) && isZero(gt_boxes_t(i, j, 3)) && - isZero(gt_boxes_t(i, j, 4))) { + if (isZero(gt_box_t(i, j, 0)) && isZero(gt_box_t(i, j, 1)) && + isZero(gt_box_t(i, j, 2)) && isZero(gt_box_t(i, j, 3))) { continue; } - int gt_label = static_cast(gt_boxes_t(i, j, 0)); - T gx = gt_boxes_t(i, j, 1) * grid_size; - T gy = gt_boxes_t(i, j, 2) * grid_size; - T gw = gt_boxes_t(i, j, 3) * grid_size; - T gh = gt_boxes_t(i, j, 4) * grid_size; + int cur_label = gt_label_t(i, j); + T gx = gt_box_t(i, j, 0) * grid_size; + T gy = gt_box_t(i, j, 1) * grid_size; + T gw = gt_box_t(i, j, 2) * grid_size; + T gh = gt_box_t(i, j, 3) * grid_size; int gi = static_cast(gx); int gj = static_cast(gy); T max_iou = static_cast(0); T iou; int best_an_index = -1; - std::vector gt_box({0, 0, gw, gh}); + std::vector gt_box_shape({0, 0, gw, gh}); for (int an_idx = 0; an_idx < anchor_num; an_idx++) { std::vector anchor_shape({0, 0, static_cast(anchors[2 * an_idx]), static_cast(anchors[2 * an_idx + 1])}); - iou = CalcBoxIoU(gt_box, anchor_shape); + iou = CalcBoxIoU(gt_box_shape, anchor_shape); if (iou > max_iou) { max_iou = iou; best_an_index = an_idx; @@ -242,7 +243,7 @@ static void PreProcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, ty_t(i, best_an_index, gj, gi) = gy - gj; tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]); th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]); - tclass_t(i, best_an_index, gj, gi, gt_label) = 1; + tclass_t(i, best_an_index, gj, gi, cur_label) = 1; tconf_t(i, best_an_index, gj, gi) = 1; } } @@ -267,10 +268,10 @@ static void AddAllGradToInputGrad( Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y, const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x, const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h, - const Tensor& grad_conf_obj, const Tensor& grad_conf_noobj, - const Tensor& grad_class, const int class_num, const float lambda_xy, - const float lambda_wh, const float lambda_conf_obj, - const float lambda_conf_noobj, const float lambda_class) { + const Tensor& grad_conf_target, const Tensor& grad_conf_notarget, + const Tensor& grad_class, const int class_num, const float loss_weight_xy, + const float loss_weight_wh, const float loss_weight_conf_target, + const float loss_weight_conf_notarget, const float loss_weight_class) { const int n = pred_x.dims()[0]; const int an_num = pred_x.dims()[1]; const int h = pred_x.dims()[2]; @@ -285,8 +286,8 @@ static void AddAllGradToInputGrad( auto grad_y_t = EigenTensor::From(grad_y); auto grad_w_t = EigenTensor::From(grad_w); auto grad_h_t = EigenTensor::From(grad_h); - auto grad_conf_obj_t = EigenTensor::From(grad_conf_obj); - auto grad_conf_noobj_t = EigenTensor::From(grad_conf_noobj); + auto grad_conf_target_t = EigenTensor::From(grad_conf_target); + auto grad_conf_notarget_t = EigenTensor::From(grad_conf_notarget); auto grad_class_t = EigenTensor::From(grad_class); for (int i = 0; i < n; i++) { @@ -295,25 +296,26 @@ static void AddAllGradToInputGrad( for (int l = 0; l < w; l++) { grad_t(i, j * attr_num, k, l) = grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) * - (1.0 - pred_x_t(i, j, k, l)) * loss * lambda_xy; + (1.0 - pred_x_t(i, j, k, l)) * loss * loss_weight_xy; grad_t(i, j * attr_num + 1, k, l) = grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) * - (1.0 - pred_y_t(i, j, k, l)) * loss * lambda_xy; + (1.0 - pred_y_t(i, j, k, l)) * loss * loss_weight_xy; grad_t(i, j * attr_num + 2, k, l) = - grad_w_t(i, j, k, l) * loss * lambda_wh; + grad_w_t(i, j, k, l) * loss * loss_weight_wh; grad_t(i, j * attr_num + 3, k, l) = - grad_h_t(i, j, k, l) * loss * lambda_wh; + grad_h_t(i, j, k, l) * loss * loss_weight_wh; grad_t(i, j * attr_num + 4, k, l) = - grad_conf_obj_t(i, j, k, l) * pred_conf_t(i, j, k, l) * - (1.0 - pred_conf_t(i, j, k, l)) * loss * lambda_conf_obj; + grad_conf_target_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss * loss_weight_conf_target; grad_t(i, j * attr_num + 4, k, l) += - grad_conf_noobj_t(i, j, k, l) * pred_conf_t(i, j, k, l) * - (1.0 - pred_conf_t(i, j, k, l)) * loss * lambda_conf_noobj; + grad_conf_notarget_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss * + loss_weight_conf_notarget; for (int c = 0; c < class_num; c++) { grad_t(i, j * attr_num + 5 + c, k, l) = grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) * - (1.0 - pred_class_t(i, j, k, l, c)) * loss * lambda_class; + (1.0 - pred_class_t(i, j, k, l, c)) * loss * loss_weight_class; } } } @@ -326,16 +328,18 @@ class Yolov3LossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); - auto* gt_boxes = ctx.Input("GTBox"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); auto* loss = ctx.Output("Loss"); auto anchors = ctx.Attr>("anchors"); int class_num = ctx.Attr("class_num"); float ignore_thresh = ctx.Attr("ignore_thresh"); - float lambda_xy = ctx.Attr("lambda_xy"); - float lambda_wh = ctx.Attr("lambda_wh"); - float lambda_conf_obj = ctx.Attr("lambda_conf_obj"); - float lambda_conf_noobj = ctx.Attr("lambda_conf_noobj"); - float lambda_class = ctx.Attr("lambda_class"); + float loss_weight_xy = ctx.Attr("loss_weight_xy"); + float loss_weight_wh = ctx.Attr("loss_weight_wh"); + float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); + float loss_weight_conf_notarget = + ctx.Attr("loss_weight_conf_notarget"); + float loss_weight_class = ctx.Attr("loss_weight_class"); const int n = input->dims()[0]; const int h = input->dims()[2]; @@ -363,7 +367,7 @@ class Yolov3LossKernel : public framework::OpKernel { th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PreProcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, + PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); Tensor obj_mask_expand; @@ -375,15 +379,16 @@ class Yolov3LossKernel : public framework::OpKernel { T loss_y = CalcMSEWithMask(pred_y, ty, obj_mask); T loss_w = CalcMSEWithMask(pred_w, tw, obj_mask); T loss_h = CalcMSEWithMask(pred_h, th, obj_mask); - T loss_conf_obj = CalcBCEWithMask(pred_conf, tconf, obj_mask); - T loss_conf_noobj = CalcBCEWithMask(pred_conf, tconf, noobj_mask); + T loss_conf_target = CalcBCEWithMask(pred_conf, tconf, obj_mask); + T loss_conf_notarget = CalcBCEWithMask(pred_conf, tconf, noobj_mask); T loss_class = CalcBCEWithMask(pred_class, tclass, obj_mask_expand); auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); - loss_data[0] = - lambda_xy * (loss_x + loss_y) + lambda_wh * (loss_w + loss_h) + - lambda_conf_obj * loss_conf_obj + lambda_conf_noobj * loss_conf_noobj + - lambda_class * loss_class; + loss_data[0] = loss_weight_xy * (loss_x + loss_y) + + loss_weight_wh * (loss_w + loss_h) + + loss_weight_conf_target * loss_conf_target + + loss_weight_conf_notarget * loss_conf_notarget + + loss_weight_class * loss_class; } }; @@ -392,18 +397,20 @@ class Yolov3LossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); - auto* gt_boxes = ctx.Input("GTBox"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); auto anchors = ctx.Attr>("anchors"); int class_num = ctx.Attr("class_num"); float ignore_thresh = ctx.Attr("ignore_thresh"); auto* input_grad = ctx.Output(framework::GradVarName("X")); auto* output_grad = ctx.Input(framework::GradVarName("Loss")); const T loss = output_grad->data()[0]; - float lambda_xy = ctx.Attr("lambda_xy"); - float lambda_wh = ctx.Attr("lambda_wh"); - float lambda_conf_obj = ctx.Attr("lambda_conf_obj"); - float lambda_conf_noobj = ctx.Attr("lambda_conf_noobj"); - float lambda_class = ctx.Attr("lambda_class"); + float loss_weight_xy = ctx.Attr("loss_weight_xy"); + float loss_weight_wh = ctx.Attr("loss_weight_wh"); + float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); + float loss_weight_conf_notarget = + ctx.Attr("loss_weight_conf_notarget"); + float loss_weight_class = ctx.Attr("loss_weight_class"); const int n = input->dims()[0]; const int c = input->dims()[1]; @@ -432,7 +439,7 @@ class Yolov3LossGradKernel : public framework::OpKernel { th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PreProcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, + PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); Tensor obj_mask_expand; @@ -441,13 +448,13 @@ class Yolov3LossGradKernel : public framework::OpKernel { ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); Tensor grad_x, grad_y, grad_w, grad_h; - Tensor grad_conf_obj, grad_conf_noobj, grad_class; + Tensor grad_conf_target, grad_conf_notarget, grad_class; grad_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); grad_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); grad_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); grad_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_conf_obj.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_conf_noobj.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_conf_target.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_conf_notarget.mutable_data({n, an_num, h, w}, ctx.GetPlace()); grad_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); T obj_mf = CalcMaskPointNum(obj_mask); T noobj_mf = CalcMaskPointNum(noobj_mask); @@ -456,8 +463,9 @@ class Yolov3LossGradKernel : public framework::OpKernel { CalcMSEGradWithMask(&grad_y, pred_y, ty, obj_mask, obj_mf); CalcMSEGradWithMask(&grad_w, pred_w, tw, obj_mask, obj_mf); CalcMSEGradWithMask(&grad_h, pred_h, th, obj_mask, obj_mf); - CalcBCEGradWithMask(&grad_conf_obj, pred_conf, tconf, obj_mask, obj_mf); - CalcBCEGradWithMask(&grad_conf_noobj, pred_conf, tconf, noobj_mask, + CalcBCEGradWithMask(&grad_conf_target, pred_conf, tconf, obj_mask, + obj_mf); + CalcBCEGradWithMask(&grad_conf_notarget, pred_conf, tconf, noobj_mask, noobj_mf); CalcBCEGradWithMask(&grad_class, pred_class, tclass, obj_mask_expand, obj_expand_mf); @@ -465,8 +473,9 @@ class Yolov3LossGradKernel : public framework::OpKernel { input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); AddAllGradToInputGrad( input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y, - grad_w, grad_h, grad_conf_obj, grad_conf_noobj, grad_class, class_num, - lambda_xy, lambda_wh, lambda_conf_obj, lambda_conf_noobj, lambda_class); + grad_w, grad_h, grad_conf_target, grad_conf_notarget, grad_class, + class_num, loss_weight_xy, loss_weight_wh, loss_weight_conf_target, + loss_weight_conf_notarget, loss_weight_class); } }; diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 2bb9514803..cab5c3e2a4 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -409,32 +409,36 @@ def polygon_box_transform(input, name=None): @templatedoc(op_type="yolov3_loss") def yolov3_loss(x, gtbox, + gtlabel, anchors, class_num, ignore_thresh, - lambda_xy=None, - lambda_wh=None, - lambda_conf_obj=None, - lambda_conf_noobj=None, - lambda_class=None, + loss_weight_xy=None, + loss_weight_wh=None, + loss_weight_conf_target=None, + loss_weight_conf_notarget=None, + loss_weight_class=None, name=None): """ ${comment} Args: x (Variable): ${x_comment} - gtbox (Variable): groud truth boxes, shoulb be in shape of [N, B, 5], - in the third dimenstion, class_id, x, y, w, h should - be stored and x, y, w, h should be relative valud of - input image. + gtbox (Variable): groud truth boxes, should be in shape of [N, B, 4], + in the third dimenstion, x, y, w, h should be stored + and x, y, w, h should be relative value of input image. + N is the batch number and B is the max box number in + an image. + gtlabel (Variable): class id of ground truth boxes, shoud be ins shape + of [N, B]. anchors (list|tuple): ${anchors_comment} class_num (int): ${class_num_comment} ignore_thresh (float): ${ignore_thresh_comment} - lambda_xy (float|None): ${lambda_xy_comment} - lambda_wh (float|None): ${lambda_wh_comment} - lambda_conf_obj (float|None): ${lambda_conf_obj_comment} - lambda_conf_noobj (float|None): ${lambda_conf_noobj_comment} - lambda_class (float|None): ${lambda_class_comment} + loss_weight_xy (float|None): ${loss_weight_xy_comment} + loss_weight_wh (float|None): ${loss_weight_wh_comment} + loss_weight_conf_target (float|None): ${loss_weight_conf_target_comment} + loss_weight_conf_notarget (float|None): ${loss_weight_conf_notarget_comment} + loss_weight_class (float|None): ${loss_weight_class_comment} name (string): the name of yolov3 loss Returns: @@ -443,6 +447,7 @@ def yolov3_loss(x, Raises: TypeError: Input x of yolov3_loss must be Variable TypeError: Input gtbox of yolov3_loss must be Variable" + TypeError: Input gtlabel of yolov3_loss must be Variable" TypeError: Attr anchors of yolov3_loss must be list or tuple TypeError: Attr class_num of yolov3_loss must be an integer TypeError: Attr ignore_thresh of yolov3_loss must be a float number @@ -450,8 +455,9 @@ def yolov3_loss(x, Examples: .. code-block:: python - x = fluid.layers.data(name='x', shape=[10, 255, 13, 13], dtype='float32') - gtbox = fluid.layers.data(name='gtbox', shape=[10, 6, 5], dtype='float32') + x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') + gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') + gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') anchors = [10, 13, 16, 30, 33, 23] loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80 anchors=anchors, ignore_thresh=0.5) @@ -462,6 +468,8 @@ def yolov3_loss(x, raise TypeError("Input x of yolov3_loss must be Variable") if not isinstance(gtbox, Variable): raise TypeError("Input gtbox of yolov3_loss must be Variable") + if not isinstance(gtlabel, Variable): + raise TypeError("Input gtlabel of yolov3_loss must be Variable") if not isinstance(anchors, list) and not isinstance(anchors, tuple): raise TypeError("Attr anchors of yolov3_loss must be list or tuple") if not isinstance(class_num, int): @@ -482,21 +490,24 @@ def yolov3_loss(x, "ignore_thresh": ignore_thresh, } - if lambda_xy is not None and isinstance(lambda_xy, float): - self.attrs['lambda_xy'] = lambda_xy - if lambda_wh is not None and isinstance(lambda_wh, float): - self.attrs['lambda_wh'] = lambda_wh - if lambda_conf_obj is not None and isinstance(lambda_conf_obj, float): - self.attrs['lambda_conf_obj'] = lambda_conf_obj - if lambda_conf_noobj is not None and isinstance(lambda_conf_noobj, float): - self.attrs['lambda_conf_noobj'] = lambda_conf_noobj - if lambda_class is not None and isinstance(lambda_class, float): - self.attrs['lambda_class'] = lambda_class + if loss_weight_xy is not None and isinstance(loss_weight_xy, float): + self.attrs['loss_weight_xy'] = loss_weight_xy + if loss_weight_wh is not None and isinstance(loss_weight_wh, float): + self.attrs['loss_weight_wh'] = loss_weight_wh + if loss_weight_conf_target is not None and isinstance( + loss_weight_conf_target, float): + self.attrs['loss_weight_conf_target'] = loss_weight_conf_target + if loss_weight_conf_notarget is not None and isinstance( + loss_weight_conf_notarget, float): + self.attrs['loss_weight_conf_notarget'] = loss_weight_conf_notarget + if loss_weight_class is not None and isinstance(loss_weight_class, float): + self.attrs['loss_weight_class'] = loss_weight_class helper.append_op( type='yolov3_loss', - inputs={'X': x, - "GTBox": gtbox}, + inputs={"X": x, + "GTBox": gtbox, + "GTLabel": gtlabel}, outputs={'Loss': loss}, attrs=attrs) return loss diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 28dc751957..527fd521d5 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -366,5 +366,18 @@ class TestGenerateProposals(unittest.TestCase): print(rpn_rois.shape) +class TestYoloDetection(unittest.TestCase): + def test_yolov3_loss(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') + gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') + gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') + loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], 10, + 0.5) + + self.assertIsNotNone(loss) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index dd02968c30..f48d9c84f9 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -911,15 +911,6 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(data_1) print(str(program)) - def test_yolov3_loss(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') - gtbox = layers.data(name='gtbox', shape=[10, 5], dtype='float32') - loss = layers.yolov3_loss(x, gtbox, [10, 13, 30, 13], 10, 0.5) - - self.assertIsNotNone(loss) - def test_bilinear_tensor_product_layer(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 03a64055f0..335214b298 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -66,7 +66,7 @@ def box_iou(box1, box2): return inter_area / (b1_area + b2_area + inter_area) -def build_target(gtboxs, attrs, grid_size): +def build_target(gtboxs, gtlabel, attrs, grid_size): n, b, _ = gtboxs.shape ignore_thresh = attrs["ignore_thresh"] anchors = attrs["anchors"] @@ -87,11 +87,11 @@ def build_target(gtboxs, attrs, grid_size): if gtboxs[i, j, :].sum() == 0: continue - gt_label = int(gtboxs[i, j, 0]) - gx = gtboxs[i, j, 1] * grid_size - gy = gtboxs[i, j, 2] * grid_size - gw = gtboxs[i, j, 3] * grid_size - gh = gtboxs[i, j, 4] * grid_size + gt_label = gtlabel[i, j] + gx = gtboxs[i, j, 0] * grid_size + gy = gtboxs[i, j, 1] * grid_size + gw = gtboxs[i, j, 2] * grid_size + gh = gtboxs[i, j, 3] * grid_size gi = int(gx) gj = int(gy) @@ -121,7 +121,7 @@ def build_target(gtboxs, attrs, grid_size): return (tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask) -def YoloV3Loss(x, gtbox, attrs): +def YoloV3Loss(x, gtbox, gtlabel, attrs): n, c, h, w = x.shape an_num = len(attrs['anchors']) // 2 class_num = attrs["class_num"] @@ -134,7 +134,7 @@ def YoloV3Loss(x, gtbox, attrs): pred_cls = sigmoid(x[:, :, :, :, 5:]) tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask = build_target( - gtbox, attrs, x.shape[2]) + gtbox, gtlabel, attrs, x.shape[2]) obj_mask_expand = np.tile( np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num']))) @@ -142,73 +142,73 @@ def YoloV3Loss(x, gtbox, attrs): loss_y = mse(pred_y * obj_mask, ty * obj_mask, obj_mask.sum()) loss_w = mse(pred_w * obj_mask, tw * obj_mask, obj_mask.sum()) loss_h = mse(pred_h * obj_mask, th * obj_mask, obj_mask.sum()) - loss_conf_obj = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask) - loss_conf_noobj = bce(pred_conf * noobj_mask, tconf * noobj_mask, - noobj_mask) + loss_conf_target = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask) + loss_conf_notarget = bce(pred_conf * noobj_mask, tconf * noobj_mask, + noobj_mask) loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand, obj_mask_expand) - return attrs['lambda_xy'] * (loss_x + loss_y) \ - + attrs['lambda_wh'] * (loss_w + loss_h) \ - + attrs['lambda_conf_obj'] * loss_conf_obj \ - + attrs['lambda_conf_noobj'] * loss_conf_noobj \ - + attrs['lambda_class'] * loss_class + return attrs['loss_weight_xy'] * (loss_x + loss_y) \ + + attrs['loss_weight_wh'] * (loss_w + loss_h) \ + + attrs['loss_weight_conf_target'] * loss_conf_target \ + + attrs['loss_weight_conf_notarget'] * loss_conf_notarget \ + + attrs['loss_weight_class'] * loss_class class TestYolov3LossOp(OpTest): def setUp(self): - self.lambda_xy = 1.0 - self.lambda_wh = 1.0 - self.lambda_conf_obj = 1.0 - self.lambda_conf_noobj = 1.0 - self.lambda_class = 1.0 + self.loss_weight_xy = 1.0 + self.loss_weight_wh = 1.0 + self.loss_weight_conf_target = 1.0 + self.loss_weight_conf_notarget = 1.0 + self.loss_weight_class = 1.0 self.initTestCase() self.op_type = 'yolov3_loss' x = np.random.random(size=self.x_shape).astype('float32') gtbox = np.random.random(size=self.gtbox_shape).astype('float32') - gtbox[:, :, 0] = np.random.randint(0, self.class_num, - self.gtbox_shape[:2]) + gtlabel = np.random.randint(0, self.class_num, + self.gtbox_shape[:2]).astype('int32') self.attrs = { "anchors": self.anchors, "class_num": self.class_num, "ignore_thresh": self.ignore_thresh, - "lambda_xy": self.lambda_xy, - "lambda_wh": self.lambda_wh, - "lambda_conf_obj": self.lambda_conf_obj, - "lambda_conf_noobj": self.lambda_conf_noobj, - "lambda_class": self.lambda_class, + "loss_weight_xy": self.loss_weight_xy, + "loss_weight_wh": self.loss_weight_wh, + "loss_weight_conf_target": self.loss_weight_conf_target, + "loss_weight_conf_notarget": self.loss_weight_conf_notarget, + "loss_weight_class": self.loss_weight_class, } - self.inputs = {'X': x, 'GTBox': gtbox} + self.inputs = {'X': x, 'GTBox': gtbox, 'GTLabel': gtlabel} self.outputs = { - 'Loss': - np.array([YoloV3Loss(x, gtbox, self.attrs)]).astype('float32') + 'Loss': np.array( + [YoloV3Loss(x, gtbox, gtlabel, self.attrs)]).astype('float32') } def test_check_output(self): place = core.CPUPlace() self.check_output_with_place(place, atol=1e-3) - # def test_check_grad_ignore_gtbox(self): - # place = core.CPUPlace() - # self.check_grad_with_place( - # place, ['X'], - # 'Loss', - # no_grad_set=set("GTBox"), - # max_relative_error=0.06) + def test_check_grad_ignore_gtbox(self): + place = core.CPUPlace() + self.check_grad_with_place( + place, ['X'], + 'Loss', + no_grad_set=set("GTBox"), + max_relative_error=0.06) def initTestCase(self): self.anchors = [10, 13, 12, 12] self.class_num = 10 self.ignore_thresh = 0.5 self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7) - self.gtbox_shape = (5, 5, 5) - self.lambda_xy = 2.5 - self.lambda_wh = 0.8 - self.lambda_conf_obj = 1.5 - self.lambda_conf_noobj = 0.5 - self.lambda_class = 1.2 + self.gtbox_shape = (5, 10, 4) + self.loss_weight_xy = 2.5 + self.loss_weight_wh = 0.8 + self.loss_weight_conf_target = 1.5 + self.loss_weight_conf_notarget = 0.5 + self.loss_weight_class = 1.2 if __name__ == "__main__": From 853878cbf218728608a783260ae74c408ef4b8a2 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Fri, 16 Nov 2018 02:05:56 -0800 Subject: [PATCH 017/719] fix the wrong format test=develop --- python/paddle/fluid/average.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py index 42cd3b3642..40a734af31 100644 --- a/python/paddle/fluid/average.py +++ b/python/paddle/fluid/average.py @@ -48,6 +48,7 @@ class WeightedAverage(object): Examples: .. code-block:: python + avg = fluid.average.WeightedAverage() avg.add(value=2.0, weight=1) avg.add(value=4.0, weight=2) From 8ef6280c034602f776554432672d42b826afbaee Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 16 Nov 2018 19:14:40 +0800 Subject: [PATCH 018/719] Add operator double support. test=develop --- paddle/fluid/operators/yolov3_loss_op.cc | 10 ++++------ paddle/fluid/operators/yolov3_loss_op.h | 4 ++-- .../fluid/tests/unittests/test_yolov3_loss_op.py | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index 1d7f482362..e7597f7324 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -215,9 +215,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker, ops::Yolov3LossGradMaker); REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad); -REGISTER_OP_CPU_KERNEL( - yolov3_loss, - ops::Yolov3LossKernel); -REGISTER_OP_CPU_KERNEL( - yolov3_loss_grad, - ops::Yolov3LossGradKernel); +REGISTER_OP_CPU_KERNEL(yolov3_loss, ops::Yolov3LossKernel, + ops::Yolov3LossKernel); +REGISTER_OP_CPU_KERNEL(yolov3_loss_grad, ops::Yolov3LossGradKernel, + ops::Yolov3LossGradKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index a1072aca10..0bb285722d 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -323,7 +323,7 @@ static void AddAllGradToInputGrad( } } -template +template class Yolov3LossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -392,7 +392,7 @@ class Yolov3LossKernel : public framework::OpKernel { } }; -template +template class Yolov3LossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 335214b298..544fe4b4f8 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -195,7 +195,7 @@ class TestYolov3LossOp(OpTest): self.check_grad_with_place( place, ['X'], 'Loss', - no_grad_set=set("GTBox"), + no_grad_set=set(["GTBox", "GTLabel"]), max_relative_error=0.06) def initTestCase(self): From 79cec5311179e6e50b0126fea0e6dfa8a7cf354a Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Tue, 20 Nov 2018 12:37:04 +0000 Subject: [PATCH 019/719] add ignore index for sigmoid cross entropy with logits op, test=develop --- .../sigmoid_cross_entropy_with_logits_op.cc | 5 + .../sigmoid_cross_entropy_with_logits_op.h | 93 ++++++++++++++----- python/paddle/fluid/layers/nn.py | 5 +- .../fluid/tests/unittests/test_layers.py | 3 +- ...st_sigmoid_cross_entropy_with_logits_op.py | 35 +++++++ 5 files changed, 116 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 193de05422..d6a2fa6a17 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -100,6 +100,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker AddOutput("Out", "(Tensor, default Tensor), a 2-D tensor with shape N x D " " of elementwise logistic losses."); + AddAttr( + "ignore_index", + "(int, default -1), Specifies a target value that is ignored and" + "does not contribute to the input gradient.") + .SetDefault(-1); AddComment(R"DOC( SigmoidCrossEntropyWithLogits Operator. diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h index faef72866e..2bfba6f170 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h @@ -15,33 +15,82 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/legacy/utils/Logging.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +struct SigmoidCrossEntropyWithLogitsForward { + // EIGEN_EMPTY_STRUCT_CTOR(SigmoidCrossEntropyWithLogitsForward) + HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index) + : ignore_index(ignore_index) {} + + HOSTDEVICE T operator()(const T &x, const T &label) const { + if (static_cast(label) == ignore_index) { + return static_cast(0.); + } + T term1 = (x > 0) ? x : 0; + T term2 = x * label; + T term3 = std::log(static_cast(1) + std::exp(-(std::abs(x)))); + return term1 - term2 + term3; + } + + int ignore_index; +}; + +template +struct SigmoidCrossEntropyWithLogitsBackward { + // EIGEN_EMPTY_STRUCT_CTOR(SigmoidCrossEntropyWithLogitsForward) + HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index) + : ignore_index(ignore_index) {} + + HOSTDEVICE T operator()(const T &x, const T &label) const { + if (static_cast(label) == ignore_index) { + return static_cast(0.); + } + T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); + return simoid_x - label; + } + + int ignore_index; +}; + // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) template class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = context.Input("Label"); - framework::Tensor *Out = context.Output("Out"); + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + Tensor *Out = context.Output("Out"); Out->mutable_data(context.GetPlace()); + int ignore_index = context.Attr("ignore_index"); - auto x = framework::EigenVector::Flatten(*X); - auto labels = framework::EigenVector::Flatten(*Labels); - auto out = framework::EigenVector::Flatten(*Out); + auto x = EigenVector::Flatten(*X); + auto labels = EigenVector::Flatten(*Labels); + auto out = EigenVector::Flatten(*Out); auto &place = *context.device_context().eigen_device(); + out.device(place) = x.binaryExpr( + labels, SigmoidCrossEntropyWithLogitsForward(ignore_index)); // term1 = max(x, 0) - auto term1 = x.cwiseMax(static_cast(0)); + // auto term1 = x.cwiseMax(static_cast(0)); // term2 = x * labels - auto term2 = x * labels; + // auto term2 = x * labels; // term3 = log(1 + exp(-abs(x))) - auto term3 = (static_cast(1) + (-(x.abs())).exp()).log(); + // auto term3 = (static_cast(1) + (-(x.abs())).exp()).log(); - out.device(place) = term1 - term2 + term3; + // out.device(place) = term1 - term2 + term3; } }; @@ -50,23 +99,23 @@ template class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = context.Input("Label"); - const framework::Tensor *dOut = - context.Input(framework::GradVarName("Out")); - framework::Tensor *dX = - context.Output(framework::GradVarName("X")); + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + Tensor *dX = context.Output(framework::GradVarName("X")); dX->mutable_data(context.GetPlace()); - auto x = framework::EigenVector::Flatten(*X); - auto labels = framework::EigenVector::Flatten(*Labels); - auto dout = framework::EigenVector::Flatten(*dOut); - auto dx = framework::EigenVector::Flatten(*dX); + auto ignore_index = context.Attr("ignore_index"); + auto x = EigenVector::Flatten(*X); + auto labels = EigenVector::Flatten(*Labels); + auto dout = EigenVector::Flatten(*dOut); + auto dx = EigenVector::Flatten(*dX); auto &place = *context.template device_context().eigen_device(); - auto sigmoid_x = static_cast(1) / (static_cast(1) + (-x).exp()); - dx.device(place) = dout * (sigmoid_x - labels); + auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward( + static_cast(ignore_index))); + dx.device(place) = dout * diff; } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 99acd7e308..e032835de3 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -7892,13 +7892,14 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): @templatedoc() -def sigmoid_cross_entropy_with_logits(x, label, name=None): +def sigmoid_cross_entropy_with_logits(x, label, ignore_index=-1, name=None): """ ${comment} Args: x(${x_type}): ${x_comment} label(${label_type}): ${label_comment} + ignore_index(&{ignore_index}): ${ignore_index_comment} name(basestring|None): Name of the output. Returns: @@ -7917,7 +7918,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None): type="sigmoid_cross_entropy_with_logits", inputs={"X": x, "Label": label}, - attrs={}, + attrs={"ignore_index": ignore_index}, outputs={"Out": out}) return out diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index a8fa5436c4..8e098e4961 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -170,9 +170,10 @@ class TestBook(unittest.TestCase): with program_guard(program): dat = layers.data(name='data', shape=[10], dtype='float32') lbl = layers.data(name='label', shape=[10], dtype='float32') + ignore_index = -1 self.assertIsNotNone( layers.sigmoid_cross_entropy_with_logits( - x=dat, label=lbl)) + x=dat, label=lbl, ignore_index=-1)) print(str(program)) def test_hsigmoid(self): diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py index 97ff203499..64f6f088e1 100644 --- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py @@ -56,6 +56,40 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): """Test sigmoid_cross_entropy_with_logit_op with probabalistic label """ + def setUp(self): + self.op_type = "sigmoid_cross_entropy_with_logits" + batch_size = 64 + num_classes = 20 + ignore_index = -1 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, (batch_size, num_classes)) + .astype("float32")), + 'Label': np.random.randint(-1, 2, (batch_size, num_classes)) + .astype("float32") + } + self.attrs = {'ignore_index': ignore_index, } + # Fw Pass is implemented as elementwise sigmoid followed by + # elementwise logistic loss + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) + sigmoid_X = expit(self.inputs['X']) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) + out = -term1 - term2 + out[np.where(self.inputs['Label'] == ignore_index)] = 0 + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSigmoidCrossEntropyWithLogitsOp3(OpTest): + """Test sigmoid_cross_entropy_with_logit_op with probabalistic label + """ + def setUp(self): self.op_type = "sigmoid_cross_entropy_with_logits" batch_size = 64 @@ -85,3 +119,4 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): if __name__ == '__main__': unittest.main() + np.random.seed(0) From 13e254faedd2c464fa14057d90c66995b2b4f159 Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Tue, 20 Nov 2018 13:08:23 +0000 Subject: [PATCH 020/719] refine code, test=develop --- paddle/fluid/API.spec | 2 +- .../operators/sigmoid_cross_entropy_with_logits_op.cc | 4 ++-- .../operators/sigmoid_cross_entropy_with_logits_op.h | 10 ---------- python/paddle/fluid/layers/nn.py | 2 +- python/paddle/fluid/tests/unittests/test_layers.py | 2 +- .../test_sigmoid_cross_entropy_with_logits_op.py | 1 - 6 files changed, 5 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index da8941c351..f84ec4cb3e 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -174,7 +174,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index d6a2fa6a17..368988d60d 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -102,9 +102,9 @@ class SigmoidCrossEntropyWithLogitsOpMaker " of elementwise logistic losses."); AddAttr( "ignore_index", - "(int, default -1), Specifies a target value that is ignored and" + "(int, default -100), Specifies a target value that is ignored and" "does not contribute to the input gradient.") - .SetDefault(-1); + .SetDefault(-100); AddComment(R"DOC( SigmoidCrossEntropyWithLogits Operator. diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h index 2bfba6f170..b8731c2327 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h @@ -31,7 +31,6 @@ using EigenMatrix = framework::EigenMatrix; template struct SigmoidCrossEntropyWithLogitsForward { - // EIGEN_EMPTY_STRUCT_CTOR(SigmoidCrossEntropyWithLogitsForward) HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index) : ignore_index(ignore_index) {} @@ -50,7 +49,6 @@ struct SigmoidCrossEntropyWithLogitsForward { template struct SigmoidCrossEntropyWithLogitsBackward { - // EIGEN_EMPTY_STRUCT_CTOR(SigmoidCrossEntropyWithLogitsForward) HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index) : ignore_index(ignore_index) {} @@ -83,14 +81,6 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { out.device(place) = x.binaryExpr( labels, SigmoidCrossEntropyWithLogitsForward(ignore_index)); - // term1 = max(x, 0) - // auto term1 = x.cwiseMax(static_cast(0)); - // term2 = x * labels - // auto term2 = x * labels; - // term3 = log(1 + exp(-abs(x))) - // auto term3 = (static_cast(1) + (-(x.abs())).exp()).log(); - - // out.device(place) = term1 - term2 + term3; } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e032835de3..38da9173cc 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -7892,7 +7892,7 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): @templatedoc() -def sigmoid_cross_entropy_with_logits(x, label, ignore_index=-1, name=None): +def sigmoid_cross_entropy_with_logits(x, label, ignore_index=-100, name=None): """ ${comment} diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 8e098e4961..326938e115 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -173,7 +173,7 @@ class TestBook(unittest.TestCase): ignore_index = -1 self.assertIsNotNone( layers.sigmoid_cross_entropy_with_logits( - x=dat, label=lbl, ignore_index=-1)) + x=dat, label=lbl, ignore_index=ignore_index)) print(str(program)) def test_hsigmoid(self): diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py index 64f6f088e1..41797a241c 100644 --- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py @@ -119,4 +119,3 @@ class TestSigmoidCrossEntropyWithLogitsOp3(OpTest): if __name__ == '__main__': unittest.main() - np.random.seed(0) From 05917c3c7901e00266e4e57c0c043bcc5beecdf4 Mon Sep 17 00:00:00 2001 From: liuhongyu Date: Tue, 27 Nov 2018 15:35:18 +0800 Subject: [PATCH 021/719] add cudnn lstm; test=develop --- paddle/fluid/API.spec | 1 + paddle/fluid/framework/operator.h | 8 + paddle/fluid/operators/cudnn_lstm_op.cc | 204 +++++++++ paddle/fluid/operators/cudnn_lstm_op.cu.cc | 491 +++++++++++++++++++++ paddle/fluid/operators/cudnn_lstm_op.h | 42 ++ paddle/fluid/platform/dynload/cudnn.h | 18 +- python/paddle/fluid/layers/nn.py | 152 +++++++ 7 files changed, 915 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/cudnn_lstm_op.cc create mode 100644 paddle/fluid/operators/cudnn_lstm_op.cu.cc create mode 100644 paddle/fluid/operators/cudnn_lstm_op.h diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 0a71f15343..550c7ddeb4 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -187,6 +187,7 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.cudnn_lstm ArgSpec(args=['input', 'init_h', 'init_c', 'batch_size', 'max_len', 'dropout_prob', 'input_size', 'hidden_size', 'num_layers', 'is_bidirec', 'dtype', 'is_test', 'name', 'default_initializer', 'fix_seed', 'seed'], varargs=None, keywords=None, defaults=(False, 'float32', False, None, None, False, 0)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index bfdfdc56b3..06d3ee9e72 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -174,6 +174,14 @@ class ExecutionContext { return op_.Inputs(name).size(); } + const std::string InputVarName(const std::string& name) const { + return op_.Input(name); + } + + const std::string OutputVarName(const std::string& name) const { + return op_.Output(name); + } + size_t OutputSize(const std::string& name) const { return op_.Outputs(name).size(); } diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc new file mode 100644 index 0000000000..cadc5b8830 --- /dev/null +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -0,0 +1,204 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cudnn_lstm_op.h" +#include + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +class CudnnLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(Weight) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("InitH"), + "Input(init_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InitC"), + "Input(init_c) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Cache"), + "Input(Cache) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("last_h"), + "Output(last_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("last_c"), + "Output(last_c) of LSTM should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 3, "Input(X)'s rank must be 3."); + + ctx->SetOutputDim("Out", ctx->GetInputDim("Input")); + ctx->SetOutputDim("last_h", ctx->GetInputDim("InitH")); + ctx->SetOutputDim("last_c", ctx->GetInputDim("InitC")); + } +}; + +class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Input", + "(Tensor) RNN input tensor, which support variable-time length input " + "sequence." + "The shape of the Tensor MUST be ( seq_len * batch_size * input_size)" + "seq_len is the total time step in this mini-batch (CAN be change in " + "different batch)" + "batch_size is the instance number of this batch" + "input_size is the hidden size of the input." + "input_hidden_size and the hidden_size in the next may not be same"); + AddInput("InitH", + "(Tensor) the initial hidden state of the LSTM" + "input. This is a tensor with shape (num_layers x batch_size x " + "hidden_size)" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddInput("InitC", + "(Tensor) the initial cell state of the LSTm " + "input. This is a tensor with shape (num_layers x batch_size x " + "hidden_size)" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddInput("W", + "(Tensor) the learnable hidden-hidden weights." + " The shape is (N), where N is total weight size of the LSTM. " + " cudnn concatenate all the weight to one Tensor"); + AddInput("Cache", + "The cache of dropout op, a RAW type variable including random " + "number generator states and some descriptors, which is used in " + "cudnn kernel.") + .AsDispensable(); + AddOutput("Out", + "(Tensor) the hidden state of LSTM operator. " + "The shape is ( seq_len x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirec is True, the shape will be ( seq_len x " + "batch_size x hidden_size * 2) "); + AddOutput("last_h", + "(Tensor) the hidden state of the last step. " + "The shape is ( num_layers x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddOutput("last_c", + "(Tensor) the cell state of the last step" + "The shape is ( num_layers x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirect is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size*2)"); + AddAttr("max_len", + "max length of the LSTM op" + "the first dim of the Input can NOT be greater than max_len") + .SetDefault(20); + AddAttr( + "dropout_prob", + "dropout prob of the dropout op" + "the dropout ONLY work between lstm layers, not between time steps" + "There is no dropout work on the Out tensor") + .SetDefault(0.0); + AddAttr("is_bidirec", + "is_bidirec" + "if it is bidirection rnn" + "The will affect the shape of the Out, last_h, and last_c") + .SetDefault(false); + AddAttr("input_size", "input size ot the Input Tensor").SetDefault(10); + AddAttr("batch_size", "the instance number the batch").SetDefault(10); + AddAttr("hidden_size", "hidden size of the LSTM").SetDefault(100); + AddAttr("num_layers", "the total layer number of the LSTM") + .SetDefault(1); + AddAttr("is_test", "True if in test phase.").SetDefault(false); + AddAttr("fix_seed", "True if it fix dropout seed").SetDefault(false); + AddAttr("seed", "seed to used if fix_seed is True").SetDefault(0); + AddComment(R"DOC( +CUDNN LSTM implementation + +A four-gate Long Short-Term Memory network with no peephole connections. +In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, +the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: + +it = σ(Wi X xt + Ri X ht-1 + bWi + bRi) +ft = σ(Wf X xt + Rf X ht-1 + bWf + bRf) +ot = σ(Wo X xt + Ro X ht-1 + bWo + bRo) +c't = tanh(Wc X xt + Rc X ht-1 + bWc + bRc) +ct = ft * ct-1 + it * c't +ht = ot * tanh(ct) + +Where σ is the sigmoid operator: σ(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, +X represensts a matrix multiplication +and tanh is the hyperbolic tangent function. it, ft, ot, c't represent the input, forget, output and new gates respectively. + + +)DOC"); + } +}; + +class CudnnLSTMGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("last_h"), + "Input(last_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("last_c"), + "Input(last_c) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("Cache"), + "Input(last_c) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InitH"), + "Input(init_h) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("InitC"), + "Input(init_c) of LSTM should not be null."); + + auto SetOutGradDim = [&ctx](const std::string& name) { + auto g_name = framework::GradVarName(name); + if (ctx->HasOutput(g_name)) { + ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); + } + }; + + SetOutGradDim("Input"); + SetOutGradDim("W"); + SetOutGradDim("InitH"); + SetOutGradDim("InitC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp); + +REGISTER_OP_CPU_KERNEL( + cudnn_lstm, + ops::CudnnLSTMKernel); + +REGISTER_OP_CPU_KERNEL( + cudnn_lstm_grad, + ops::CudnnLSTMGradKernel); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc new file mode 100644 index 0000000000..9caf65b53f --- /dev/null +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -0,0 +1,491 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cudnn_lstm_op.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +struct CudnnRNNCache { + CudnnRNNCache() { + x_desc_ = NULL; + y_desc_ = NULL; + dx_desc_ = NULL; + dy_desc_ = NULL; + } + ~CudnnRNNCache() { release(); } + + cudnnRNNDescriptor_t rnn_desc_; + cudnnTensorDescriptor_t *x_desc_; + cudnnTensorDescriptor_t *y_desc_; + cudnnTensorDescriptor_t *dx_desc_; + cudnnTensorDescriptor_t *dy_desc_; + + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnTensorDescriptor_t dhx_desc_; + cudnnTensorDescriptor_t dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_; + cudnnTensorDescriptor_t dcy_desc_; + + cudnnTensorDescriptor_t output_x_desc_; + cudnnTensorDescriptor_t output_y_desc_; + + cudnnDropoutDescriptor_t dropout_desc_; + + size_t weights_size_; + cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t dw_desc_; + + size_t workspace_size_; + size_t reserve_size_; + Tensor reserve_data_; + Tensor workspace_data_; + + Tensor dropout_state_; + + size_t max_length_; + + float dropout_prob_; + bool is_bidirec_; + + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + int seed_; + + void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx, + size_t max_len, int batch_size, int input_size, int hidden_size, + int num_layers, float dropout_prob, bool is_bidirec, int seed, + int weight_numel) { + max_length_ = max_len; + batch_size_ = batch_size; + input_size_ = input_size; + hidden_size_ = hidden_size; + num_layers_ = num_layers; + dropout_prob_ = dropout_prob; + is_bidirec_ = is_bidirec; + seed_ = seed; + + x_desc_ = new cudnnTensorDescriptor_t[max_length_]; + y_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; + int dim_a[3]; + int stride_a[3]; + + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); + dim_a[0] = batch_size_; + dim_a[1] = input_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + dim_a[0] = batch_size_; + dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + } + + dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); + dim_a[1] = batch_size_; + dim_a[2] = hidden_size_; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); + + size_t state_size; + CUDNN_ENFORCE( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); + dropout_state_.Resize({static_cast(state_size)})); + auto *dropout_state_data = + dropout_state_.mutable_data(ctx.GetPlace()); + CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( + dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, + seed_)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); + + PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, + "cudnn lstm weight size should be SAME"); + int dim_w[3]; + dim_w[0] = weights_size_ / sizeof(float); + dim_w[1] = 1; + dim_w[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); + + reserve_data_.Resize({static_cast(reserve_size_)}); + reserve_data_.mutable_data(ctx.GetPlace()); + + workspace_data_.Resize({static_cast(workspace_size_)}); + workspace_data_.mutable_data(ctx.GetPlace()); + } + + void release() { + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); + } + + delete[] x_desc_; + delete[] y_desc_; + delete[] dx_desc_; + delete[] dy_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); + } +}; + +template +class CudnnLSTMGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const Tensor *x = ctx.Input("Input"); + const Tensor *init_h = ctx.Input("InitH"); + const Tensor *init_c = ctx.Input("InitC"); + + auto w = ctx.Input("W"); + + Tensor *out = ctx.Output("Out"); + Tensor *last_h = ctx.Output("last_h"); + Tensor *last_c = ctx.Output("last_c"); + + const T *x_data = x->data(); + const T *init_h_data = init_h->data(); + const T *init_c_data = init_c->data(); + + const T *w_data = w->data(); + + T *out_data = out->mutable_data(ctx.GetPlace()); + T *last_h_data = last_h->mutable_data(ctx.GetPlace()); + T *last_c_data = last_c->mutable_data(ctx.GetPlace()); + + size_t max_len = ctx.Attr("max_len"); + float dropout_prob = ctx.Attr("dropout_prob"); + bool is_bidirec = ctx.Attr("is_bidirec"); + int batch_size = ctx.Attr("batch_size"); + int input_size = ctx.Attr("input_size"); + int hidden_size = ctx.Attr("hidden_size"); + int num_layers = ctx.Attr("num_layers"); + bool is_test = ctx.Attr("is_test"); + + /* + if (is_test) { + TensorCopy(*x, ctx.GetPlace(), out); + return; + }*/ + + auto &dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto *cache_var = ctx.InputVar("Cache"); + if (!cache_var) { + // The RAW type cache variable wouldn't be created and broadcasted on + // multi-devices before the first running. + // use parent scope to make cache persistable + auto *scope = const_cast(ctx.scope().parent()); + auto cache_var_name = ctx.InputVarName("Cache"); + cache_var = scope->Var(cache_var_name); + } + CudnnRNNCache *cudnn_rnn_cache = nullptr; + if (cache_var->IsInitialized()) { + cudnn_rnn_cache = const_cast(cache_var) + ->GetMutable(); + } else { + cudnn_rnn_cache = const_cast(cache_var) + ->GetMutable(); + std::random_device rnd; + int seed = ctx.Attr("fix_seed") ? ctx.Attr("seed") : rnd(); + + auto input_w_numel = w->numel(); + cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size, + hidden_size, num_layers, dropout_prob, is_bidirec, + seed, input_w_numel); + } + + auto run_seq_len = x->dims()[0]; + + if (is_test) { + // for inference + CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardInference( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, + init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, + cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, + cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, + last_c_data, cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_)); + } else { + // for train + CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardTraining( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, + init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, + cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, + cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, + last_c_data, cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_, + cudnn_rnn_cache->reserve_data_.data(), + cudnn_rnn_cache->reserve_size_)); + } + } +}; + +template +class CudnnLSTMGPUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *input = ctx.Input("Input"); + auto *weight = ctx.Input("W"); + auto *init_h = ctx.Input("InitH"); + auto *init_c = ctx.Input("InitC"); + // auto * last_h = ctx.Input("last_h"); + // auto * last_c = ctx.Input("last_c"); + auto *out = ctx.Input("Out"); + auto *out_grad = ctx.Input(framework::GradVarName("Out")); + auto *last_h_grad = ctx.Input(framework::GradVarName("last_h")); + auto *last_c_grad = ctx.Input(framework::GradVarName("last_c")); + + // auto* init_h = ctx.Input("init_h"); + // auto* init_c = ctx.Input("init_c"); + + auto *in_grad = ctx.Output(framework::GradVarName("Input")); + auto *weight_grad = ctx.Output(framework::GradVarName("W")); + auto *init_h_grad = ctx.Output(framework::GradVarName("InitH")); + auto *init_c_grad = ctx.Output(framework::GradVarName("InitC")); + + auto &dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto *cache_var = ctx.InputVar("Cache"); + PADDLE_ENFORCE(cache_var->IsInitialized()); + CudnnRNNCache *cudnn_rnn_cache = + const_cast(cache_var) + ->GetMutable(); + + auto input_dims = input->dims(); + auto weight_dims = weight->dims(); + auto init_h_dims = init_h->dims(); + auto init_c_dims = init_c->dims(); + in_grad->mutable_data(ctx.GetPlace()); + weight_grad->mutable_data(ctx.GetPlace()); + math::SetConstant zero; + zero(dev_ctx, in_grad, static_cast(0.0)); + zero(dev_ctx, weight_grad, static_cast(0.0)); + + T *init_h_grad_data = NULL; + if (init_h_grad == nullptr) { + Tensor init_h_grad_temp; + init_h_grad_temp.mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, &init_h_grad_temp, static_cast(0.0)); + + init_h_grad_data = init_h_grad_temp.data(); + } else { + init_h_grad->mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, init_h_grad, static_cast(0.0)); + init_h_grad_data = init_h_grad->data(); + } + + T *init_c_grad_data = NULL; + if (init_c_grad == nullptr) { + Tensor init_c_grad_temp; + init_c_grad_temp.mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, &init_c_grad_temp, static_cast(0.0)); + + init_c_grad_data = init_c_grad_temp.data(); + } else { + init_c_grad->mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, init_c_grad, static_cast(0.0)); + init_c_grad_data = init_c_grad->data(); + } + + const T *last_h_grad_data = NULL; + if (last_h_grad == nullptr) { + Tensor last_h_grad_temp; + last_h_grad_temp.mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, &last_h_grad_temp, static_cast(0.0)); + + last_h_grad_data = (const T *)last_h_grad_temp.data(); + } else { + last_h_grad_data = last_h_grad->data(); + } + + const T *last_c_grad_data = NULL; + if (last_c_grad == nullptr) { + Tensor last_c_grad_temp; + last_c_grad_temp.mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, &last_c_grad_temp, static_cast(0.0)); + + last_c_grad_data = (const T *)last_c_grad_temp.data(); + } else { + last_c_grad_data = last_c_grad->data(); + } + + const T *out_grad_data = NULL; + if (out_grad == nullptr) { + Tensor out_grad_temp; + out_grad_temp.mutable_data(out->dims(), ctx.GetPlace()); + zero(dev_ctx, &out_grad_temp, static_cast(0.0)); + + out_grad_data = (const T *)out_grad_temp.data(); + } else { + out_grad_data = out_grad->data(); + } + + // zero( dev_ctx, last_h_grad, static_cast(0.0)); + // zero( dev_ctx, last_c_grad, static_cast(0.0)); + + auto out_data = out->data(); + // auto out_grad_data = out_grad->data(); + auto weight_data = weight->data(); + auto init_h_data = init_h->data(); + auto init_c_data = init_c->data(); + auto in_grad_data = in_grad->data(); + + auto work_data = cudnn_rnn_cache->workspace_data_.data(); + auto reserve_data = cudnn_rnn_cache->reserve_data_.data(); + + auto run_seq_len = input_dims[0]; + PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_, + "cudnn running seq_len CAN not greater max_lengh"); + CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardData( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_, + out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data, + cudnn_rnn_cache->dcy_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_, + weight_data, cudnn_rnn_cache->hx_desc_, init_h_data, + cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->dx_desc_, + in_grad_data, cudnn_rnn_cache->dhx_desc_, init_h_grad_data, + cudnn_rnn_cache->dcx_desc_, init_c_grad_data, work_data, + cudnn_rnn_cache->workspace_size_, reserve_data, + cudnn_rnn_cache->reserve_size_)); + + CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardWeights( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, input->data(), cudnn_rnn_cache->hx_desc_, + init_h->data(), cudnn_rnn_cache->y_desc_, out->data(), + cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->dw_desc_, + weight_grad->data(), cudnn_rnn_cache->reserve_data_.data(), + cudnn_rnn_cache->reserve_size_)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + cudnn_lstm, + ops::CudnnLSTMGPUKernel); +REGISTER_OP_CUDA_KERNEL( + cudnn_lstm_grad, + ops::CudnnLSTMGPUGradKernel); diff --git a/paddle/fluid/operators/cudnn_lstm_op.h b/paddle/fluid/operators/cudnn_lstm_op.h new file mode 100644 index 0000000000..fb4b37e46e --- /dev/null +++ b/paddle/fluid/operators/cudnn_lstm_op.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +class CudnnLSTMKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override {} +}; + +template +class CudnnLSTMGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override {} +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index db62377898..213cd8a9ce 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -111,7 +111,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnFindConvolutionForwardAlgorithmEx); \ __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ - __macro(cudnnGetErrorString); + __macro(cudnnGetErrorString); \ + __macro(cudnnCreateDropoutDescriptor); \ + __macro(cudnnDropoutGetStatesSize); \ + __macro(cudnnSetDropoutDescriptor); \ + __macro(cudnnCreateRNNDescriptor); \ + __macro(cudnnSetRNNDescriptor); \ + __macro(cudnnGetRNNParamsSize); \ + __macro(cudnnGetRNNWorkspaceSize); \ + __macro(cudnnGetRNNTrainingReserveSize); \ + __macro(cudnnRNNForwardTraining); \ + __macro(cudnnRNNBackwardData); \ + __macro(cudnnRNNBackwardWeights); \ + __macro(cudnnRNNForwardInference); \ + __macro(cudnnDestroyDropoutDescriptor); \ + __macro(cudnnDestroyRNNDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); + CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 7af1f380e7..abb82e7505 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -169,6 +169,7 @@ __all__ = [ 'log_loss', 'add_position_encoding', 'bilinear_tensor_product', + 'cudnn_lstm', ] @@ -466,6 +467,157 @@ def dynamic_lstm(input, return hidden, cell +def cudnn_lstm(input, + init_h, + init_c, + batch_size, + max_len, + dropout_prob, + input_size, + hidden_size, + num_layers, + is_bidirec=False, + dtype='float32', + is_test=False, + name=None, + default_initializer=None, + fix_seed=False, + seed=0): + """ + CUDNN LSTM implementation + + A four-gate Long Short-Term Memory network with no peephole connections. + In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, + the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: + + it = sigmoid(Wi X xt + Ri X ht-1 + bWi + bRi) + ft = sigmoid(Wf X xt + Rf X ht-1 + bWf + bRf) + ot = sigmoid(Wo X xt + Ro X ht-1 + bWo + bRo) + c't = tanh(Wc X xt + Rc X ht-1 + bWc + bRc) + ct = ft * ct-1 + it * c't + ht = ot * tanh(ct) + + Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, + X represensts a matrix multiplication + and tanh is the hyperbolic tangent function. it, ft, ot, c't represent the input, forget, output and new gates respectively. + + + Args: + input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size ) + init_h(Variable): The initial hidden state of the LSTM + This is a tensor with shape ( num_layers x batch_size x hidden_size) + if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) + init_c(Variable): The initial cell state of the LSTM. + This is a tensor with shape ( num_layers x batch_size x hidden_size ) + if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) + batch_size (int): total distance numer of the batch + max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len + dropout_prob(float): dropout prob, dropout ONLY work between rnn layers, NOT between time steps + There is NO dropout work on rnn output of the last RNN layers + input_size (int): hidden size of the input tensor + hidden_size (int): hidden size of the LSTM + num_layers (int): total layers number of the LSTM + is_bidirec (bool): If it is bidirectional + dtype (str): Data type. Choices = ["float32", "float64"], default "float32". + is_test (bool): If it is in test phrase + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + default_initializer(Initialize|None): Where use initializer to initialize the Weight + If set None, defaule initializer will be used + + + Returns: + rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) + if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) + last_h(Tensor): the hidden state of the last step of LSTM + shape is ( num_layers x batch_size x hidden_size ) + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + last_c(Tensor): the cell state of the last step of LSTM + shape is ( num_layers x batch_size x hidden_size ) + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + + + Examples: + .. code-block:: python + + input = embedding + batch_size = 20 + max_len = 100 + dropout_prob = 0.2 + input_size = 100 + hidden_size = 150 + num_layers = 1 + init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False) + init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False) + + rnn_out, last_h, last_c = layers.cudnn_lstm( input, init_h, init_c, batch_size, \ + max_len, dropout_prob, input_size, hidden_size, \ + num_layers) + """ + + helper = LayerHelper('cudnn_lstm', **locals()) + + weight_size = 0 + for i in range(num_layers): + if i == 0: + input_weight_size = (input_size * hidden_size) * 4 + else: + if is_bidirec: + input_weight_size = (hidden_size * 2 * hidden_size) * 4 + else: + input_weight_size = (hidden_size * hidden_size) * 4 + + hidden_weight_size = (hidden_size * hidden_size) * 4 + + if is_bidirec: + weight_size += (input_weight_size + hidden_weight_size) * 2 + weight_size += hidden_size * 8 * 2 + else: + weight_size += input_weight_size + hidden_weight_size + weight_size += hidden_size * 8 + + weight = helper.create_parameter( + attr=helper.param_attr, + shape=[weight_size], + dtype=dtype, + default_initializer=default_initializer) + + out = helper.create_variable_for_type_inference(dtype) + last_h = helper.create_variable_for_type_inference(dtype) + last_c = helper.create_variable_for_type_inference(dtype) + + cache = helper.create_variable( + persistable=True, type=core.VarDesc.VarType.RAW, stop_gradient=True) + + helper.append_op( + type='cudnn_lstm', + inputs={ + 'Input': input, + 'InitH': init_h, + 'InitC': init_c, + 'W': weight, + 'Cache': cache, + }, + outputs={ + 'Out': out, + 'last_h': last_h, + 'last_c': last_c, + }, + attrs={ + 'max_len': max_len, + 'is_bidirec': is_bidirec, + 'input_size': input_size, + 'batch_size': batch_size, + 'hidden_size': hidden_size, + 'num_layers': num_layers, + 'is_test': is_test, + 'dropout_prob': dropout_prob, + 'fix_seed': fix_seed, + 'seed': seed, + }) + return out, last_h, last_c + + def dynamic_lstmp(input, size, proj_size, From 084ff6574d8122afa256651aec7abb55260220f0 Mon Sep 17 00:00:00 2001 From: phlrain Date: Tue, 27 Nov 2018 16:40:26 +0800 Subject: [PATCH 022/719] add cudnn lstm; test=develop --- python/paddle/fluid/layers/nn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ef46bbb271..d2df819c08 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -525,6 +525,9 @@ def cudnn_lstm(input, default_initializer(Initialize|None): Where use initializer to initialize the Weight If set None, defaule initializer will be used + fix_seed(bool): If it's True, fix seed will used for dropout in LSTM + seed(int): If fix_seed is True, dropout seed in LSTM will use this seed + Returns: rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) From fe915901cdb7c0d55fe13890e3afafcce4cddbf9 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Wed, 28 Nov 2018 20:54:46 +0800 Subject: [PATCH 023/719] update Opdesc's HasAttr test=develop --- paddle/fluid/framework/op_desc.cc | 11 +++++++++++ paddle/fluid/framework/op_desc.h | 4 +--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index e8ecd90502..a31c5336a1 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -237,6 +237,17 @@ void OpDesc::SetOutput(const std::string ¶m_name, this->outputs_[param_name] = args; } +bool OpDesc::HasAttr(const std::string &name) const { + const proto::OpProto &proto = OpInfoMap::Instance().Get(desc_.type()).Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return true; + } + } + return false; +} + proto::AttrType OpDesc::GetAttrType(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 30c8a26c3d..3da7cdcef3 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -61,9 +61,7 @@ class OpDesc { void SetOutput(const std::string ¶m_name, const std::vector &args); - bool HasAttr(const std::string &name) const { - return attrs_.find(name) != attrs_.end(); - } + bool HasAttr(const std::string &name) const; proto::AttrType GetAttrType(const std::string &name) const; From 1d19eb2bd440bd391d292d207fe7c8351fce4d99 Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Thu, 15 Nov 2018 16:48:01 -0800 Subject: [PATCH 024/719] Implemented ngraph engine test=develop --- paddle/fluid/framework/CMakeLists.txt | 3 +- paddle/fluid/framework/ngraph_bridge.cc | 88 +++++- paddle/fluid/framework/ngraph_bridge.h | 14 +- paddle/fluid/framework/ngraph_operator.cc | 363 +++++++++++++++++++++- paddle/fluid/framework/ngraph_operator.h | 7 +- 5 files changed, 449 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 52946c7f11..94b3af9159 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -126,8 +126,9 @@ cc_library(version SRCS version.cc) cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto) + if(NOT WIN32) +cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog shape_inference data_transform lod_tensor profiler) endif(NOT WIN32) diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index 8177436d0b..45ef0211ad 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -15,23 +15,105 @@ limitations under the License. */ #ifdef PADDLE_WITH_NGRAPH #include #include +#include #include "paddle/fluid/framework/ngraph_bridge.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" #include "ngraph/ngraph.hpp" namespace paddle { namespace framework { +static std::shared_ptr GetNode( + const std::shared_ptr& op, const std::string prm, + const VariableNameMap& var_map, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto& var_names = var_map.at(prm); + PADDLE_ENFORCE_EQ(var_names.size(), 1, + "op %s prm %s expects one associated var", op->Type(), prm); + if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) { + return (*ngb_node_map)[var_names[0]]; + } else { + return nullptr; + } +} + +static std::shared_ptr GetInputNode( + const std::shared_ptr& op, const std::string prm, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + return GetNode(op, prm, op->Inputs(), ngb_node_map); +} + +static std::shared_ptr GetOutputNode( + const std::shared_ptr& op, const std::string prm, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + return GetNode(op, prm, op->Outputs(), ngb_node_map); +} + +static void SetOutputNode( + const std::shared_ptr& op, const std::string prm, + std::shared_ptr node, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto& var_names = op->Outputs().at(prm); + if (var_names.size() == 1) { + (*ngb_node_map)[var_names[0]] = node; + } else if (var_names.size() == 0) { + (*ngb_node_map)[""] = node; + } else { + PADDLE_THROW("prm %s has more than 1 var_names.", prm); + } +} + +static bool HasOutput(const std::shared_ptr& op, + const std::string prm) { + auto& outputs = op->Outputs(); + if (outputs.find(prm) == outputs.end()) return false; + return outputs.at(prm).size() > 0; +} + +template +static void BuildBinaryNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto x = GetInputNode(op, "X", ngb_node_map); + auto y = GetInputNode(op, "Y", ngb_node_map); + auto out = std::make_shared(x, y); + SetOutputNode(op, "Out", out, ngb_node_map); +} + +template +static void BuildUnaryNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto input = GetInputNode(op, "X", ngb_node_map); + auto out = std::make_shared(input); + SetOutputNode(op, "Out", out, ngb_node_map); +} + std::map&, std::shared_ptr>>)>> - NgraphBridge::NG_NODE_MAP = {}; + NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode}, + {"tanh", BuildUnaryNode}}; -void NgraphBridge::build_graph(const std::shared_ptr& op) { +void NgraphBridge::BuildNgGraph(const std::shared_ptr& op) { auto& op_type = op->Type(); - NG_NODE_MAP[op_type](op, ngb_node_map); + NG_NODE_MAP[op_type](op, ngb_node_map_); } } // namespace framework diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h index 55bf0d21f3..3cf62b6daa 100644 --- a/paddle/fluid/framework/ngraph_bridge.h +++ b/paddle/fluid/framework/ngraph_bridge.h @@ -20,16 +20,14 @@ limitations under the License. */ #include #include #include -#include -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" - -#include "ngraph/ngraph.hpp" +#include "ngraph/node.hpp" namespace paddle { namespace framework { +class OperatorBase; + class NgraphBridge { public: static std::map< @@ -43,14 +41,14 @@ class NgraphBridge { std::shared_ptr< std::unordered_map>> var_node_map) - : ngb_node_map(var_node_map) {} + : ngb_node_map_(var_node_map) {} - void build_graph(const std::shared_ptr& op); + void BuildNgGraph(const std::shared_ptr& op); private: std::shared_ptr< std::unordered_map>> - ngb_node_map; + ngb_node_map_; }; } // namespace framework diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index d967b2780c..e9ff051355 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -19,14 +19,29 @@ limitations under the License. */ #include #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/ngraph_operator.h" -#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_type.h" +#include "ngraph/ngraph.hpp" + namespace paddle { namespace framework { +static ngraph::Shape Ddim2Shape(const DDim& dims) { + ngraph::Shape sp; + for (int i = 0; i < dims.size(); ++i) { + int k = dims[i]; + k = k == 0 ? 1 : k; + sp.push_back(k); + } + return sp; +} + static std::map pd2ng_type_map = { {proto::VarType::FP32, ngraph::element::f32}, {proto::VarType::FP64, ngraph::element::f64}, @@ -59,13 +74,23 @@ class NgraphOperator { persistables_(persist), fetches_(fetches), post_op_inputs_(post_op_inputs), - ng_op_state_(ng_op_state) {} + ng_op_state_(ng_op_state) { + var_in_node_map_ = std::make_shared< + std::unordered_map>>(); + + var_node_map_ = std::make_shared< + std::unordered_map>>(); + + BuildNgIO(); + + GetNgFunction(); + } void Run(const Scope& scope, const platform::Place& place) const; private: static std::unordered_map> - func_cache; + func_cache_; const Scope& scope_; const platform::Place& place_; std::vector> fused_ops_; @@ -74,6 +99,35 @@ class NgraphOperator { std::unordered_set fetches_; std::unordered_set post_op_inputs_; op_state ng_op_state_; + + static std::shared_ptr backend_; + + std::shared_ptr ngraph_function_; + // var_name of inputs + std::vector var_in_; + // var_name of outputs from fetch in order + std::vector var_out_; + + std::shared_ptr< + std::unordered_map>> + var_in_node_map_; + + // map each var name with a ngraph node + std::shared_ptr< + std::unordered_map>> + var_node_map_; + + std::shared_ptr GetCacheKey(); + + void GetNgInputShape(std::shared_ptr op); + + void BuildNgNode(); + + void BuildNgIO(); + + void BuildNgFunction(); + + void GetNgFunction(); }; std::vector>::iterator>> @@ -86,7 +140,7 @@ FusedOperator::FusedOpIntervals( } size_t size = ops->size(); size_t left = 0; - while (left < size && ops.at(left)->Type() != kFeedOpType) { + while (left < size && ops->at(left)->Type() != kFeedOpType) { ++left; } if (left == size) { @@ -116,7 +170,7 @@ FusedOperator::FusedOpIntervals( size_t start = pivot, end = start; while (pivot < right && (paddle::framework::NgraphBridge::NG_NODE_MAP.find( - ops.at(pivot)->Type()) != + ops->at(pivot)->Type()) != paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { ++pivot; ++end; @@ -136,7 +190,9 @@ FusedOperator::FusedOperator( std::vector>::iterator end, const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) { + : OperatorBase(type, inputs, outputs, attrs), + pdesc_(prog), + block_(block_id) { for (std::vector>::iterator it = start; it != end; ++it) { fused_ops_.push_back(std::move(*it)); @@ -152,7 +208,7 @@ FusedOperator::FusedOperator( } if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { - is_complete = true; + is_full_ = true; } Process(); @@ -205,7 +261,7 @@ void FusedOperator::RunImpl(const Scope& scope, } } - if (is_full) { + if (is_full_) { ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; } @@ -215,6 +271,297 @@ void FusedOperator::RunImpl(const Scope& scope, ngraph_op.Run(scope, place); } +std::unordered_map> + NgraphOperator::func_cache_ = {}; + +std::shared_ptr NgraphOperator::backend_ = + ngraph::runtime::Backend::create("CPU"); + +void NgraphOperator::GetNgInputShape(std::shared_ptr op) { + RuntimeInferShapeContext infer_shape_ctx(*op, scope_); + std::shared_ptr op_k = + std::dynamic_pointer_cast(op); + op_k->InferShape(&infer_shape_ctx); + + for (auto& var_name_item : op->Inputs()) { + std::vector vshape; + auto& var_prm_name = var_name_item.first; + auto var_name_size = var_name_item.second.size(); + if (var_name_size == 1) { + auto dim = infer_shape_ctx.GetInputDim(var_prm_name); + vshape.push_back(Ddim2Shape(dim)); + } else if (var_name_item.second.size() > 1) { + auto vdim = infer_shape_ctx.GetInputsDim(var_prm_name); + PADDLE_ENFORCE_EQ(vdim.size(), var_name_item.second.size(), + "Need dim info for each var"); + for (auto& dim : vdim) { + vshape.push_back(Ddim2Shape(dim)); + } + } else { + // 0 size : conv2d Bias + } + + for (size_t i = 0; i < var_name_item.second.size(); ++i) { + auto var_name = var_name_item.second.at(i); + if (std::find(var_in_.begin(), var_in_.end(), var_name) != + var_in_.end()) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto ng_type = var_type_map_.at(var_name); + auto prm = std::make_shared( + ng_type, vshape.at(i), true); + (*var_node_map_)[var_name] = prm; + (*var_in_node_map_)[var_name] = prm; + } + } + } + } +} + +void NgraphOperator::BuildNgNode() { + for (auto& var_name : var_out_) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto* var = scope_.FindVar(var_name); + if (var && VarIsTensor(*var)) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + auto ng_shape = Ddim2Shape(ddim); + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, ng_shape, true); + (*var_node_map_)[var_name] = prm; + } + } + } + + paddle::framework::NgraphBridge ngb(var_node_map_); + for (auto& op : fused_ops_) { + ngb.BuildNgGraph(op); + } +} + +void NgraphOperator::BuildNgIO() { + std::unordered_set inputs; + std::unordered_set outputs; + + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + inputs.insert(var_name); + const bool is_output = outputs.find(var_name) != outputs.end(); + if (!is_output && + std::find(var_in_.begin(), var_in_.end(), var_name) == + var_in_.end()) { + // fill var_in here to keep lhs and rhs order + var_in_.push_back(var_name); + } + } + } + + if (op->Type() != "fill_constant") { + GetNgInputShape(op); + } + + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + outputs.insert(var_name); + } + } + } + + // var_out.clear(); + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + switch (ng_op_state_) { + case PARTIAL_TEST: + if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || + fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case FULL_TEST: + if (fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case PARTIAL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + post_op_inputs_.find(var_name) != post_op_inputs_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + case FULL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + default: + var_out_.push_back(var_name); + } + } + } + } +} + +void NgraphOperator::BuildNgFunction() { + BuildNgNode(); + ngraph_function_ = nullptr; + ngraph::NodeVector func_outputs; + ngraph::op::ParameterVector func_inputs; + + for (auto& vo : var_out_) { + func_outputs.push_back(var_node_map_->at(vo)); + } + + for (auto& vi : var_in_) { + std::shared_ptr prm = + std::dynamic_pointer_cast( + var_in_node_map_->at(vi)); + func_inputs.push_back(prm); + } + + ngraph_function_ = + std::make_shared(func_outputs, func_inputs); +} + +std::shared_ptr NgraphOperator::GetCacheKey() { + auto cache_key = std::make_shared(""); + *cache_key += std::to_string(fused_ops_.size()); + for (auto& op : fused_ops_) { + *cache_key += op->Type(); + } + for (auto& var_name : var_in_) { + auto shape = var_node_map_->at(var_name)->get_shape(); + *cache_key += var_name; + *cache_key += var_type_map_.at(var_name).c_type_string(); + for (size_t i = 0; i < shape.size(); ++i) { + *cache_key += std::to_string(shape.at(i)); + } + } + + for (auto& var_name : var_out_) { + auto* var = scope_.FindVar(var_name); + if (var && VarIsTensor(*var)) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + for (int i = 0; i < ddim.size(); ++i) { + *cache_key += std::to_string(ddim[i]); + } + } + } + return cache_key; +} + +void NgraphOperator::GetNgFunction() { + bool cache_on = true; + if (cache_on) { + std::string cache_key_val = *GetCacheKey(); + if (func_cache_.find(cache_key_val) != func_cache_.end()) { + ngraph_function_ = func_cache_.at(cache_key_val); + } else { + BuildNgFunction(); + func_cache_[cache_key_val] = ngraph_function_; + } + } else { + BuildNgFunction(); + } +} + +void NgraphOperator::Run(const Scope& scope, + const platform::Place& place) const { + std::vector> t_in; + std::vector> t_out; + + for (size_t i = 0; i < var_in_.size(); ++i) { + auto vi = var_in_.at(i); + auto sp = var_node_map_->at(vi)->get_shape(); + std::shared_ptr ti; + auto* var = scope.FindVar(vi); + if (var && VarIsTensor(*var)) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), + "Ensure ngraph tensor layout align with paddle tensor"); + if (tensor_pd->type().hash_code() == + typeid(float).hash_code()) { // NOLINT + const float* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::f32, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == + typeid(int).hash_code()) { // NOLINT + const int* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::i32, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) { + const int64_t* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::i64, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == + typeid(double).hash_code()) { // NOLINT + const double* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::f64, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == + typeid(bool).hash_code()) { // NOLINT + const bool* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::boolean, sp, + const_cast(arr)); + } else { + PADDLE_THROW("Data type not handling for var %s", vi); + } + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vi); + } + bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST) + ? true + : false; + bool is_persistable = + (persistables_.find(vi) != persistables_.end()) ? true : false; + if (is_test && is_persistable) { + ti->set_stale(false); + } + t_in.push_back(ti); + } + + for (size_t i = 0; i < var_out_.size(); ++i) { + auto var_name = var_out_[i]; + auto* var = scope.FindVar(var_name); + std::shared_ptr to; + if (var && VarIsTensor(*var)) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + auto dd = tensor_pd->dims(); + ngraph::Shape sp = Ddim2Shape(dd); + auto ng_type = var_type_map_.at(var_name); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handled in for var %s", var_name); + } + t_out.push_back(to); + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", var_name); + } + } + + backend_->call(ngraph_function_, t_out, t_in); +} // NgraphOperator::RunImpl } // namespace framework } // namespace paddle #endif diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h index 0f655cef1d..3ca023e111 100644 --- a/paddle/fluid/framework/ngraph_operator.h +++ b/paddle/fluid/framework/ngraph_operator.h @@ -17,24 +17,19 @@ limitations under the License. */ #ifdef PADDLE_WITH_NGRAPH #include -#include #include #include #include #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/variant.h" -#include "ngraph/ngraph.hpp" +#include "ngraph/type/element_type.hpp" namespace paddle { namespace framework { From caf4b937b35aa3cb504ac0d6b76d663945b59cb7 Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Tue, 27 Nov 2018 11:02:10 -0800 Subject: [PATCH 025/719] Added RunInferShape test=develop --- paddle/fluid/framework/ngraph_operator.cc | 47 ++++++++--------------- paddle/fluid/framework/operator.cc | 8 +++- paddle/fluid/framework/operator.h | 6 +++ 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index e9ff051355..8878917da1 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -278,39 +278,22 @@ std::shared_ptr NgraphOperator::backend_ = ngraph::runtime::Backend::create("CPU"); void NgraphOperator::GetNgInputShape(std::shared_ptr op) { - RuntimeInferShapeContext infer_shape_ctx(*op, scope_); - std::shared_ptr op_k = - std::dynamic_pointer_cast(op); - op_k->InferShape(&infer_shape_ctx); - + op->RunInferShape(scope_, place_); for (auto& var_name_item : op->Inputs()) { - std::vector vshape; - auto& var_prm_name = var_name_item.first; - auto var_name_size = var_name_item.second.size(); - if (var_name_size == 1) { - auto dim = infer_shape_ctx.GetInputDim(var_prm_name); - vshape.push_back(Ddim2Shape(dim)); - } else if (var_name_item.second.size() > 1) { - auto vdim = infer_shape_ctx.GetInputsDim(var_prm_name); - PADDLE_ENFORCE_EQ(vdim.size(), var_name_item.second.size(), - "Need dim info for each var"); - for (auto& dim : vdim) { - vshape.push_back(Ddim2Shape(dim)); - } - } else { - // 0 size : conv2d Bias - } - - for (size_t i = 0; i < var_name_item.second.size(); ++i) { - auto var_name = var_name_item.second.at(i); - if (std::find(var_in_.begin(), var_in_.end(), var_name) != - var_in_.end()) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto ng_type = var_type_map_.at(var_name); - auto prm = std::make_shared( - ng_type, vshape.at(i), true); - (*var_node_map_)[var_name] = prm; - (*var_in_node_map_)[var_name] = prm; + for (auto& var_name : var_name_item.second) { + auto* var = scope_.FindVar(var_name); + if (var && VarIsTensor(*var)) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto sp = Ddim2Shape(tensor_pd->dims()); + if (std::find(var_in_.begin(), var_in_.end(), var_name) != + var_in_.end()) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, sp, true); + (*var_node_map_)[var_name] = prm; + (*var_in_node_map_)[var_name] = prm; + } } } } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8bfdf38912..a816aa94c0 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -355,7 +355,7 @@ void OperatorBase::GenerateTemporaryNames() { } } -static bool VarIsTensor(const Variable& var) { +bool VarIsTensor(const Variable& var) { return var.IsType() || var.IsType(); } @@ -695,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name, "Tensor %s contains NAN", name); } +void OperatorWithKernel::RunInferShape(const Scope& scope, + const platform::Place& place) const { + RuntimeInferShapeContext infer_shape_ctx(*this, scope); + this->InferShape(&infer_shape_ctx); +} + void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 5bd68f9ac2..fcf889f3db 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -64,6 +64,7 @@ inline std::string GradVarName(const std::string& var_name) { } proto::VarType::Type GetDataTypeOfVar(const Variable* var); +bool VarIsTensor(const Variable& var); const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var); Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var); @@ -128,6 +129,8 @@ class OperatorBase { virtual std::vector OutputVars(bool has_intermediate) const; void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } + virtual void RunInferShape(const Scope& scope, + const platform::Place& place) const {} protected: std::string type_; @@ -348,6 +351,9 @@ class OperatorWithKernel : public OperatorBase { OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); } + void RunInferShape(const Scope& scope, + const platform::Place& place) const override; + protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; virtual OpKernelType GetKernelTypeForVar( From d6125a5eec073c595f84dfbc987bd0daad6742c6 Mon Sep 17 00:00:00 2001 From: Sang Ik Lee Date: Mon, 19 Nov 2018 15:44:38 -0800 Subject: [PATCH 026/719] Include ngraph in inference demo build. test=develop --- cmake/inference_lib.cmake | 9 +++++++++ .../fluid/inference/api/demo_ci/CMakeLists.txt | 17 ++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 0b95a78072..c679d8507d 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -129,6 +129,15 @@ if (WITH_MKLDNN) ) endif () +if (WITH_NGRAPH) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph") + copy(ngraph_lib + SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR} + DSTS ${dst_dir} ${dst_dir} + DEPS ngraph + ) +endif () + if (NOT WIN32) if (NOT MOBILE_INFERENCE AND NOT RPI) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 8fb464c0f5..d58486c5f0 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -79,6 +79,21 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (NOT WIN32) + set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph") + if(EXISTS ${NGRAPH_PATH}) + include_directories("${NGRAPH_PATH}/include") + if(UNIX AND NOT APPLE) + include(GNUInstallDirs) + link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}") + set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + link_directories("${NGRAPH_PATH}/lib") + set(NGRAPH_LIB ${NGRAPH_PATH}/lib/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() + endif() +endif() + add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) if(WITH_MKL) @@ -106,7 +121,7 @@ endif() if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} + ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} glog gflags protobuf snappystream snappy z xxhash ${EXTERNAL_LIB}) else() From a29696146cb9a403700faa0662e0a7f3c589a79e Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Mon, 19 Nov 2018 16:31:16 -0800 Subject: [PATCH 027/719] Added annotation test=develop --- paddle/fluid/framework/ngraph_operator.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 8878917da1..99c4cf0da6 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -57,6 +57,7 @@ typedef enum { /* nGraph support state on ops */ PARTIAL_TEST /* Support partial list of ops for test */ } op_state; +// perform graph build through bridge and execute computation class NgraphOperator { public: explicit NgraphOperator(const Scope& scope, const platform::Place& place, @@ -100,33 +101,33 @@ class NgraphOperator { std::unordered_set post_op_inputs_; op_state ng_op_state_; + // ngraph backend eg. CPU static std::shared_ptr backend_; - + // ngraph function to call and execute std::shared_ptr ngraph_function_; // var_name of inputs std::vector var_in_; // var_name of outputs from fetch in order std::vector var_out_; - + // map input vars to nodes std::shared_ptr< std::unordered_map>> var_in_node_map_; - // map each var name with a ngraph node std::shared_ptr< std::unordered_map>> var_node_map_; - + // cache key to check if function is cached std::shared_ptr GetCacheKey(); - + // get ngraph input and define ngraph input parameters void GetNgInputShape(std::shared_ptr op); - + // Call ngraph bridge to map ops void BuildNgNode(); - + // get the ngraph input and output var list void BuildNgIO(); - + // build ngraph function call void BuildNgFunction(); - + // Check cache for ngraph function or otherwise build the function void GetNgFunction(); }; From 24e70920db7f034c4d6c7f85456dcf3115fbac6d Mon Sep 17 00:00:00 2001 From: Sang Ik Lee Date: Tue, 20 Nov 2018 15:47:49 -0800 Subject: [PATCH 028/719] Refactor some build settings. test=develop --- cmake/external/ngraph.cmake | 18 +++++---------- .../inference/api/demo_ci/CMakeLists.txt | 11 +++------ python/setup.py.in | 23 +++++++++++-------- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index 2e335579f3..e66459fa3a 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH}) return() ENDIF() +INCLUDE(GNUInstallDirs) + INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") @@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) +SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") +SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) +SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) +SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) ExternalProject_Add( ${NGRAPH_PROJECT} @@ -63,18 +69,6 @@ ExternalProject_Add( CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib ) -if(UNIX AND NOT APPLE) - include(GNUInstallDirs) - SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) -else() - SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib) -endif() -MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}") - -SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) -SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) -SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) - # Workaround for nGraph expecting mklml to be in mkldnn install directory. ExternalProject_Add_Step( ${NGRAPH_PROJECT} diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index d58486c5f0..ec93729cd2 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -82,15 +82,10 @@ link_directories("${PADDLE_LIB}/paddle/lib") if (NOT WIN32) set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph") if(EXISTS ${NGRAPH_PATH}) + include(GNUInstallDirs) include_directories("${NGRAPH_PATH}/include") - if(UNIX AND NOT APPLE) - include(GNUInstallDirs) - link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}") - set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX}) - else() - link_directories("${NGRAPH_PATH}/lib") - set(NGRAPH_LIB ${NGRAPH_PATH}/lib/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX}) - endif() + link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}") + set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() endif() diff --git a/python/setup.py.in b/python/setup.py.in index 200b96ec54..5aee26b638 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -165,9 +165,9 @@ if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path) package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name] -if '${CMAKE_BUILD_TYPE}' == 'Release': - # only change rpath in Release mode. - if '${WITH_MKLDNN}' == 'ON': +if '${WITH_MKLDNN}' == 'ON': + if '${CMAKE_BUILD_TYPE}' == 'Release': + # only change rpath in Release mode. # TODO(typhoonzero): use install_name_tool to patch mkl libs once # we can support mkl on mac. # @@ -177,14 +177,19 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" if os.system(command) != 0: raise Exception("patch libmkldnn.so failed, command: %s" % command) - package_data['paddle.libs']+=['libmkldnn.so.0'] - shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) + package_data['paddle.libs']+=['libmkldnn.so.0'] + shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) if '${WITH_NGRAPH}' == 'ON': + # only change rpath in Release mode, + # since in Debug mode, nGraph lib may be too large to be changed? if '${CMAKE_BUILD_TYPE}' == 'Release': - # only change rpath in Release mode. - command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}" - if os.system(command) != 0: - raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command) + if os.name != 'nt': + if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/\" ${NGRAPH_SHARED_LIB}" + else: + command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}" + if os.system(command) != 0: + raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command) shutil.copy('${NGRAPH_SHARED_LIB}', libs_path) shutil.copy('${NGRAPH_CPU_LIB}', libs_path) shutil.copy('${NGRAPH_TBB_LIB}', libs_path) From e6bd53be60676c8f73e1cc80705f6e599db1985c Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Tue, 27 Nov 2018 21:38:54 -0800 Subject: [PATCH 029/719] Named to RuntimeInferShape test=develop --- paddle/fluid/framework/ngraph_operator.cc | 2 +- paddle/fluid/framework/operator.cc | 4 ++-- paddle/fluid/framework/operator.h | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 99c4cf0da6..61bae1aba4 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -279,7 +279,7 @@ std::shared_ptr NgraphOperator::backend_ = ngraph::runtime::Backend::create("CPU"); void NgraphOperator::GetNgInputShape(std::shared_ptr op) { - op->RunInferShape(scope_, place_); + op->RuntimeInferShape(scope_, place_); for (auto& var_name_item : op->Inputs()) { for (auto& var_name : var_name_item.second) { auto* var = scope_.FindVar(var_name); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a816aa94c0..f3d225df69 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -695,8 +695,8 @@ static void CheckTensorNANOrInf(const std::string& name, "Tensor %s contains NAN", name); } -void OperatorWithKernel::RunInferShape(const Scope& scope, - const platform::Place& place) const { +void OperatorWithKernel::RuntimeInferShape(const Scope& scope, + const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); this->InferShape(&infer_shape_ctx); } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index fcf889f3db..efc9a1b6f5 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -129,8 +129,8 @@ class OperatorBase { virtual std::vector OutputVars(bool has_intermediate) const; void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } - virtual void RunInferShape(const Scope& scope, - const platform::Place& place) const {} + virtual void RuntimeInferShape(const Scope& scope, + const platform::Place& place) const {} protected: std::string type_; @@ -351,8 +351,8 @@ class OperatorWithKernel : public OperatorBase { OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); } - void RunInferShape(const Scope& scope, - const platform::Place& place) const override; + void RuntimeInferShape(const Scope& scope, + const platform::Place& place) const override; protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; From d5ee05e6c376929e416854f8864a672c6cc84958 Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Tue, 27 Nov 2018 23:06:17 -0800 Subject: [PATCH 030/719] Replaced VarIsTensor test=develop --- paddle/fluid/framework/ngraph_operator.cc | 10 +++++----- paddle/fluid/framework/operator.cc | 2 +- paddle/fluid/framework/operator.h | 1 - 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 61bae1aba4..1c770a2370 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -283,7 +283,7 @@ void NgraphOperator::GetNgInputShape(std::shared_ptr op) { for (auto& var_name_item : op->Inputs()) { for (auto& var_name : var_name_item.second) { auto* var = scope_.FindVar(var_name); - if (var && VarIsTensor(*var)) { + if (var && var->IsType()) { auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto sp = Ddim2Shape(tensor_pd->dims()); if (std::find(var_in_.begin(), var_in_.end(), var_name) != @@ -305,7 +305,7 @@ void NgraphOperator::BuildNgNode() { for (auto& var_name : var_out_) { if (var_node_map_->find(var_name) == var_node_map_->end()) { auto* var = scope_.FindVar(var_name); - if (var && VarIsTensor(*var)) { + if (var && var->IsType()) { auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto& ddim = tensor_pd->dims(); auto ng_shape = Ddim2Shape(ddim); @@ -433,7 +433,7 @@ std::shared_ptr NgraphOperator::GetCacheKey() { for (auto& var_name : var_out_) { auto* var = scope_.FindVar(var_name); - if (var && VarIsTensor(*var)) { + if (var && var->IsType()) { auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto& ddim = tensor_pd->dims(); for (int i = 0; i < ddim.size(); ++i) { @@ -469,7 +469,7 @@ void NgraphOperator::Run(const Scope& scope, auto sp = var_node_map_->at(vi)->get_shape(); std::shared_ptr ti; auto* var = scope.FindVar(vi); - if (var && VarIsTensor(*var)) { + if (var && var->IsType()) { auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), "Ensure ngraph tensor layout align with paddle tensor"); @@ -518,7 +518,7 @@ void NgraphOperator::Run(const Scope& scope, auto var_name = var_out_[i]; auto* var = scope.FindVar(var_name); std::shared_ptr to; - if (var && VarIsTensor(*var)) { + if (var && var->IsType()) { auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); auto dd = tensor_pd->dims(); ngraph::Shape sp = Ddim2Shape(dd); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f3d225df69..c6f3254e9f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -355,7 +355,7 @@ void OperatorBase::GenerateTemporaryNames() { } } -bool VarIsTensor(const Variable& var) { +static bool VarIsTensor(const Variable& var) { return var.IsType() || var.IsType(); } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index efc9a1b6f5..0a6a28a5bc 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -64,7 +64,6 @@ inline std::string GradVarName(const std::string& var_name) { } proto::VarType::Type GetDataTypeOfVar(const Variable* var); -bool VarIsTensor(const Variable& var); const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var); Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var); From 0cc9ab3dc2ee2974916703a872d33ba59bdad713 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 26 Nov 2018 17:00:26 +0800 Subject: [PATCH 031/719] enable API check for readers test=develop --- paddle/fluid/API.spec | 17 +++++++++++++++++ paddle/scripts/paddle_build.sh | 2 +- tools/print_signatures.py | 8 +++++--- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index c40f603341..092008aa86 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -411,3 +411,20 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) +paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) +paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) +paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None) +paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None) +paddle.reader.ComposeNotAligned.__init__ +paddle.reader.ComposeNotAligned.args.__init__ +paddle.reader.ComposeNotAligned.message.__init__ +paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None) +paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)) +paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')) +paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')) +paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)) +paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a6720fa798..63707245e5 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -454,7 +454,7 @@ function assert_api_not_changed() { virtualenv .env source .env/bin/activate pip install ${PADDLE_ROOT}/build/python/dist/*whl - python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec + python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid,paddle.reader > new.spec if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' new.spec diff --git a/tools/print_signatures.py b/tools/print_signatures.py index e2805c4e7e..83f6434dfd 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -15,7 +15,7 @@ Print all signature of a python module in alphabet order. Usage: - ./print_signature "paddle.fluid" > signature.txt + ./print_signature "paddle.fluid,paddle.reader" > signature.txt """ from __future__ import print_function @@ -30,7 +30,7 @@ member_dict = collections.OrderedDict() def visit_member(parent_name, member): cur_name = ".".join([parent_name, member.__name__]) - if inspect.isclass(member): + if inspect.isclass(member) or inspect.isgetsetdescriptor(member): for name, value in inspect.getmembers(member): if hasattr(value, '__name__') and (not name.startswith("_") or name == "__init__"): @@ -63,7 +63,9 @@ def visit_all_module(mod): visit_member(mod.__name__, instance) -visit_all_module(importlib.import_module(sys.argv[1])) +modules = sys.argv[1].split(",") +for m in modules: + visit_all_module(importlib.import_module(m)) for name in member_dict: print(name, member_dict[name]) From ad6ed5b745dbaa64dc6ba3c48443adbc63c50aa7 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 28 Nov 2018 18:58:57 +0800 Subject: [PATCH 032/719] fix py3 test=develop --- paddle/fluid/API.spec | 2 -- paddle/scripts/paddle_build.sh | 1 + tools/print_signatures.py | 5 +++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 092008aa86..5e0756e539 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -417,8 +417,6 @@ paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', def paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None) paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None) paddle.reader.ComposeNotAligned.__init__ -paddle.reader.ComposeNotAligned.args.__init__ -paddle.reader.ComposeNotAligned.message.__init__ paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None) paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)) paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 63707245e5..32de51afb2 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -459,6 +459,7 @@ function assert_api_not_changed() { # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' new.spec sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec + sed -i 's/\(.*ComposeNotAligned.*\).__init__ ArgSpec(args=.*/\1.__init__ /g' new.spec fi python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec deactivate diff --git a/tools/print_signatures.py b/tools/print_signatures.py index 83f6434dfd..5c5266f904 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -30,7 +30,7 @@ member_dict = collections.OrderedDict() def visit_member(parent_name, member): cur_name = ".".join([parent_name, member.__name__]) - if inspect.isclass(member) or inspect.isgetsetdescriptor(member): + if inspect.isclass(member): for name, value in inspect.getmembers(member): if hasattr(value, '__name__') and (not name.startswith("_") or name == "__init__"): @@ -43,7 +43,8 @@ def visit_member(parent_name, member): line.strip() for line in pydoc.render_doc(member).split('\n') if "->" in line ]) - + elif inspect.isgetsetdescriptor(member): + return else: raise RuntimeError("Unsupported generate signature of member, type {0}". format(str(type(member)))) From 75939c20593465196c4225cb927d61442dab2273 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 29 Nov 2018 13:33:28 +0800 Subject: [PATCH 033/719] fix test=develop --- paddle/fluid/API.spec | 1 - paddle/scripts/paddle_build.sh | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 5e0756e539..38acd6e15f 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -416,7 +416,6 @@ paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=N paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None) paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None) -paddle.reader.ComposeNotAligned.__init__ paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None) paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)) paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 32de51afb2..24e488afba 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -459,7 +459,8 @@ function assert_api_not_changed() { # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' new.spec sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec - sed -i 's/\(.*ComposeNotAligned.*\).__init__ ArgSpec(args=.*/\1.__init__ /g' new.spec + # ComposeNotAligned has significant different between py2 and py3 + sed -i '/.*ComposeNotAligned.*/d' new.spec fi python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec deactivate From da7dfa5d95563ae09d3acd3d9c0e7a359ce8219d Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 29 Nov 2018 09:05:23 +0000 Subject: [PATCH 034/719] add mac ci check on import --- paddle/scripts/paddle_build.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index dbb73f7a27..bdc4210640 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -442,6 +442,7 @@ EOF make install -j 8 if [ "$1" == "cp27-cp27m" ]; then pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + python -c "import paddle.fluid" elif [ "$1" == "cp35-cp35m" ]; then pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl elif [ "$1" == "cp36-cp36m" ]; then @@ -449,7 +450,7 @@ EOF elif [ "$1" == "cp37-cp37m" ]; then pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi - + if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi From 412425379669ddc278e6c3f5f7f5dd7c884de1e8 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 29 Nov 2018 09:05:23 +0000 Subject: [PATCH 035/719] add mac ci check on import, test=develop --- paddle/scripts/paddle_build.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index dbb73f7a27..bdc4210640 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -442,6 +442,7 @@ EOF make install -j 8 if [ "$1" == "cp27-cp27m" ]; then pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + python -c "import paddle.fluid" elif [ "$1" == "cp35-cp35m" ]; then pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl elif [ "$1" == "cp36-cp36m" ]; then @@ -449,7 +450,7 @@ EOF elif [ "$1" == "cp37-cp37m" ]; then pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi - + if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi From d1a17cadd48be68a3abde522e6dc94a608db207d Mon Sep 17 00:00:00 2001 From: phlrain Date: Thu, 29 Nov 2018 17:17:52 +0800 Subject: [PATCH 036/719] fix cudnn rnn; test=develop --- paddle/fluid/operators/cudnn_lstm_op.cc | 38 +++++++---- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 9 ++- paddle/fluid/operators/cudnn_lstm_op.h | 7 ++- python/paddle/fluid/layers/nn.py | 73 ++++++++++++---------- 4 files changed, 78 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index cadc5b8830..c73c64f4a8 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -122,13 +122,11 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { "The will affect the shape of the Out, last_h, and last_c") .SetDefault(false); AddAttr("input_size", "input size ot the Input Tensor").SetDefault(10); - AddAttr("batch_size", "the instance number the batch").SetDefault(10); AddAttr("hidden_size", "hidden size of the LSTM").SetDefault(100); AddAttr("num_layers", "the total layer number of the LSTM") .SetDefault(1); AddAttr("is_test", "True if in test phase.").SetDefault(false); - AddAttr("fix_seed", "True if it fix dropout seed").SetDefault(false); - AddAttr("seed", "seed to used if fix_seed is True").SetDefault(0); + AddAttr("seed", "seed to used if fix_seed is True").SetDefault(-1); AddComment(R"DOC( CUDNN LSTM implementation @@ -136,16 +134,32 @@ A four-gate Long Short-Term Memory network with no peephole connections. In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: -it = σ(Wi X xt + Ri X ht-1 + bWi + bRi) -ft = σ(Wf X xt + Rf X ht-1 + bWf + bRf) -ot = σ(Wo X xt + Ro X ht-1 + bWo + bRo) -c't = tanh(Wc X xt + Rc X ht-1 + bWc + bRc) -ct = ft * ct-1 + it * c't -ht = ot * tanh(ct) +$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ -Where σ is the sigmoid operator: σ(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, +$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ + +$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ + +$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ + +$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ + +$$ h_t = o_t \\odot tanh(c_t) $$ + +- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix + of weights from the input gate to the input) +- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). +- sigmoid is the logistic sigmoid function. +- $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. +- The $\odot$ is the element-wise product of the vectors. +- `tanh` is the activation functions. +- $\tilde{c_t}$ is also called candidate hidden state, + which is computed based on the current input and the previous hidden state. + +Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, X represensts a matrix multiplication -and tanh is the hyperbolic tangent function. it, ft, ot, c't represent the input, forget, output and new gates respectively. )DOC"); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 9caf65b53f..cadd3772af 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -273,7 +273,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { size_t max_len = ctx.Attr("max_len"); float dropout_prob = ctx.Attr("dropout_prob"); bool is_bidirec = ctx.Attr("is_bidirec"); - int batch_size = ctx.Attr("batch_size"); int input_size = ctx.Attr("input_size"); int hidden_size = ctx.Attr("hidden_size"); int num_layers = ctx.Attr("num_layers"); @@ -304,9 +303,13 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { cudnn_rnn_cache = const_cast(cache_var) ->GetMutable(); std::random_device rnd; - int seed = ctx.Attr("fix_seed") ? ctx.Attr("seed") : rnd(); + int seed = ctx.Attr("seed"); + if (seed == -1) { + seed = rnd(); + } auto input_w_numel = w->numel(); + auto batch_size = x->dims()[1]; cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size, hidden_size, num_layers, dropout_prob, is_bidirec, seed, input_w_numel); diff --git a/paddle/fluid/operators/cudnn_lstm_op.h b/paddle/fluid/operators/cudnn_lstm_op.h index fb4b37e46e..fc329cc239 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.h +++ b/paddle/fluid/operators/cudnn_lstm_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -29,7 +29,10 @@ using Tensor = framework::Tensor; template class CudnnLSTMKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override {} + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW( + "CPU is not support for this kernel now. Will be add in the future"); + } }; template diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 30d108bbe8..fa2215f9f5 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -169,7 +169,7 @@ __all__ = [ 'log_loss', 'add_position_encoding', 'bilinear_tensor_product', - 'cudnn_lstm', + 'lstm', ] @@ -467,39 +467,53 @@ def dynamic_lstm(input, return hidden, cell -def cudnn_lstm(input, - init_h, - init_c, - batch_size, - max_len, - dropout_prob, - input_size, - hidden_size, - num_layers, - is_bidirec=False, - dtype='float32', - is_test=False, - name=None, - default_initializer=None, - fix_seed=False, - seed=0): +def lstm(input, + init_h, + init_c, + max_len, + dropout_prob, + input_size, + hidden_size, + num_layers, + is_bidirec=False, + dtype='float32', + is_test=False, + name=None, + default_initializer=None, + seed=-1): """ - CUDNN LSTM implementation + If Device is GPU, This op will use cudnn LSTM implementation A four-gate Long Short-Term Memory network with no peephole connections. In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: - it = sigmoid(Wi X xt + Ri X ht-1 + bWi + bRi) - ft = sigmoid(Wf X xt + Rf X ht-1 + bWf + bRf) - ot = sigmoid(Wo X xt + Ro X ht-1 + bWo + bRo) - c't = tanh(Wc X xt + Rc X ht-1 + bWc + bRc) - ct = ft * ct-1 + it * c't - ht = ot * tanh(ct) + $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ + + $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ + + $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ + + $$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ + + $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ + + $$ h_t = o_t \\odot tanh(c_t) $$ + + - W terms denote weight matrices (e.g. $W_{ix}$ is the matrix + of weights from the input gate to the input) + - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). + - sigmoid is the logistic sigmoid function. + - $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. + - The $\odot$ is the element-wise product of the vectors. + - `tanh` is the activation functions. + - $\tilde{c_t}$ is also called candidate hidden state, + which is computed based on the current input and the previous hidden state. Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, X represensts a matrix multiplication - and tanh is the hyperbolic tangent function. it, ft, ot, c't represent the input, forget, output and new gates respectively. Args: @@ -510,7 +524,6 @@ def cudnn_lstm(input, init_c(Variable): The initial cell state of the LSTM. This is a tensor with shape ( num_layers x batch_size x hidden_size ) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) - batch_size (int): total distance numer of the batch max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len dropout_prob(float): dropout prob, dropout ONLY work between rnn layers, NOT between time steps There is NO dropout work on rnn output of the last RNN layers @@ -524,9 +537,7 @@ def cudnn_lstm(input, will be named automatically. default_initializer(Initialize|None): Where use initializer to initialize the Weight If set None, defaule initializer will be used - - fix_seed(bool): If it's True, fix seed will used for dropout in LSTM - seed(int): If fix_seed is True, dropout seed in LSTM will use this seed + seed(int): Seed for dropout in LSTM, If it's -1, dropout will use random seed Returns: @@ -553,7 +564,7 @@ def cudnn_lstm(input, init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False) init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False) - rnn_out, last_h, last_c = layers.cudnn_lstm( input, init_h, init_c, batch_size, \ + rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c, \ max_len, dropout_prob, input_size, hidden_size, \ num_layers) """ @@ -610,12 +621,10 @@ def cudnn_lstm(input, 'max_len': max_len, 'is_bidirec': is_bidirec, 'input_size': input_size, - 'batch_size': batch_size, 'hidden_size': hidden_size, 'num_layers': num_layers, 'is_test': is_test, 'dropout_prob': dropout_prob, - 'fix_seed': fix_seed, 'seed': seed, }) return out, last_h, last_c From 4b9689379f690aebc6b0d953b998b3ab2dcd3f81 Mon Sep 17 00:00:00 2001 From: phlrain Date: Thu, 29 Nov 2018 17:19:47 +0800 Subject: [PATCH 037/719] fix cudnn lstm; test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index c27078a6e4..dd9ff7cb7b 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -187,7 +187,7 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.layers.cudnn_lstm ArgSpec(args=['input', 'init_h', 'init_c', 'batch_size', 'max_len', 'dropout_prob', 'input_size', 'hidden_size', 'num_layers', 'is_bidirec', 'dtype', 'is_test', 'name', 'default_initializer', 'fix_seed', 'seed'], varargs=None, keywords=None, defaults=(False, 'float32', False, None, None, False, 0)) +paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'dropout_prob', 'input_size', 'hidden_size', 'num_layers', 'is_bidirec', 'dtype', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(False, 'float32', False, None, None, False, 0)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) From 589b863b986b4ab3bae4c572c89a201df4a3edc7 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 29 Nov 2018 17:20:20 +0800 Subject: [PATCH 038/719] Add EstiminateFlops test=develop --- paddle/fluid/framework/details/op_registry.h | 20 +++++++++++++++++--- paddle/fluid/framework/op_info.h | 7 +++++++ paddle/fluid/framework/type_defs.h | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index eea7e712f8..1ce18c3d6b 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -32,7 +32,9 @@ enum OpInfoFillType { kOpProtoAndCheckerMaker = 1, kGradOpDescMaker = 2, kVarTypeInference = 3, - kShapeInference = 4 + kShapeInference = 4, + kEstimateFlops = 5, + kUnknown = -1 }; template @@ -48,8 +50,10 @@ struct OpInfoFillTypeID { ? kVarTypeInference : (std::is_base_of::value ? kShapeInference - : static_cast( - -1))))); + : (std::is_base_of::value + ? kEstimateFlops + : kUnknown))))); } }; @@ -139,6 +143,16 @@ struct OpInfoFiller { } }; +template +struct OpInfoFiller { + void operator()(const char* op_tpe, OpInfo* info) const { + info->estimate_flops_ = [](InferShapeContext* ctx) { + T estimate_flops; + return estimate_flops(ctx); + }; + } +}; + } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index 19e5c2c73e..e0bf5ed999 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -31,6 +31,12 @@ class InferShapeBase { virtual void operator()(InferShapeContext*) const = 0; }; +class EstimateFlopsBase { + public: + virtual ~EstimateFlopsBase() = default; + virtual size_t operator()(InferShapeContext*) const = 0; +}; + struct OpInfo { OpCreator creator_; GradOpMakerFN grad_op_maker_; @@ -38,6 +44,7 @@ struct OpInfo { OpAttrChecker* checker_{nullptr}; InferVarTypeFN infer_var_type_; InferShapeFN infer_shape_; + EstimateFlopsFN estimate_flops_; bool HasOpProtoAndChecker() const { return proto_ != nullptr && checker_ != nullptr; diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 2de6233a9e..cdc5fa6862 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -54,5 +54,7 @@ using InferVarTypeFN = using InferShapeFN = std::function; +using EstimateFlopsFN = std::function; + } // namespace framework } // namespace paddle From 5db273d8740791622f123fe8c4a6bc3eef4f934d Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 29 Nov 2018 12:05:25 +0800 Subject: [PATCH 039/719] enhance HasAttr to fix ci test=develop --- paddle/fluid/framework/op_desc.cc | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index a31c5336a1..ce7ba96730 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -238,11 +238,21 @@ void OpDesc::SetOutput(const std::string ¶m_name, } bool OpDesc::HasAttr(const std::string &name) const { - const proto::OpProto &proto = OpInfoMap::Instance().Get(desc_.type()).Proto(); - for (int i = 0; i != proto.attrs_size(); ++i) { - const proto::OpProto::Attr &attr = proto.attrs(i); - if (attr.name() == name) { - return true; + if (attrs_.find(name) != attrs_.end()) { + return true; + } else { + auto &op_info = OpInfoMap::Instance(); + if (op_info.Has(desc_.type())) { + auto op_info_ptr = op_info.Get(desc_.type()); + if (op_info_ptr.HasOpProtoAndChecker()) { + const proto::OpProto &proto = op_info_ptr.Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return true; + } + } + } } } return false; From 40f1c4a6f0f76d3511625d1ccd9352eea83f5144 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 29 Nov 2018 18:53:39 +0800 Subject: [PATCH 040/719] fix test=develop --- paddle/scripts/paddle_build.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 24e488afba..aa2d2e0db1 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -459,9 +459,10 @@ function assert_api_not_changed() { # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' new.spec sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec - # ComposeNotAligned has significant different between py2 and py3 - sed -i '/.*ComposeNotAligned.*/d' new.spec fi + # ComposeNotAligned has significant difference between py2 and py3 + sed -i '/.*ComposeNotAligned.*/d' new.spec + python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec deactivate } From 92f5be1d82b1d6994ad18ac09f3e926bd1b631c4 Mon Sep 17 00:00:00 2001 From: phlrain Date: Thu, 29 Nov 2018 19:04:38 +0800 Subject: [PATCH 041/719] remove inputvarname in operator; test=develop --- paddle/fluid/framework/operator.h | 8 -------- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 7289a451e5..5bd68f9ac2 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -174,14 +174,6 @@ class ExecutionContext { return op_.Inputs(name).size(); } - const std::string InputVarName(const std::string& name) const { - return op_.Input(name); - } - - const std::string OutputVarName(const std::string& name) const { - return op_.Output(name); - } - size_t OutputSize(const std::string& name) const { return op_.Outputs(name).size(); } diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index cadd3772af..811975a9f3 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -292,7 +292,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { // multi-devices before the first running. // use parent scope to make cache persistable auto *scope = const_cast(ctx.scope().parent()); - auto cache_var_name = ctx.InputVarName("Cache"); + auto cache_var_name = ctx.Inputs("Cache")[0]; cache_var = scope->Var(cache_var_name); } CudnnRNNCache *cudnn_rnn_cache = nullptr; From bd94ab0ef3e172fab40aa316914dbae8868c9f41 Mon Sep 17 00:00:00 2001 From: phlrain Date: Thu, 29 Nov 2018 19:21:11 +0800 Subject: [PATCH 042/719] rename op; test=develop --- paddle/fluid/operators/{cudnn_lstm_op.cc => lstm_cudnn_op.cc} | 2 +- .../operators/{cudnn_lstm_op.cu.cc => lstm_cudnn_op.cu.cc} | 2 +- paddle/fluid/operators/{cudnn_lstm_op.h => lstm_cudnn_op.h} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename paddle/fluid/operators/{cudnn_lstm_op.cc => lstm_cudnn_op.cc} (99%) rename paddle/fluid/operators/{cudnn_lstm_op.cu.cc => lstm_cudnn_op.cu.cc} (99%) rename paddle/fluid/operators/{cudnn_lstm_op.h => lstm_cudnn_op.h} (100%) diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/lstm_cudnn_op.cc similarity index 99% rename from paddle/fluid/operators/cudnn_lstm_op.cc rename to paddle/fluid/operators/lstm_cudnn_op.cc index c73c64f4a8..ca60fb4b0b 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/lstm_cudnn_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/cudnn_lstm_op.h" +#include "paddle/fluid/operators/lstm_cudnn_op.h" #include #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/lstm_cudnn_op.cu.cc similarity index 99% rename from paddle/fluid/operators/cudnn_lstm_op.cu.cc rename to paddle/fluid/operators/lstm_cudnn_op.cu.cc index 811975a9f3..7a67bbe539 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/lstm_cudnn_op.cu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/cudnn_lstm_op.h" +#include "paddle/fluid/operators/lstm_cudnn_op.h" #include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { diff --git a/paddle/fluid/operators/cudnn_lstm_op.h b/paddle/fluid/operators/lstm_cudnn_op.h similarity index 100% rename from paddle/fluid/operators/cudnn_lstm_op.h rename to paddle/fluid/operators/lstm_cudnn_op.h From c856ac8721d6ce41e4cb89c9d6fb369fb0f8d739 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Thu, 29 Nov 2018 20:30:44 +0800 Subject: [PATCH 043/719] add OpHasAttr in node.h, update is_test_pass and mkldnn_placement_pass test=develop --- paddle/fluid/framework/ir/is_test_pass.cc | 2 +- .../fluid/framework/ir/is_test_pass_tester.cc | 4 +-- .../framework/ir/mkldnn_placement_pass.cc | 2 +- paddle/fluid/framework/ir/node.cc | 26 ++++++++++++++++++- paddle/fluid/framework/ir/node.h | 2 ++ 5 files changed, 31 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 292f232ffc..a61bd5f291 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -38,7 +38,7 @@ std::unique_ptr IsTestPass::ApplyImpl( for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); - if (op->HasAttr("is_test")) { + if (n->OpHasAttr("is_test")) { op->SetAttr("is_test", true); } else if (std::find(begin(op_list), end(op_list), op->Type()) != end(op_list)) { diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index 9696441a21..a5fb0abb3c 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -104,9 +104,9 @@ TEST(IsTestPass, basic) { auto* op = node->Op(); auto op_name = boost::get(op->GetAttr("name")); if (op_name == "conv3") { - ASSERT_FALSE(op->HasAttr("is_test")); + ASSERT_FALSE(node->OpHasAttr("is_test")); } else { - ASSERT_TRUE(op->HasAttr("is_test")); + ASSERT_TRUE(node->OpHasAttr("is_test")); EXPECT_TRUE(boost::get(op->GetAttr("is_test"))); } } diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc index 65be69b7f5..366057b01e 100644 --- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc @@ -22,7 +22,7 @@ std::unique_ptr MKLDNNPlacementPass::ApplyImpl( std::unique_ptr graph) const { VLOG(3) << "Aplies MKL-DNN placement strategy."; for (const Node* n : graph->Nodes()) { - if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) { + if (n->IsOp() && n->OpHasAttr("use_mkldnn")) { n->Op()->SetAttr("use_mkldnn", true); } } diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 50d9113088..4c4da10b04 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_info.h" namespace paddle { namespace framework { @@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[]; const char Node::kControlDepVarName[] = "__control_var"; #endif -std::unique_ptr CreateNodeForTest(const std::string& name, +std::unique_ptr CreateNodeForTest(const std::string &name, Node::Type type) { return std::unique_ptr(new Node(name, type)); } + +bool Node::OpHasAttr(const std::string &name) const { + if (Op()->HasAttr(name)) { + return true; + } else { + auto &op_info = OpInfoMap::Instance(); + auto op_type = Op()->Type(); + if (op_info.Has(op_type)) { + auto op_info_ptr = op_info.Get(op_type); + if (op_info_ptr.HasOpProtoAndChecker()) { + const proto::OpProto &proto = op_info_ptr.Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return true; + } + } + } + } + } + return false; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d2a393b3f1..ac08006a49 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -108,6 +108,8 @@ class Node { Name().find(ir::Node::kControlDepVarName) != std::string::npos; } + bool OpHasAttr(const std::string& name) const; + std::vector inputs; std::vector outputs; From 096673f67527b0fed1aab1843041b9d929fd0fb5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 29 Nov 2018 13:20:29 +0000 Subject: [PATCH 044/719] refactor eager deletion test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 12 +- .../details/computation_op_handle.cc | 6 +- .../framework/details/computation_op_handle.h | 6 +- .../details/eager_deletion_op_handle.cc | 117 ++++++++++ .../details/eager_deletion_op_handle.h | 64 ++++++ .../framework/details/eager_deletion_pass.cc | 96 ++++++++ .../framework/details/eager_deletion_pass.h | 32 +++ .../details/multi_devices_graph_pass.cc | 6 +- .../details/reference_count_op_handle.h | 138 ------------ .../framework/details/reference_count_pass.cc | 213 +++++------------- .../framework/details/reference_count_pass.h | 5 - .../details/reference_count_pass_helper.h | 49 ++++ .../scope_buffered_ssa_graph_executor.cc | 30 +-- .../scope_buffered_ssa_graph_executor.h | 4 + paddle/fluid/framework/garbage_collector.h | 12 +- paddle/fluid/framework/ir/graph.h | 11 +- paddle/fluid/framework/ir/pass.h | 11 +- paddle/fluid/framework/parallel_executor.cc | 106 ++++++--- paddle/fluid/framework/parallel_executor.h | 24 +- paddle/fluid/platform/CMakeLists.txt | 9 +- .../fluid/platform/stream_callback_manager.cc | 70 ++++++ .../fluid/platform/stream_callback_manager.h | 51 +---- 22 files changed, 631 insertions(+), 441 deletions(-) create mode 100644 paddle/fluid/framework/details/eager_deletion_op_handle.cc create mode 100644 paddle/fluid/framework/details/eager_deletion_op_handle.h create mode 100644 paddle/fluid/framework/details/eager_deletion_pass.cc create mode 100644 paddle/fluid/framework/details/eager_deletion_pass.h delete mode 100644 paddle/fluid/framework/details/reference_count_op_handle.h create mode 100644 paddle/fluid/framework/details/reference_count_pass_helper.h create mode 100644 paddle/fluid/platform/stream_callback_manager.cc diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 93288936fe..8cf97d667d 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -33,10 +33,9 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -if (WITH_GPU) - cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle - all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) -endif() +cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) +cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) @@ -44,10 +43,7 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass) -if (WITH_GPU) - list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) -endif() +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7ad1e40c60..7beb8c8de9 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -20,11 +20,13 @@ namespace paddle { namespace framework { namespace details { ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, - platform::Place place) + platform::Place place, + size_t scope_idx) : OpHandleBase(node), op_(framework::OpRegistry::CreateOp(*node->Op())), scope_(scope), - place_(place) {} + place_(place), + scope_idx_(scope_idx) {} void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 662a91d6b4..601ae4f8c6 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -28,7 +28,8 @@ namespace framework { namespace details { struct ComputationOpHandle : public OpHandleBase { public: - ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place); + ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, + size_t scope_idx); std::string Name() const override; @@ -38,6 +39,8 @@ struct ComputationOpHandle : public OpHandleBase { void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; } + size_t GetScopeIdx() const { return scope_idx_; } + protected: void RunImpl() override; @@ -47,6 +50,7 @@ struct ComputationOpHandle : public OpHandleBase { std::unique_ptr op_; Scope *scope_; platform::Place place_; + size_t scope_idx_; bool is_lock_and_record_event_free_{false}; }; } // namespace details diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc new file mode 100644 index 0000000000..cd26203376 --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace framework { +namespace details { + +EagerDeletionOpHandle::EagerDeletionOpHandle( + ir::Node *node, const Scope *scope, const platform::Place &place, + const std::vector &var_names, GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts) + : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + dev_ctx_ = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + if (dynamic_cast *>(gc_)) { + platform::SetDeviceId(boost::get(place).device); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + } +#endif + + for (auto &name : var_names) AddVar(name); +} + +EagerDeletionOpHandle::~EagerDeletionOpHandle() { +#ifdef PADDLE_WITH_CUDA + if (event_) { + auto gpu_place = boost::get(dev_ctx_->GetPlace()); + platform::SetDeviceId(gpu_place.device); + PADDLE_ENFORCE(cudaEventDestroy(event_)); + } +#endif +} + +std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } + +void EagerDeletionOpHandle::AddVar(const std::string &name) { + var_names_.insert(name); +} + +void EagerDeletionOpHandle::RunImpl() { + auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + std::vector tensors; + for (auto &name : var_names_) { + auto it = ref_cnts_->find(name); + if (it == ref_cnts_->end()) { + continue; + } + + auto *var = exec_scope->FindVar(name); + if (var == nullptr) { + continue; + } + + if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + tensors.emplace_back(var->GetMutable()); + } + } else if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + tensors.emplace_back(var->GetMutable()->mutable_value()); + } + } else if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + auto *tensor_arr = var->GetMutable(); + for (auto &t : *tensor_arr) { + tensors.emplace_back(&t); + } + } + } + } + + if (!tensors.empty()) { + ClearTensors(tensors); + } +} + +void EagerDeletionOpHandle::ClearTensors(const std::vector &tensors) { +#ifdef PADDLE_WITH_CUDA + if (event_) { + auto compute_stream = dev_ctx_->stream(); + auto callback_stream = + static_cast *>(gc_)->stream(); + auto callback_func = [=]() { + PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + }; + gc_->Add(tensors, callback_func); + } else { +#endif + gc_->Add(tensors); +#ifdef PADDLE_WITH_CUDA + } +#endif +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h new file mode 100644 index 0000000000..8254f21bdf --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace details { + +class EagerDeletionPass; + +class EagerDeletionOpHandle : public OpHandleBase { + public: + EagerDeletionOpHandle(ir::Node *node, const Scope *scope, + const platform::Place &place, + const std::vector &var_names, + GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts); + + ~EagerDeletionOpHandle(); + + std::string Name() const override; + + protected: + void RunImpl() override; + + private: + void ClearTensors(const std::vector &tensors); + + void AddVar(const std::string &name); + + const Scope *scope_; + std::unordered_set var_names_; + GarbageCollector *gc_; // not own + AtomicReferenceCountMap *ref_cnts_; // not own +#ifdef PADDLE_WITH_CUDA + platform::CUDADeviceContext *dev_ctx_{nullptr}; + cudaEvent_t event_{nullptr}; +#endif + + friend class EagerDeletionPass; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc new file mode 100644 index 0000000000..f877c2881c --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, + ir::Graph *graph) { + auto it = std::find_if( + in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { + return dynamic_cast(var) != nullptr; + }); + + if (it != in->Outputs().end()) { + out->AddInput(*it); + } else { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + in->AddOutput(dep_var); + out->AddInput(dep_var); + } + + // Add leaf node to eager_deletion_node + if (out->Outputs().empty()) { + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dummy_leaf); + out->AddOutput(dummy_leaf); + } +} + +std::unique_ptr EagerDeletionPass::ApplyImpl( + std::unique_ptr graph) const { + auto &vars = graph->Get(kGraphVars); + + auto &ref_cnts = + Get>(kCurReferenceCount); + auto &last_live_ops = Get>(kLastLiveOpsOfVars); + auto &gcs = Get(kGarbageCollector); + + ref_cnts = std::vector(vars.size()); + + std::unordered_map op_map; + for (auto &var_ops_map : last_live_ops) { + for (auto &var_ops_pair : var_ops_map) { + const std::string &var_name = var_ops_pair.first; + for (ComputationOpHandle *op : var_ops_pair.second) { + auto it = op_map.find(op); + if (it != op_map.end()) { + it->second->AddVar(var_name); + } else { + auto *eager_deletion_node = graph->CreateEmptyNode( + "eager_deletion", ir::Node::Type::kOperation); + auto *eager_deletion_op = new EagerDeletionOpHandle( + eager_deletion_node, op->GetScope(), op->GetPlace(), {var_name}, + gcs[op->GetScopeIdx()].get(), &(ref_cnts[op->GetScopeIdx()])); + AddDependencyBetween(op, eager_deletion_op, graph.get()); + op_map[op] = eager_deletion_op; + } + } + } + } + VLOG(10) << "Create " << op_map.size() << " EagerDeletionOpHandle(s)"; + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(eager_deletion_pass, + paddle::framework::details::EagerDeletionPass) + .RequirePassAttr(paddle::framework::details::kCurReferenceCount) + .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/eager_deletion_pass.h b/paddle/fluid/framework/details/eager_deletion_pass.h new file mode 100644 index 0000000000..d7a7a9709d --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class EagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index a36ad25926..97830386e4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -562,7 +562,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, int dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), - local_scopes_[dev_id], places_[dev_id])); + local_scopes_[dev_id], places_[dev_id], dev_id)); CreateOpHandleIOs(result, node, dev_id); } @@ -685,8 +685,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; - result->Get(kGraphOps).emplace_back( - new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p)); + result->Get(kGraphOps).emplace_back(new ComputationOpHandle( + result->CreateOpNode(node->Op()), s, p, scope_idx)); CreateOpHandleIOs(result, node, scope_idx); } } diff --git a/paddle/fluid/framework/details/reference_count_op_handle.h b/paddle/fluid/framework/details/reference_count_op_handle.h deleted file mode 100644 index cc4ccfbdfc..0000000000 --- a/paddle/fluid/framework/details/reference_count_op_handle.h +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace framework { -namespace details { - -using ReferenceCountMap = std::unordered_map; -using AtomicReferenceCountMap = - std::unordered_map>; -using DeviceReferenceCountMap = - std::unordered_map>; -using AtomicDeviceReferenceCountMap = - std::unordered_map>; -using DeviceGarbageCollectorMap = - std::unordered_map>>; - -class ReferenceCountOpHandle : public OpHandleBase { - public: - ReferenceCountOpHandle(ir::Node *node, const Scope *scope, - const platform::CUDAPlace &place, - const std::vector &var_names, - GarbageCollector *gc, - AtomicReferenceCountMap *ref_cnts) - : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { - dev_ctx_ = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - if (IsStreamGarabageCollector()) { - platform::SetDeviceId(place.device); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - for (auto &name : var_names) AddVar(name); - } - - ~ReferenceCountOpHandle() { - if (IsStreamGarabageCollector()) { - auto gpu_place = boost::get(dev_ctx_->GetPlace()); - platform::SetDeviceId(gpu_place.device); - PADDLE_ENFORCE(cudaEventDestroy(event_)); - } - } - - std::string Name() const override { return "reference_count"; } - - void AddVar(const std::string &name) { - auto it = var_names_.find(name); - if (it != var_names_.end()) - ++(it->second); - else - var_names_[name] = 1; - } - - protected: - void RunImpl() override { - auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); - std::vector tensors; - for (auto &pair : var_names_) { - auto &name = pair.first; - auto it = ref_cnts_->find(name); - if (it == ref_cnts_->end()) continue; - - auto *var = exec_scope->FindVar(name); - if (var == nullptr) continue; - - if (var->IsType()) { - if (it->second.fetch_sub(pair.second) <= pair.second) { - tensors.emplace_back(var->GetMutable()); - } - } else if (var->IsType()) { - if (it->second.fetch_sub(pair.second) <= pair.second) { - tensors.emplace_back( - var->GetMutable()->mutable_value()); - } - } - } - - if (!tensors.empty()) { - ClearTensors(tensors); - } - } - - private: - void ClearTensors(const std::vector &tensors) { - auto *gc = dynamic_cast *>(gc_); - if (gc != nullptr) { - auto compute_stream = dev_ctx_->stream(); - auto callback_stream = gc->stream(); - auto callback_func = [=]() { - PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); - PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); - }; - gc_->Add(tensors, callback_func); - } else { - gc_->Add(tensors); - } - } - - bool IsStreamGarabageCollector() const { - return dynamic_cast *>(gc_) != nullptr; - } - - const Scope *scope_; - platform::CUDADeviceContext *dev_ctx_; - std::unordered_map var_names_; - GarbageCollector *gc_; // not own - AtomicReferenceCountMap *ref_cnts_; // not own - cudaEvent_t event_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 08783fb5f8..f094c7afa9 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -17,184 +17,96 @@ #include #include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/reference_count_pass.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { namespace details { -static ComputationOpHandle *FindNextComputationOpHandle(VarHandle *var_in) { - std::queue queue; - queue.push(var_in); +static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( + OpHandleBase *op, size_t scope_idx) { + std::queue q; + std::unordered_set visited; + q.push(op); do { - auto *var = queue.front(); - queue.pop(); - for (auto *op : var->PendingOps()) { - auto *compute_op = dynamic_cast(op); - if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) { - return compute_op; - } - for (auto *out_var : op->Outputs()) { - queue.push(out_var); + auto *op = q.front(); + q.pop(); + auto *compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetScopeIdx() == scope_idx) { + return compute_op; + } + for (auto *out_var : op->Outputs()) { + for (auto *pending_op : out_var->PendingOps()) { + if (visited.count(pending_op)) continue; + visited.insert(pending_op); } } - } while (!queue.empty()); + } while (!q.empty()); return nullptr; } -static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, - ir::Graph *graph) { - auto it = std::find_if( - in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { - return dynamic_cast(var) != nullptr; - }); - - if (it != in->Outputs().end()) { - out->AddInput(*it); - } else { - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dep_var); - in->AddOutput(dep_var); - out->AddInput(dep_var); - } -} - std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { - auto &ref_cnts = Get(kGlobalReferenceCount); - auto &cur_ref_cnts = Get(kCurReferenceCount); - auto &gcs = Get(kGarbageCollector); - - // It is not easy to find the right reference counts of varaibles in graph - // Step 1: Find all variables in computation ops - // Step 2: Find all variables in non-computation ops which refers to variables - // in computation ops - std::unordered_set names; - std::unordered_map - compute_ref_cnt_map; - - auto get_ref_cnts_from_compute_op = [&]( - OpHandleBase *op, const std::vector &vars) { - std::vector var_names_in_op; - auto *compute_op = dynamic_cast(op); - if (compute_op == nullptr || - !platform::is_gpu_place(compute_op->GetPlace())) - return var_names_in_op; - auto place = boost::get(compute_op->GetPlace()); - for (VarHandleBase *var_handle_base : vars) { - auto *var_handle = dynamic_cast(var_handle_base); - if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; - - if (!platform::is_gpu_place(var_handle->place_) || - boost::get(var_handle->place_) != place) + auto &vars = graph->Get(kGraphVars); + auto &ref_cnts = Get>(kGlobalReferenceCount); + auto &last_live_ops_of_vars = + Get>(kLastLiveOpsOfVars); + + last_live_ops_of_vars = std::vector(vars.size()); + ref_cnts = std::vector(vars.size()); + + for (size_t i = 0; i < vars.size(); ++i) { + for (auto &name_var_pair : vars[i]) { + if (name_var_pair.second.empty()) continue; + auto *last_ver_var = name_var_pair.second.back(); + + VarDesc *var_desc = nullptr; + std::find_if(name_var_pair.second.rbegin(), name_var_pair.second.rend(), + [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + + if (var_desc == nullptr || var_desc->Persistable()) { continue; - - VarDesc *var_desc = var_handle->Node()->Var(); - auto var_name = var_handle->Node()->Name(); - - // This is weird but there is really some variables without var_desc - // in computation_op - if (var_desc == nullptr) { - var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name); - if (var_desc == nullptr) continue; } - if (var_desc->Persistable()) continue; auto var_type = var_desc->Proto()->type().type(); if (var_type != proto::VarType::LOD_TENSOR && - var_type != proto::VarType::SELECTED_ROWS) { + var_type != proto::VarType::SELECTED_ROWS && + var_type != proto::VarType::LOD_TENSOR_ARRAY) { continue; } - // compute op only runs in one device - if (ref_cnts[place.device]->count(var_name)) - ++(*ref_cnts[place.device])[var_name]; - else - (*ref_cnts[place.device])[var_name] = 1; - - names.insert(var_name); - var_names_in_op.push_back(var_name); - } - return var_names_in_op; - }; - - auto update_ref_cnts_from_non_compute_op = [&]( - OpHandleBase *op, const std::vector &vars) { - if (dynamic_cast(op) != nullptr) return; - for (VarHandleBase *var_handle_base : vars) { - auto *var_handle = dynamic_cast(var_handle_base); - if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; - - auto var_name = var_handle->Node()->Name(); - auto var_place = var_handle->place_; - if (!platform::is_gpu_place(var_place)) continue; - auto place = boost::get(var_place); - if (names.count(var_name) == 0) continue; - if (ref_cnts.count(place.device) && - ref_cnts[place.device]->count(var_name)) { - ++(*ref_cnts[place.device])[var_name]; - - auto *next_compute_op = FindNextComputationOpHandle(var_handle); - if (next_compute_op != nullptr) { - if (compute_ref_cnt_map.count(next_compute_op)) { - compute_ref_cnt_map[next_compute_op]->AddVar(var_name); - VLOG(5) << "Add reference count of " << var_name << " to Operator " - << next_compute_op->Name(); - } else { - // Create new reference_count_op_handle - ir::Node *ref_cnt_node = graph->CreateEmptyNode( - "reference_count", ir::Node::Type::kOperation); - auto *ref_cnt_handle = new ReferenceCountOpHandle( - ref_cnt_node, next_compute_op->GetScope(), place, {var_name}, - gcs[place.device].get(), cur_ref_cnts[place.device].get()); - AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[next_compute_op] = ref_cnt_handle; - } + std::unordered_set last_live_op; + auto add_last_live_op = [&](OpHandleBase *op) { + auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); + if (compute_op) { + last_live_op.insert(compute_op); + } + }; + const std::string &var_name = name_var_pair.first; + auto &pending_ops = last_ver_var->PendingOps(); + if (pending_ops.empty()) { + auto *generated_op = last_ver_var->GeneratedOp(); + if (generated_op) { + ref_cnts[i].emplace(var_name, 1); + add_last_live_op(generated_op); + } + } else { + ref_cnts[i].emplace(var_name, pending_ops.size()); + for (auto *pending_op : pending_ops) { + add_last_live_op(pending_op); } } - } - }; - auto all_ops = ir::FilterByNodeWrapper(*graph); - for (auto &op : all_ops) { - auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs()); - auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs()); - if (in_var_names.empty() && out_var_names.empty()) continue; - in_var_names.insert(in_var_names.end(), out_var_names.begin(), - out_var_names.end()); - auto *compute_op = dynamic_cast(op); - auto place = boost::get(compute_op->GetPlace()); - ir::Node *ref_cnt_node = - graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation); - auto *ref_cnt_handle = new ReferenceCountOpHandle( - ref_cnt_node, compute_op->GetScope(), place, in_var_names, - gcs[place.device].get(), cur_ref_cnts[place.device].get()); - AddDependencyBetween(compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[compute_op] = ref_cnt_handle; - } - - for (auto &op : all_ops) { - update_ref_cnts_from_non_compute_op(op, op->Inputs()); - update_ref_cnts_from_non_compute_op(op, op->Outputs()); - } - - std::vector new_all_ops; - new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size()); - for (auto &op : all_ops) { - new_all_ops.emplace_back(std::move(op)); - auto it = compute_ref_cnt_map.find(new_all_ops.back()); - if (it != compute_ref_cnt_map.end()) { - // Add LeafNode to ReferenceCountOpHandle - auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dummy_leaf); - it->second->AddOutput(dummy_leaf); - new_all_ops.emplace_back(std::move(it->second)); + last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); } } - - all_ops.swap(new_all_ops); return graph; } @@ -205,5 +117,4 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( REGISTER_PASS(reference_count_pass, paddle::framework::details::ReferenceCountPass) .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount) - .RequirePassAttr(paddle::framework::details::kCurReferenceCount) - .RequirePassAttr(paddle::framework::details::kGarbageCollector); + .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars); diff --git a/paddle/fluid/framework/details/reference_count_pass.h b/paddle/fluid/framework/details/reference_count_pass.h index 7081280b06..bcbef02735 100644 --- a/paddle/fluid/framework/details/reference_count_pass.h +++ b/paddle/fluid/framework/details/reference_count_pass.h @@ -14,7 +14,6 @@ #pragma once -#include "paddle/fluid/framework/details/reference_count_op_handle.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" @@ -22,10 +21,6 @@ namespace paddle { namespace framework { namespace details { -constexpr char kGlobalReferenceCount[] = "reference_count"; -constexpr char kCurReferenceCount[] = "current_reference_count"; -constexpr char kGarbageCollector[] = "garbage_collector"; - class ReferenceCountPass : public ir::Pass { protected: std::unique_ptr ApplyImpl( diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h new file mode 100644 index 0000000000..77846f7bdf --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { +namespace details { + +class ComputationOpHandle; + +using ReferenceCountMap = std::unordered_map; + +using AtomicReferenceCountMap = + std::unordered_map>; + +using GarbageCollectorList = + std::vector>>; + +const char kGlobalReferenceCount[] = "reference_count"; +const char kCurReferenceCount[] = "current_reference_count"; +const char kGarbageCollector[] = "garbage_collector"; + +using LastLiveOpsOfVars = + std::unordered_map>; +const char kLastLiveOpsOfVars[] = "last_live_ops_of_var"; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index e5b1eaa731..f1bf6542a3 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -18,9 +18,6 @@ #include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/platform/profiler.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/framework/details/reference_count_op_handle.h" -#endif namespace paddle { namespace framework { @@ -33,7 +30,11 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), var_infos_(std::move(var_infos)), - places_(std::move(places)) {} + places_(std::move(places)) { + if (Graph().Has(details::kGarbageCollector)) { + gc_ = &(Graph().Get(details::kGarbageCollector)); + } +} FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { @@ -69,27 +70,16 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); drop_scope_counter_ += 1; -#ifdef PADDLE_WITH_CUDA - const std::string gc_name = "garbage_collector"; - DeviceGarbageCollectorMap *gc = - Graph().Has(gc_name) ? &(Graph().Get(gc_name)) - : nullptr; -#endif - if (!fetch_tensors.empty() || drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams - for (auto p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); -#ifdef PADDLE_WITH_CUDA - if (gc != nullptr && platform::is_gpu_place(p)) { - auto gpu_place = boost::get(p); - auto &gc_at_place = gc->at(gpu_place.device); - gc_at_place->Wait(); - gc_at_place->Reset(); + for (size_t i = 0; i < places_.size(); ++i) { + platform::DeviceContextPool::Instance().Get(places_[i])->Wait(); + if (gc_) { + (*gc_)[i]->Wait(); + (*gc_)[i]->Reset(); } -#endif } for (auto &scope : local_scopes_) { auto &local_scope = diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 5e87e0bf50..ce3061d6e6 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -21,9 +21,11 @@ #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" + namespace paddle { namespace framework { namespace details { @@ -55,6 +57,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::vector var_infos_; std::vector places_; + + GarbageCollectorList* gc_{nullptr}; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 818b3334ea..cbe8f606ef 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -65,7 +65,7 @@ class GarbageCollector { if (clear_deque != nullptr) { callback(); - ClearCallback([=]() { + ClearCallback([clear_deque]() { for (auto *obj : *clear_deque) obj->clear(); }); } @@ -109,7 +109,6 @@ class DefaultStreamGarbageCollector : public GarbageCollector { } void Wait() const override { - this->dev_ctx_->Wait(); static_cast(this->dev_ctx_) ->WaitStreamCallback(); } @@ -127,14 +126,14 @@ class StreamGarbageCollector : public GarbageCollector { StreamGarbageCollector(const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { - PADDLE_ENFORCE(cudaSetDevice(place.device)); + platform::SetDeviceId(place.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } ~StreamGarbageCollector() { auto place = boost::get(this->dev_ctx_->GetPlace()); - PADDLE_ENFORCE(cudaSetDevice(place.device)); + platform::SetDeviceId(place.device); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); } @@ -148,8 +147,11 @@ class StreamGarbageCollector : public GarbageCollector { cudaStream_t stream() const { return stream_; } protected: + // ClearCallback and Wait()/Reset() cannot be call in multiple threads + // But it is not important, because they would not be called in multiple + // threads + // either in Executor or ParallelExecutor void ClearCallback(const std::function &callback) override { - std::lock_guard guard(this->mutex_); callback_manager_->AddCallback(callback); } diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 947c934f0f..7a2560c14d 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -73,14 +73,21 @@ class Graph { } bool Has(const std::string &attr_name) const { - return attrs_.find(attr_name) != attrs_.end(); + return attrs_.count(attr_name) > 0; } template AttrType &Get(const std::string &attr_name) const { PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.", attr_name); - return *boost::any_cast(attrs_.at(attr_name)); + try { + return *boost::any_cast(attrs_.at(attr_name)); + } catch (boost::bad_any_cast &) { + PADDLE_THROW( + "Invalid attribute type of %s error, expected: %s, actual: %s", + attr_name, typeid(AttrType *).name(), + attrs_.at(attr_name).type().name()); + } } template diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index a3559247db..27746ff145 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -51,11 +51,18 @@ class Pass { AttrType &Get(const std::string &attr_name) const { PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(), "%s attr not registered for pass.", attr_name); - return *boost::any_cast(attrs_.at(attr_name)); + try { + return *boost::any_cast(attrs_.at(attr_name)); + } catch (boost::bad_any_cast &) { + PADDLE_THROW( + "Invalid attribute type of %s error, expected: %s, actual: %s", + attr_name, typeid(AttrType *).name(), + attrs_.at(attr_name).type().name()); + } } bool Has(const std::string &attr_name) const { - return attrs_.find(attr_name) != attrs_.end(); + return attrs_.count(attr_name) > 0; } void Erase(const std::string &attr_name) { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b98408ee77..e71f93beef 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -49,6 +50,15 @@ class ParallelExecutorPrivate { } } } + + void ResetRuntimeReferenceCount() { + for (size_t i = 0; i < rt_ref_cnts_.size(); ++i) { + for (auto &pair : rt_ref_cnts_[i]) { + rt_cur_ref_cnts_[i][pair.first] = pair.second; + } + } + } + std::vector places_; std::vector local_scopes_; Scope *global_scope_; // not owned @@ -60,6 +70,13 @@ class ParallelExecutorPrivate { bool own_local_scope_; bool use_cuda_; bool use_all_reduce_; + + // rt_ref_cnts_ is only initialized when ParallelExecutor constructs, and then + // keeps unchanged + // Before each iteration, rt_cur_ref_cnts_ is reset to ref_cnts_ + std::vector rt_ref_cnts_; + std::vector rt_cur_ref_cnts_; + details::GarbageCollectorList gcs_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -128,35 +145,56 @@ ParallelExecutor::ParallelExecutor( std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); +#else + std::unique_ptr graph = + build_strategy.Apply(main_program, member_->places_, loss_var_name, + params, member_->local_scopes_, member_->use_cuda_); +#endif auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { - for (auto &place : member_->places_) { - if (!platform::is_gpu_place(place)) continue; - auto gpu_place = boost::get(place); - if (gcs_[gpu_place.device] == nullptr) { - ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap()); - cur_ref_cnts_[gpu_place.device].reset( - new details::AtomicReferenceCountMap()); - gcs_[gpu_place.device].reset( - new StreamGarbageCollector(gpu_place, max_memory_size)); + size_t place_num = member_->places_.size(); + for (size_t i = 0; i < place_num; ++i) { + auto &place = member_->places_[i]; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + member_->gcs_.emplace_back(new StreamGarbageCollector( + boost::get(place), max_memory_size)); + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; + } else if (platform::is_cpu_place(place)) { +#endif + member_->gcs_.emplace_back(new CPUGarbageCollector( + boost::get(place), max_memory_size)); + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; +#ifdef PADDLE_WITH_CUDA } - } - if (!gcs_.empty()) { - auto ref_cnt_pass = - ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graph = ref_cnt_pass->Apply(std::move(graph)); - graph->SetNotOwned("garbage_collector", &gcs_); +#endif } } -#else - std::unique_ptr graph = - build_strategy.Apply(main_program, member_->places_, loss_var_name, - params, member_->local_scopes_, member_->use_cuda_); -#endif + + if (!member_->gcs_.empty()) { + std::vector last_live_ops_of_vars; + + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, + &(member_->rt_ref_cnts_)); + ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + VLOG(10) << "ReferenceCountPass Applied"; + graph = ref_cnt_pass->Apply(std::move(graph)); + + auto eager_deletion_pass = + ir::PassRegistry::Instance().Get("eager_deletion_pass"); + eager_deletion_pass->SetNotOwned(details::kCurReferenceCount, + &(member_->rt_cur_ref_cnts_)); + eager_deletion_pass->SetNotOwned(details::kGarbageCollector, + &(member_->gcs_)); + eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + graph = eager_deletion_pass->Apply(std::move(graph)); + VLOG(10) << "EagerDeletionPass Applied"; + } // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars @@ -271,18 +309,16 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { platform::RecordBlock b(0); -#ifdef PADDLE_WITH_CUDA - if (!gcs_.empty()) { - ResetReferenceCount(); - for (auto &pair : cur_ref_cnts_) { - auto &name_map = *(pair.second); + if (!member_->gcs_.empty()) { + member_->ResetRuntimeReferenceCount(); + size_t n = member_->rt_ref_cnts_.size(); + for (size_t i = 0; i < n; ++i) { for (auto &fetch_name : fetch_tensors) { - name_map.erase(fetch_name); + member_->rt_cur_ref_cnts_[i].erase(fetch_name); } - name_map.erase(fetched_var_name); + member_->rt_cur_ref_cnts_[i].erase(fetched_var_name); } } -#endif auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; @@ -326,13 +362,11 @@ ParallelExecutor::~ParallelExecutor() { for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - // member_ must be destructed before gcs_ since the destructor of - // ReferenceCountOpHandle use raw pointers of gcs_ inside. - member_.reset(); + delete member_; } } // namespace framework } // namespace paddle -#ifdef PADDLE_WITH_CUDA + USE_PASS(reference_count_pass); -#endif +USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ef09b98b2a..1fc17a0d64 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include #include #include #include @@ -29,10 +28,6 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/framework/details/reference_count_pass.h" -#endif - namespace paddle { namespace framework { @@ -75,24 +70,7 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; - std::unique_ptr member_; - -#ifdef PADDLE_WITH_CUDA - // ref_cnts_ is only initialized when ParallelExecutor constructs, and then - // keeps unchanged - // Before each iteration, cur_ref_cnts_ is reset to ref_cnts_ - details::DeviceReferenceCountMap ref_cnts_; - details::AtomicDeviceReferenceCountMap cur_ref_cnts_; - details::DeviceGarbageCollectorMap gcs_; - - void ResetReferenceCount() { - for (auto &pair1 : ref_cnts_) { - for (auto &pair2 : *(pair1.second)) { - (*(cur_ref_cnts_[pair1.first]))[pair2.first] = pair2.second; - } - } - } -#endif + ParallelExecutorPrivate *member_; }; } // namespace framework diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 93cb5eb2dc..23c7ebe842 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -56,9 +56,16 @@ ELSE() set(MKLDNN_CTX_DEPS) ENDIF() +nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) +IF(WITH_GPU) + set(STREAM_CALLBACK_DEPS stream_callback_manager) +ELSE() + set(STREAM_CALLBACK_DEPS) +ENDIF() + # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies -cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc +cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc new file mode 100644 index 0000000000..ae915365f8 --- /dev/null +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/stream_callback_manager.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +struct StreamCallbackContext { + inline StreamCallbackContext(const StreamCallbackManager *manager, + std::function callback) + : manager_(manager), callback_(std::move(callback)) {} + + const StreamCallbackManager *manager_; // do not own + std::function callback_; +}; + +StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream) + : stream_(stream), thread_pool_(new ::ThreadPool(1)) {} + +void StreamCallbackManager::AddCallback(std::function callback) const { + auto *stream_callback_context = + new StreamCallbackContext(this, std::move(callback)); +#if CUDA_VERSION >= 10000 + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, + StreamCallbackManager::StreamCallbackFunc, + stream_callback_context)); +#else + PADDLE_ENFORCE( + cudaStreamAddCallback(stream_, StreamCallbackManager::StreamCallbackFunc, + stream_callback_context, 0)); +#endif +} + +void StreamCallbackManager::Wait() const { + thread_pool_.reset(new ::ThreadPool(1)); +} + +#if CUDA_VERSION >= 10000 +void CUDART_CB StreamCallbackManager::StreamCallbackFunc(void *user_data) +#else +void CUDART_CB StreamCallbackManager::StreamCallbackFunc(cudaStream_t stream, + cudaError_t status, + void *user_data) +#endif +{ + auto *callback_context_ptr = + reinterpret_cast(user_data); + callback_context_ptr->manager_->thread_pool_->enqueue( + [callback_context_ptr]() { + std::unique_ptr callback_context( + callback_context_ptr); + callback_context->callback_(); + }); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index ed8734c98c..eac4806d13 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -19,66 +19,29 @@ #include #include #include -#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { -class StreamCallbackManager; - -struct StreamCallbackContext { - template - inline StreamCallbackContext(const StreamCallbackManager *manager, - Callback &&callback) - : manager_(manager), callback_(callback) {} - - const StreamCallbackManager *manager_; // do not own - std::function callback_; -}; - +// NOTE(zjl): clean StreamCallback to make compilation faster class StreamCallbackManager { public: - explicit inline StreamCallbackManager(cudaStream_t stream = nullptr) - : stream_(stream), thread_pool_(new ThreadPool(1)) {} + explicit StreamCallbackManager(const cudaStream_t stream); - template - inline void AddCallback(Callback &&callback) const { - auto *stream_callback_context = - new StreamCallbackContext(this, std::forward(callback)); -#if CUDA_VERSION >= 10000 - PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context)); // NOLINT -#else - PADDLE_ENFORCE(cudaStreamAddCallback( - stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0)); // NOLINT -#endif - } + void AddCallback(std::function callback) const; - void Wait() const { thread_pool_.reset(new ThreadPool(1)); } + void Wait() const; private: const cudaStream_t stream_; - mutable std::unique_ptr thread_pool_; + mutable std::unique_ptr<::ThreadPool> thread_pool_; -// cudaStreamCallback cannot call CUDA API inside, so we have to use -// thread_pool here #if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void *user_data) + static void CUDART_CB StreamCallbackFunc(void *user_data); #else static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, void *user_data) + cudaError_t status, void *user_data); #endif - { - auto *callback_context_ptr = - reinterpret_cast(user_data); - callback_context_ptr->manager_->thread_pool_->enqueue([=]() { - std::unique_ptr callback_context( - callback_context_ptr); - callback_context->callback_(); - }); - } }; } // namespace platform From fc61bf1b168992bcedf5b645a841f6ea8a52d449 Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Thu, 29 Nov 2018 12:06:16 -0800 Subject: [PATCH 045/719] Renamed methods test=develope --- paddle/fluid/framework/ngraph_bridge.cc | 2 +- paddle/fluid/framework/ngraph_bridge.h | 2 +- paddle/fluid/framework/ngraph_operator.cc | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index 45ef0211ad..e22c290377 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -111,7 +111,7 @@ std::map}, {"tanh", BuildUnaryNode}}; -void NgraphBridge::BuildNgGraph(const std::shared_ptr& op) { +void NgraphBridge::BuildNgNode(const std::shared_ptr& op) { auto& op_type = op->Type(); NG_NODE_MAP[op_type](op, ngb_node_map_); } diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h index 3cf62b6daa..9ed6b95109 100644 --- a/paddle/fluid/framework/ngraph_bridge.h +++ b/paddle/fluid/framework/ngraph_bridge.h @@ -43,7 +43,7 @@ class NgraphBridge { var_node_map) : ngb_node_map_(var_node_map) {} - void BuildNgGraph(const std::shared_ptr& op); + void BuildNgNode(const std::shared_ptr& op); private: std::shared_ptr< diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 1c770a2370..3fea753f06 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -122,7 +122,7 @@ class NgraphOperator { // get ngraph input and define ngraph input parameters void GetNgInputShape(std::shared_ptr op); // Call ngraph bridge to map ops - void BuildNgNode(); + void BuildNgNodes(); // get the ngraph input and output var list void BuildNgIO(); // build ngraph function call @@ -301,7 +301,7 @@ void NgraphOperator::GetNgInputShape(std::shared_ptr op) { } } -void NgraphOperator::BuildNgNode() { +void NgraphOperator::BuildNgNodes() { for (auto& var_name : var_out_) { if (var_node_map_->find(var_name) == var_node_map_->end()) { auto* var = scope_.FindVar(var_name); @@ -319,7 +319,7 @@ void NgraphOperator::BuildNgNode() { paddle::framework::NgraphBridge ngb(var_node_map_); for (auto& op : fused_ops_) { - ngb.BuildNgGraph(op); + ngb.BuildNgNode(op); } } @@ -396,7 +396,7 @@ void NgraphOperator::BuildNgIO() { } void NgraphOperator::BuildNgFunction() { - BuildNgNode(); + BuildNgNodes(); ngraph_function_ = nullptr; ngraph::NodeVector func_outputs; ngraph::op::ParameterVector func_inputs; From 86ae32fbd81abf91646f976379a3fd6a89e448d9 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 30 Nov 2018 10:26:15 +0800 Subject: [PATCH 046/719] Stablize decorator test test=develop --- python/paddle/reader/tests/decorator_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py index b9af8348e1..a9dddbbcc8 100644 --- a/python/paddle/reader/tests/decorator_test.py +++ b/python/paddle/reader/tests/decorator_test.py @@ -62,10 +62,10 @@ class TestBuffered(unittest.TestCase): for idx, i in enumerate(b()): elapsed_time = time.time() - last_time if i == 0: - time.sleep(0.3) + time.sleep(1) else: # read time should be short, meaning already buffered. - self.assertLess(elapsed_time, 0.05) + self.assertLess(elapsed_time, 0.08) last_time = time.time() From a770d5c9db32b2d5af56f76c1e67e28e24803c9a Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 30 Nov 2018 05:40:20 +0000 Subject: [PATCH 047/719] fix error don't interupt shell , test=develop --- paddle/scripts/paddle_build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index bdc4210640..62a2b08a8a 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -442,6 +442,7 @@ EOF make install -j 8 if [ "$1" == "cp27-cp27m" ]; then pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + set -e python -c "import paddle.fluid" elif [ "$1" == "cp35-cp35m" ]; then pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl From 3df05389407cd0af09e1a2fc9cfde633aec81070 Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Fri, 30 Nov 2018 08:04:38 +0000 Subject: [PATCH 048/719] replace -100 to kIgnoreIndex --- .../operators/sigmoid_cross_entropy_with_logits_op.cc | 11 ++++++----- python/paddle/fluid/layers/nn.py | 11 ++++++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 368988d60d..14746fa951 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -18,6 +18,7 @@ namespace paddle { namespace operators { using framework::Tensor; +const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: @@ -100,11 +101,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker AddOutput("Out", "(Tensor, default Tensor), a 2-D tensor with shape N x D " " of elementwise logistic losses."); - AddAttr( - "ignore_index", - "(int, default -100), Specifies a target value that is ignored and" - "does not contribute to the input gradient.") - .SetDefault(-100); + AddAttr("ignore_index", + "(int, default kIgnoreIndex), Specifies a target value that " + "is ignored and" + "does not contribute to the input gradient.") + .SetDefault(kIgnoreIndex); AddComment(R"DOC( SigmoidCrossEntropyWithLogits Operator. diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 38da9173cc..f746e4424d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -170,6 +170,8 @@ __all__ = [ 'bilinear_tensor_product', ] +kIgnoreIndex = -100 + def fc(input, size, @@ -1103,7 +1105,7 @@ def dropout(x, return out -def cross_entropy(input, label, soft_label=False, ignore_index=-100): +def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): """ **Cross Entropy Layer** @@ -4796,7 +4798,7 @@ def multiplex(inputs, index): def softmax_with_cross_entropy(logits, label, soft_label=False, - ignore_index=-100, + ignore_index=kIgnoreIndex, numeric_stable_mode=False, return_softmax=False): """ @@ -7892,7 +7894,10 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): @templatedoc() -def sigmoid_cross_entropy_with_logits(x, label, ignore_index=-100, name=None): +def sigmoid_cross_entropy_with_logits(x, + label, + ignore_index=kIgnoreIndex, + name=None): """ ${comment} From 126e18c1e893d6129150a6c0af16552c1ed66b54 Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Fri, 30 Nov 2018 09:12:44 +0000 Subject: [PATCH 049/719] test=develop --- python/paddle/fluid/layers/nn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index f746e4424d..29b05d2eea 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1152,7 +1152,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): labels. Default: `False`. ignore_index (int): Specifies a target value that is ignored and does not contribute to the input gradient. Only valid - if soft_label is set to False. Default: -100 + if soft_label is set to False. Default: kIgnoreIndex Returns: A 2-D tensor with shape [N x 1], the cross entropy loss. @@ -4856,7 +4856,7 @@ def softmax_with_cross_entropy(logits, labels as soft labels. By default, `soft_label` is set to False. ignore_index (int): Specifies a target value that is ignored and does not contribute to the input gradient. Only valid - if soft_label is set to False. Default: -100 + if soft_label is set to False. Default: kIgnoreIndex numeric_stable_mode (bool): A flag to indicate whether to use a more numerically stable algorithm. Only valid when soft_label is False and GPU is used. From 679d8fc6fe0349c7438cbe45f4d4a1deba52837e Mon Sep 17 00:00:00 2001 From: chengduozh Date: Fri, 30 Nov 2018 18:52:50 +0800 Subject: [PATCH 050/719] rename op name test=develop --- paddle/fluid/operators/lstm_cudnn_op.cc | 8 ++++---- paddle/fluid/operators/lstm_cudnn_op.cu.cc | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/lstm_cudnn_op.cc b/paddle/fluid/operators/lstm_cudnn_op.cc index ca60fb4b0b..c9a4a31738 100644 --- a/paddle/fluid/operators/lstm_cudnn_op.cc +++ b/paddle/fluid/operators/lstm_cudnn_op.cc @@ -205,14 +205,14 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker, +REGISTER_OPERATOR(lstm_cudnn, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp); +REGISTER_OPERATOR(lstm_cudnn_grad, ops::CudnnLSTMGradOp); REGISTER_OP_CPU_KERNEL( - cudnn_lstm, + lstm_cudnn, ops::CudnnLSTMKernel); REGISTER_OP_CPU_KERNEL( - cudnn_lstm_grad, + lstm_cudnn_grad, ops::CudnnLSTMGradKernel); diff --git a/paddle/fluid/operators/lstm_cudnn_op.cu.cc b/paddle/fluid/operators/lstm_cudnn_op.cu.cc index 7a67bbe539..353ab17597 100644 --- a/paddle/fluid/operators/lstm_cudnn_op.cu.cc +++ b/paddle/fluid/operators/lstm_cudnn_op.cu.cc @@ -487,8 +487,8 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - cudnn_lstm, + lstm_cudnn, ops::CudnnLSTMGPUKernel); REGISTER_OP_CUDA_KERNEL( - cudnn_lstm_grad, + lstm_cudnn_grad, ops::CudnnLSTMGPUGradKernel); From af8c2cec136ce0893d4a35155f6760f05109856a Mon Sep 17 00:00:00 2001 From: chengduozh Date: Fri, 30 Nov 2018 20:09:34 +0800 Subject: [PATCH 051/719] fix operator.cmake test=develop --- .../fluid/operators/{lstm_cudnn_op.cc => cudnn_lstm_op.cc} | 6 +++--- .../operators/{lstm_cudnn_op.cu.cc => cudnn_lstm_op.cu.cc} | 6 +++--- paddle/fluid/operators/{lstm_cudnn_op.h => cudnn_lstm_op.h} | 0 3 files changed, 6 insertions(+), 6 deletions(-) rename paddle/fluid/operators/{lstm_cudnn_op.cc => cudnn_lstm_op.cc} (98%) rename paddle/fluid/operators/{lstm_cudnn_op.cu.cc => cudnn_lstm_op.cu.cc} (99%) rename paddle/fluid/operators/{lstm_cudnn_op.h => cudnn_lstm_op.h} (100%) diff --git a/paddle/fluid/operators/lstm_cudnn_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc similarity index 98% rename from paddle/fluid/operators/lstm_cudnn_op.cc rename to paddle/fluid/operators/cudnn_lstm_op.cc index c9a4a31738..86632fc9fb 100644 --- a/paddle/fluid/operators/lstm_cudnn_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/lstm_cudnn_op.h" +#include "paddle/fluid/operators/cudnn_lstm_op.h" #include #ifdef PADDLE_WITH_CUDA @@ -205,12 +205,12 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(lstm_cudnn, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker, +REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(lstm_cudnn_grad, ops::CudnnLSTMGradOp); REGISTER_OP_CPU_KERNEL( - lstm_cudnn, + cudnn_lstm, ops::CudnnLSTMKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/lstm_cudnn_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc similarity index 99% rename from paddle/fluid/operators/lstm_cudnn_op.cu.cc rename to paddle/fluid/operators/cudnn_lstm_op.cu.cc index 353ab17597..811975a9f3 100644 --- a/paddle/fluid/operators/lstm_cudnn_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/lstm_cudnn_op.h" +#include "paddle/fluid/operators/cudnn_lstm_op.h" #include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { @@ -487,8 +487,8 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - lstm_cudnn, + cudnn_lstm, ops::CudnnLSTMGPUKernel); REGISTER_OP_CUDA_KERNEL( - lstm_cudnn_grad, + cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel); diff --git a/paddle/fluid/operators/lstm_cudnn_op.h b/paddle/fluid/operators/cudnn_lstm_op.h similarity index 100% rename from paddle/fluid/operators/lstm_cudnn_op.h rename to paddle/fluid/operators/cudnn_lstm_op.h From 3f4aca618f4ef0b87881c4a7ae62af2490e35780 Mon Sep 17 00:00:00 2001 From: chengduozh Date: Fri, 30 Nov 2018 21:03:28 +0800 Subject: [PATCH 052/719] code refine test=develop --- paddle/fluid/operators/cudnn_lstm_op.cc | 26 ++++++------- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 17 ++++---- paddle/fluid/operators/cudnn_lstm_op.h | 45 ---------------------- 3 files changed, 20 insertions(+), 68 deletions(-) delete mode 100644 paddle/fluid/operators/cudnn_lstm_op.h diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index 86632fc9fb..e63d57be57 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -12,12 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/cudnn_lstm_op.h" #include - -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -201,18 +197,22 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { } }; +template +class NotImpleKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW( + "CPU is not support for this kernel now. Will be add in the future"); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(lstm_cudnn_grad, ops::CudnnLSTMGradOp); - -REGISTER_OP_CPU_KERNEL( - cudnn_lstm, - ops::CudnnLSTMKernel); +REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp); -REGISTER_OP_CPU_KERNEL( - lstm_cudnn_grad, - ops::CudnnLSTMGradKernel); +REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel); +REGISTER_OP_CPU_KERNEL(cudnn_lstm_grad, ops::NotImpleKernel); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 811975a9f3..cad62de754 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/cudnn_lstm_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { @@ -246,7 +247,7 @@ struct CudnnRNNCache { } }; -template +template class CudnnLSTMGPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -343,7 +344,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { } }; -template +template class CudnnLSTMGPUGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -380,7 +381,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto init_c_dims = init_c->dims(); in_grad->mutable_data(ctx.GetPlace()); weight_grad->mutable_data(ctx.GetPlace()); - math::SetConstant zero; + math::SetConstant zero; zero(dev_ctx, in_grad, static_cast(0.0)); zero(dev_ctx, weight_grad, static_cast(0.0)); @@ -486,9 +487,5 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cudnn_lstm, - ops::CudnnLSTMGPUKernel); -REGISTER_OP_CUDA_KERNEL( - cudnn_lstm_grad, - ops::CudnnLSTMGPUGradKernel); +REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel); +REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel); diff --git a/paddle/fluid/operators/cudnn_lstm_op.h b/paddle/fluid/operators/cudnn_lstm_op.h deleted file mode 100644 index fc329cc239..0000000000 --- a/paddle/fluid/operators/cudnn_lstm_op.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" - -namespace paddle { -namespace operators { - -using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; - -template -class CudnnLSTMKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW( - "CPU is not support for this kernel now. Will be add in the future"); - } -}; - -template -class CudnnLSTMGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override {} -}; - -} // namespace operators -} // namespace paddle From a6aa782ee5b0bfdb88c3b00fccbb72a6ae904195 Mon Sep 17 00:00:00 2001 From: phlrain Date: Fri, 30 Nov 2018 21:49:14 +0800 Subject: [PATCH 053/719] add unitest --- .../tests/unittests/test_lstm_cudnn_op.py | 154 ++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py new file mode 100644 index 0000000000..2741bf167b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -0,0 +1,154 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest + + +def lstm_naive( + input, + w, ): + seq_len, batch_size, hidden_size = input.shape + + offset = 0 + wi = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + wf = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + wc = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + wo = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + ri = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + rf = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + rc = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + ro = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + + bi_1 = w[offset:offset + hidden_size] + offset += hidden_size + bf_1 = w[offset:offset + hidden_size] + offset += hidden_size + bc_1 = w[offset:offset + hidden_size] + offset += hidden_size + bo_1 = w[offset:offset + hidden_size] + offset += hidden_size + + bi_2 = w[offset:offset + hidden_size] + offset += hidden_size + bf_2 = w[offset:offset + hidden_size] + offset += hidden_size + bc_2 = w[offset:offset + hidden_size] + offset += hidden_size + bo_2 = w[offset:offset + hidden_size] + + def sigmoid(x): + return 1.0 / (1.0 + np.exp(-x)) + + def tanh(x): + return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x)) + + output = [] + pre_h = np.zeros((batch_size, hidden_size), dtype=input.dtype) + pre_c = np.zeros((batch_size, hidden_size), dtype=input.dtype) + + for i in range(seq_len): + emb_1 = input[i] + + input_gate = sigmoid( + np.matmul(emb_1, wi) + np.matmul(pre_h, ri) + bi_1 + bi_2) + forget_gate = sigmoid( + np.matmul(emb_1, wf) + np.matmul(pre_h, rf) + bf_1 + bf_2) + output_gate = sigmoid( + np.matmul(emb_1, wo) + np.matmul(pre_h, ro) + bo_1 + bo_2) + c_t_temp = tanh( + np.matmul(emb_1, wc) + np.matmul(pre_h, rc) + bc_1 + bc_2) + new_c = input_gate * c_t_temp + forget_gate * pre_c + new_h = output_gate * tanh(new_c) + + pre_h = new_h + pre_c = new_c + + output.append(new_h) + + output = np.concatenate(output, -1) + output = output.reshape((batch_size, -1, hidden_size)) + + output = output.transpose((1, 0, 2)) + + return output + + +class TestCUDNNLstmOp(OpTest): + def setUp(self): + self.op_type = "cudnn_lstm" + self.dtype = np.float32 + + num_steps = 50 + batch_size = 20 + hidden_size = 200 + + input_weight_size = (hidden_size * hidden_size) * 4 + hidden_weight_size = (hidden_size * hidden_size) * 4 + weight_size = input_weight_size + hidden_weight_size + weight_size += hidden_size * 8 + + input = np.random.random( + (num_steps, batch_size, hidden_size)).astype(self.dtype) + flat_w = np.random.random((weight_size)).astype(self.dtype) + + output = lstm_naive(input, flat_w) + + init_h = np.zeros((batch_size, hidden_size), dtype=np.float32) + init_c = np.zeros((batch_size, hidden_size), dtype=np.float32) + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'W': OpTest.np_dtype_to_fluid_dtype(flat_w), + 'InitH': OpTest.np_dtype_to_fluid_dtype(init_h), + 'InitC': OpTest.np_dtype_to_fluid_dtype(init_c), + } + self.attrs = { + 'max_len': num_steps, + 'dropout_prob': 0.0, + 'is_bidirec': False, + 'input_size': hidden_size, + 'hidden_size': hidden_size, + 'num_layers': 1, + } + self.outputs = {'Out': output} + + def test_grad_with_place(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, atol=1e-5) + + def test_output_with_place(self): + place = core.CUDAPlace(0) + self.check_output_with_place( + place, atol=1e-5, no_check_set=['last_h', 'last_c']) From bc7db6cec944a8ab5099dfd8d0e03ebd5fa1eb60 Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Fri, 30 Nov 2018 15:25:44 +0100 Subject: [PATCH 054/719] Fix for accuracy problem for inplace operators when MKL-DNN mode is enabled test=develop --- paddle/fluid/framework/operator.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2260353af7..fc5036886a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -749,7 +749,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); - if (!transfered_inplace_vars.empty()) { + if (run_by_executor_ && !transfered_inplace_vars.empty()) { // there is inplace variable has been transfered. TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); } @@ -771,6 +771,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } } + void OperatorWithKernel::TransferInplaceVarsBack( const Scope& scope, const std::vector& inplace_vars, const Scope& transfer_scope) const { From 3c239cd640aca1fa8da71a9cdc319b8b4e4fb36c Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Sat, 1 Dec 2018 13:10:21 +0800 Subject: [PATCH 055/719] pslib --- CMakeLists.txt | 1 + cmake/external/pslib.cmake | 76 ++++++++++++++++++++++++ paddle/fluid/framework/async_executor.cc | 1 + 3 files changed, 78 insertions(+) create mode 100644 cmake/external/pslib.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index efa68c9ba2..5251fe286f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,6 +186,7 @@ endif() ######################################################################################## include(external/mklml) # download mklml package +include(external/pslib) # download mklml package include(external/xbyak) # download xbyak package include(external/libxsmm) # download, build, install libxsmm include(external/zlib) # download, build, install zlib diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake new file mode 100644 index 0000000000..812af5efa2 --- /dev/null +++ b/cmake/external/pslib.cmake @@ -0,0 +1,76 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_PSLIB}) + return() +ENDIF(NOT ${WITH_PSLIB}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with PSLIB in Paddle yet." + "Force WITH_PSLIB=OFF") + SET(WITH_PSLIB OFF CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(PSLIB_PROJECT "extern_pslib") +IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(PSLIB_VER "pslib" CACHE STRING "" FORCE) #todo pslib version + SET(PSLIB_URL "http://bjyz-heqiaozhi-dev-new.epc.baidu.com:8000/${PSLIB_VER}.tar.gz" CACHE STRING "" FORCE) #todo pslib url +ENDIF() +MESSAGE(STATUS "PSLIB_VER: ${PSLIB_VER}, PSLIB_URL: ${PSLIB_URL}") +SET(PSLIB_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib") +SET(PSLIB_DOWNLOAD_DIR "${PSLIB_SOURCE_DIR}/src/${PSLIB_PROJECT}") +SET(PSLIB_DST_DIR "pslib") +SET(PSLIB_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(PSLIB_INSTALL_DIR ${PSLIB_INSTALL_ROOT}/${PSLIB_DST_DIR}) +SET(PSLIB_ROOT ${PSLIB_INSTALL_DIR}) +SET(PSLIB_INC_DIR ${PSLIB_ROOT}/include) +SET(PSLIB_LIB_DIR ${PSLIB_ROOT}/lib) +SET(PSLIB_LIB ${PSLIB_LIB_DIR}/libps.so) +SET(PSLIB_IOMP_LIB ${PSLIB_LIB_DIR}/libiomp5.so) #todo what is this +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_ROOT}/lib") + +INCLUDE_DIRECTORIES(${PSLIB_INC_DIR}) + +FILE(WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(PSLIB)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${PSLIB_VER}/include ${PSLIB_VER}/lib \n" + " DESTINATION ${PSLIB_DST_DIR})\n") + +ExternalProject_Add( + ${PSLIB_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PSLIB_SOURCE_DIR} + DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_VER}.tar.gz + && tar zxvf ${PSLIB_VER}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT} +) + +ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) +ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) +LIST(APPEND external_project_dependencies pslib) + +IF(WITH_C_API) + INSTALL(FILES ${PSLIB_LIB} ${PSLIB_IOMP_LIB} DESTINATION lib) +ENDIF() diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index afb2dd2f06..aa76e03e83 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" +#include "pslib.h" namespace paddle { namespace framework { From 0e4709daddaf76e71a2de3f7490184453b2c1e17 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sat, 1 Dec 2018 13:14:03 +0800 Subject: [PATCH 056/719] add mpi4py helper --- python/paddle/fluid/distributed/helper.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 python/paddle/fluid/distributed/helper.py diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py new file mode 100644 index 0000000000..8e079b1e8d --- /dev/null +++ b/python/paddle/fluid/distributed/helper.py @@ -0,0 +1,20 @@ +from mpi4py import MPI + +class MPIHelper(object): + def __init__(self): + self.comm = MPI.COMM_WORLD + + def get_rank(self): + return self.comm.Get_rank() + + def get_size(self): + return self.comm.Get_size() + + def get_ip(self): + import socket + local_ip = socket.gethostbyname(socket.gethostname()) + return local_ip + + def get_hostname(self): + import socket + return socket.gethostname() From 038346c0c2053bbc0b051e7bb48de42d61af6958 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Sat, 1 Dec 2018 13:52:02 +0800 Subject: [PATCH 057/719] libmct --- cmake/external/libmct.cmake | 76 +++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 cmake/external/libmct.cmake diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake new file mode 100644 index 0000000000..351806f6e1 --- /dev/null +++ b/cmake/external/libmct.cmake @@ -0,0 +1,76 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_LIBMCT}) + return() +ENDIF(NOT ${WITH_LIBMCT}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with LIBMCT in Paddle yet." + "Force WITH_LIBMCT=OFF") + SET(WITH_LIBMCT OFF CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(LIBMCT_PROJECT "extern_libmct") +IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(LIBMCT_VER "libmct" CACHE STRING "" FORCE) #todo libmct version + SET(LIBMCT_URL "http://bjyz-heqiaozhi-dev-new.epc.baidu.com:8000/${LIBMCT_VER}.tar.gz" CACHE STRING "" FORCE) #todo libmct url +ENDIF() +MESSAGE(STATUS "LIBMCT_VER: ${LIBMCT_VER}, LIBMCT_URL: ${LIBMCT_URL}") +SET(LIBMCT_SOURCE_DIR "${THIRD_PARTY_PATH}/libmct") +SET(LIBMCT_DOWNLOAD_DIR "${LIBMCT_SOURCE_DIR}/src/${LIBMCT_PROJECT}") +SET(LIBMCT_DST_DIR "libmct") +SET(LIBMCT_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(LIBMCT_INSTALL_DIR ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR}) +SET(LIBMCT_ROOT ${LIBMCT_INSTALL_DIR}) +SET(LIBMCT_INC_DIR ${LIBMCT_ROOT}/include) +SET(LIBMCT_LIB_DIR ${LIBMCT_ROOT}/lib) +SET(LIBMCT_LIB ${LIBMCT_LIB_DIR}/libps.so) +SET(LIBMCT_IOMP_LIB ${LIBMCT_LIB_DIR}/libiomp5.so) #todo what is this +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib") + +INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR}) + +FILE(WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(LIBMCT)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${LIBMCT_VER}/include ${LIBMCT_VER}/lib \n" + " DESTINATION ${LIBMCT_DST_DIR})\n") + +ExternalProject_Add( + ${LIBMCT_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${LIBMCT_SOURCE_DIR} + DOWNLOAD_DIR ${LIBMCT_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_VER}.tar.gz + && tar zxvf ${LIBMCT_VER}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT} +) + +ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET libmct PROPERTY IMPORTED_LOCATION ${LIBMCT_LIB}) +ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT}) +LIST(APPEND external_project_dependencies libmct) + +IF(WITH_C_API) + INSTALL(FILES ${LIBMCT_LIB} ${LIBMCT_IOMP_LIB} DESTINATION lib) +ENDIF() From 4798a8c7b848891c18cd5b23e8023b88d9f32643 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Sat, 1 Dec 2018 14:51:40 +0800 Subject: [PATCH 058/719] pslib_brpc --- cmake/external/pslib_brpc.cmake | 76 +++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 cmake/external/pslib_brpc.cmake diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake new file mode 100644 index 0000000000..7b4beeae65 --- /dev/null +++ b/cmake/external/pslib_brpc.cmake @@ -0,0 +1,76 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_PSLIB_BRPC}) + return() +ENDIF(NOT ${WITH_PSLIB_BRPC}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with PSLIB_BRPC in Paddle yet." + "Force WITH_PSLIB_BRPC=OFF") + SET(WITH_PSLIB_BRPC OFF CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(PSLIB_BRPC_PROJECT "extern_pslib_brpc") +IF((NOT DEFINED PSLIB_BRPC_VER) OR (NOT DEFINED PSLIB_BRPC_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(PSLIB_BRPC_VER "pslib_brpc" CACHE STRING "" FORCE) #todo pslib version + SET(PSLIB_BRPC_URL "http://bjyz-heqiaozhi-dev-new.epc.baidu.com:8000/${PSLIB_BRPC_VER}.tar.gz" CACHE STRING "" FORCE) #todo pslib_brpc url +ENDIF() +MESSAGE(STATUS "PSLIB_BRPC_VER: ${PSLIB_BRPC_VER}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}") +SET(PSLIB_BRPC_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib_brpc") +SET(PSLIB_BRPC_DOWNLOAD_DIR "${PSLIB_BRPC_SOURCE_DIR}/src/${PSLIB_BRPC_PROJECT}") +SET(PSLIB_BRPC_DST_DIR "pslib_brpc") +SET(PSLIB_BRPC_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(PSLIB_BRPC_INSTALL_DIR ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR}) +SET(PSLIB_BRPC_ROOT ${PSLIB_BRPC_INSTALL_DIR}) +SET(PSLIB_BRPC_INC_DIR ${PSLIB_BRPC_ROOT}/include) +SET(PSLIB_BRPC_LIB_DIR ${PSLIB_BRPC_ROOT}/lib) +SET(PSLIB_BRPC_LIB ${PSLIB_BRPC_LIB_DIR}/libps.so) +SET(PSLIB_BRPC_IOMP_LIB ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) #todo what is this +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib") + +INCLUDE_DIRECTORIES(${PSLIB_BRPC_INC_DIR}) + +FILE(WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(PSLIB_BRPC)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${PSLIB_BRPC_VER}/include ${PSLIB_BRPC_VER}/lib \n" + " DESTINATION ${PSLIB_BRPC_DST_DIR})\n") + +ExternalProject_Add( + ${PSLIB_BRPC_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PSLIB_BRPC_SOURCE_DIR} + DOWNLOAD_DIR ${PSLIB_BRPC_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_VER}.tar.gz + && tar zxvf ${PSLIB_BRPC_VER}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT} +) + +ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB}) +ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT}) +LIST(APPEND external_project_dependencies pslib_brpc) + +IF(WITH_C_API) + INSTALL(FILES ${PSLIB_BRPC_LIB} ${PSLIB_BRPC_IOMP_LIB} DESTINATION lib) +ENDIF() From b65722d3cf0ac525eaf39fd026013f1aaf718531 Mon Sep 17 00:00:00 2001 From: phlrain Date: Sat, 1 Dec 2018 16:03:42 +0800 Subject: [PATCH 059/719] fix uni test; test=develop --- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 6 --- python/paddle/fluid/layers/nn.py | 13 +++-- .../paddle/fluid/tests/unittests/op_test.py | 9 ++++ .../tests/unittests/test_lstm_cudnn_op.py | 53 ++++++++++++++----- 4 files changed, 54 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index cad62de754..e01070c7b8 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -279,12 +279,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { int num_layers = ctx.Attr("num_layers"); bool is_test = ctx.Attr("is_test"); - /* - if (is_test) { - TensorCopy(*x, ctx.GetPlace(), out); - return; - }*/ - auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto *cache_var = ctx.InputVar("Cache"); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index f9e3da68d7..dbc39afccb 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -477,12 +477,10 @@ def lstm(input, init_h, init_c, max_len, - dropout_prob, - input_size, hidden_size, num_layers, + dropout_prob=0.0, is_bidirec=False, - dtype='float32', is_test=False, name=None, default_initializer=None, @@ -531,13 +529,11 @@ def lstm(input, This is a tensor with shape ( num_layers x batch_size x hidden_size ) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len - dropout_prob(float): dropout prob, dropout ONLY work between rnn layers, NOT between time steps - There is NO dropout work on rnn output of the last RNN layers - input_size (int): hidden size of the input tensor hidden_size (int): hidden size of the LSTM num_layers (int): total layers number of the LSTM + dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps + There is NO dropout work on rnn output of the last RNN layers is_bidirec (bool): If it is bidirectional - dtype (str): Data type. Choices = ["float32", "float64"], default "float32". is_test (bool): If it is in test phrase name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -577,6 +573,9 @@ def lstm(input, helper = LayerHelper('cudnn_lstm', **locals()) + dtype = input.dtype + input_shape = list(input.shape) + input_size = input_shape[-1] weight_size = 0 for i in range(num_layers): if i == 0: diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 271b9c740f..0200d74136 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -216,6 +216,15 @@ class OpTest(unittest.TestCase): self.dtype) outputs = append_input_output(block, op_proto, self.outputs, False, self.dtype) + + if hasattr(self, "cache_name_list"): + for name in self.cache_name_list: + inputs[name] = block.create_var( + name=name, + persistable=True, + type=core.VarDesc.VarType.RAW, + stop_gradient=True) + op = block.append_op( type=self.op_type, inputs=inputs, diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py index 2741bf167b..8d313970cc 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -19,6 +19,11 @@ import numpy as np import paddle.fluid.core as core from op_test import OpTest +import paddle.fluid as fluid + +SIGMOID_THRESHOLD_MIN = -40.0 +SIGMOID_THRESHOLD_MAX = 13.0 +EXP_MAX_INPUT = 40.0 def lstm_naive( @@ -70,10 +75,15 @@ def lstm_naive( bo_2 = w[offset:offset + hidden_size] def sigmoid(x): - return 1.0 / (1.0 + np.exp(-x)) + y = np.copy(x) + y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN + y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX + return 1. / (1. + np.exp(-y)) def tanh(x): - return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x)) + y = -2. * x + y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT + return (2. / (1. + np.exp(y))) - 1. output = [] pre_h = np.zeros((batch_size, hidden_size), dtype=input.dtype) @@ -103,7 +113,7 @@ def lstm_naive( output = output.transpose((1, 0, 2)) - return output + return output, pre_h, pre_c class TestCUDNNLstmOp(OpTest): @@ -120,20 +130,32 @@ class TestCUDNNLstmOp(OpTest): weight_size = input_weight_size + hidden_weight_size weight_size += hidden_size * 8 - input = np.random.random( - (num_steps, batch_size, hidden_size)).astype(self.dtype) - flat_w = np.random.random((weight_size)).astype(self.dtype) + input = np.random.uniform( + low=-0.1, high=0.1, size=(num_steps, batch_size, + hidden_size)).astype(self.dtype) + flat_w = np.random.uniform( + low=-0.1, high=0.1, size=(weight_size)).astype(self.dtype) - output = lstm_naive(input, flat_w) + output, last_hidden, last_cell = lstm_naive(input, flat_w) init_h = np.zeros((batch_size, hidden_size), dtype=np.float32) init_c = np.zeros((batch_size, hidden_size), dtype=np.float32) + scope = core.Scope() + program = fluid.Program() + block = program.global_block() + + cache_temp = block.create_var( + name="Cache", + persistable=True, + type=core.VarDesc.VarType.RAW, + stop_gradient=True) self.inputs = { 'Input': OpTest.np_dtype_to_fluid_dtype(input), 'W': OpTest.np_dtype_to_fluid_dtype(flat_w), 'InitH': OpTest.np_dtype_to_fluid_dtype(init_h), 'InitC': OpTest.np_dtype_to_fluid_dtype(init_c), } + self.cache_name_list = ['Cache'] self.attrs = { 'max_len': num_steps, 'dropout_prob': 0.0, @@ -142,13 +164,16 @@ class TestCUDNNLstmOp(OpTest): 'hidden_size': hidden_size, 'num_layers': 1, } - self.outputs = {'Out': output} - - def test_grad_with_place(self): - place = core.CUDAPlace(0) - self.check_grad_with_place(place, atol=1e-5) + self.outputs = { + 'Out': output, + "last_h": last_hidden, + 'last_c': last_cell + } def test_output_with_place(self): place = core.CUDAPlace(0) - self.check_output_with_place( - place, atol=1e-5, no_check_set=['last_h', 'last_c']) + self.check_output_with_place(place, atol=1e-5) + + +if __name__ == '__main__': + unittest.main() From 25df78eaf31b4b4fe1e208f3c70a9319e9fd4830 Mon Sep 17 00:00:00 2001 From: phlrain Date: Sat, 1 Dec 2018 16:09:26 +0800 Subject: [PATCH 060/719] fix api spec; test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 10ac9fe070..8f6797429c 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -194,7 +194,7 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'dropout_prob', 'input_size', 'hidden_size', 'num_layers', 'is_bidirec', 'dtype', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(False, 'float32', False, None, None, False, 0)) +paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) From 52a0be7bb437e574d7fda8d322c816e91029e438 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sat, 1 Dec 2018 13:54:44 +0800 Subject: [PATCH 061/719] add mct into CMakeLists.txt --- CMakeLists.txt | 5 ++++- paddle/fluid/framework/async_executor.h | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5251fe286f..8c929396ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,7 +186,6 @@ endif() ######################################################################################## include(external/mklml) # download mklml package -include(external/pslib) # download mklml package include(external/xbyak) # download xbyak package include(external/libxsmm) # download, build, install libxsmm include(external/zlib) # download, build, install zlib @@ -217,6 +216,9 @@ include(external/warpctc) # download, build, install warpctc include(cupti) include(external/gzstream) endif (NOT WIN32) +include(external/libmct) +include(external/pslib_brpc) +include(external/pslib) if(WITH_DISTRIBUTE) if(WITH_GRPC) @@ -277,6 +279,7 @@ set(EXTERNAL_LIBS protobuf zlib ${PYTHON_LIBRARIES} + pslib ) if(WITH_AMD_GPU) diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index f4d2a79ac5..6aa59c89dc 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -40,6 +40,9 @@ class AsyncExecutor { const int thread_num, const std::vector& fetch_names, const bool debug = false); + void ConfigServer() {} + void ConfigWorker() {} + void StartServer() {} private: void CreateThreads(ExecutorThreadWorker* worker, From 0b8a377d9cbd02c8c0d3824ad9b37d9a52ba8822 Mon Sep 17 00:00:00 2001 From: phlrain Date: Sat, 1 Dec 2018 18:22:05 +0800 Subject: [PATCH 062/719] fix cpu test; test=develop --- python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py index 8d313970cc..e3de9f992b 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -171,9 +171,14 @@ class TestCUDNNLstmOp(OpTest): } def test_output_with_place(self): + if not self.testcuda(): + pass place = core.CUDAPlace(0) self.check_output_with_place(place, atol=1e-5) + def testcuda(self): + return core.is_compiled_with_cuda() + if __name__ == '__main__': unittest.main() From 24fa1f4b8c377dc446857a59d88bf72b1c3a5a67 Mon Sep 17 00:00:00 2001 From: phlrain Date: Sat, 1 Dec 2018 18:39:13 +0800 Subject: [PATCH 063/719] fix test uni; test=develop --- python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py index e3de9f992b..9c8e7beae2 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -171,10 +171,9 @@ class TestCUDNNLstmOp(OpTest): } def test_output_with_place(self): - if not self.testcuda(): - pass - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-5) + if self.testcuda(): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) def testcuda(self): return core.is_compiled_with_cuda() From 61ae88b7604a9ed4edb77700efe932f79c0f49ef Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Sat, 1 Dec 2018 19:24:36 +0800 Subject: [PATCH 064/719] Revert "Fix for accuracy problem for inplace operators when MKL-DNN mode is enabled" --- paddle/fluid/framework/operator.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 50a748ba02..8bfdf38912 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -754,7 +754,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); - if (run_by_executor_ && !transfered_inplace_vars.empty()) { + if (!transfered_inplace_vars.empty()) { // there is inplace variable has been transfered. TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); } @@ -776,7 +776,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } } - void OperatorWithKernel::TransferInplaceVarsBack( const Scope& scope, const std::vector& inplace_vars, const Scope& transfer_scope) const { From 1d63b06bd738e9b754a006eb23a1a2e064f9504e Mon Sep 17 00:00:00 2001 From: phlrain Date: Sat, 1 Dec 2018 22:36:37 +0800 Subject: [PATCH 065/719] add grad test unit; test=develop --- python/paddle/fluid/tests/unittests/op_test.py | 13 +++++++++++-- .../fluid/tests/unittests/test_lstm_cudnn_op.py | 15 ++++++++++++--- python/paddle/fluid/tests/unittests/testsuite.py | 7 ++++++- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 0200d74136..76a707efdc 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -437,8 +437,17 @@ class OpTest(unittest.TestCase): op_inputs = self.inputs if hasattr(self, "inputs") else dict() op_outputs = self.outputs if hasattr(self, "outputs") else dict() op_attrs = self.attrs if hasattr(self, "attrs") else dict() - self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs, - op_attrs) + + cache_list = None + if hasattr(self, "cache_name_list"): + cache_list = self.cache_name_list + self.op = create_op( + self.scope, + self.op_type, + op_inputs, + op_outputs, + op_attrs, + cache_list=cache_list) if no_grad_set is None: no_grad_set = set() diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py index 9c8e7beae2..0e9e2e8429 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -121,9 +121,9 @@ class TestCUDNNLstmOp(OpTest): self.op_type = "cudnn_lstm" self.dtype = np.float32 - num_steps = 50 - batch_size = 20 - hidden_size = 200 + num_steps = 20 + batch_size = 5 + hidden_size = 20 input_weight_size = (hidden_size * hidden_size) * 4 hidden_weight_size = (hidden_size * hidden_size) * 4 @@ -175,6 +175,15 @@ class TestCUDNNLstmOp(OpTest): place = core.CUDAPlace(0) self.check_output_with_place(place, atol=1e-5) + def test_grad_with_place(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + set(['Input', 'W', 'InitH', 'InitC']), + ['Out', 'last_h', 'last_c'], + max_relative_error=0.02) + def testcuda(self): return core.is_compiled_with_cuda() diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py index 34fbb1b549..dc3b2cb8bc 100644 --- a/python/paddle/fluid/tests/unittests/testsuite.py +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -20,7 +20,7 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator -def create_op(scope, op_type, inputs, outputs, attrs): +def create_op(scope, op_type, inputs, outputs, attrs, cache_list=None): kwargs = dict() op_maker = core.op_proto_and_checker_maker @@ -43,6 +43,11 @@ def create_op(scope, op_type, inputs, outputs, attrs): __create_var__(in_name, sub_in_name) else: __create_var__(in_name, in_name) + if cache_list != None and isinstance(cache_list, list): + for name in cache_list: + kwargs[name] = [] + scope.var(name) + kwargs[name].append(name) for out_name, out_dup in Operator.get_op_outputs(op_type): if out_name in outputs: From d89108766c315c2387e60db208ad842c76fa3313 Mon Sep 17 00:00:00 2001 From: barrierye Date: Sun, 2 Dec 2018 14:22:59 +0800 Subject: [PATCH 066/719] update CheckFile function in data_feed for ignore the space at the end of each line of data(for example, it may be added '\t' character to the end of the reduce task output when processes data by hadoop, which does not affect the correctness of the data). test=develop --- paddle/fluid/framework/data_feed.cc | 43 +++++++++++++++-------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 851c7eda89..5fb141f3c1 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -200,22 +200,22 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) { for (size_t i = 0; i < all_slots_.size(); ++i) { int num = strtol(endptr, &endptr, 10); if (num < 0) { - VLOG(1) << "error: the number of ids is a negative number: " << num; - VLOG(1) << "please check line<" << instance_cout << "> in file<" + VLOG(0) << "error: the number of ids is a negative number: " << num; + VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; return false; } else if (num == 0) { - VLOG(1) + VLOG(0) << "error: the number of ids can not be zero, you need " "padding it in data generator; or if there is something wrong" " with the data, please check if the data contains unresolvable " "characters."; - VLOG(1) << "please check line<" << instance_cout << "> in file<" + VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; return false; } else if (errno == ERANGE || num > INT_MAX) { - VLOG(1) << "error: the number of ids greater than INT_MAX"; - VLOG(1) << "please check line<" << instance_cout << "> in file<" + VLOG(0) << "error: the number of ids greater than INT_MAX"; + VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; return false; } @@ -223,15 +223,15 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) { for (int i = 0; i < num; ++i) { strtof(endptr, &endptr); if (errno == ERANGE) { - VLOG(1) << "error: the value is out of the range of " + VLOG(0) << "error: the value is out of the range of " "representable values for float"; - VLOG(1) << "please check line<" << instance_cout << "> in file<" + VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; return false; } if (i + 1 != num && endptr - str == len) { - VLOG(1) << "error: there is a wrong with the number of ids."; - VLOG(1) << "please check line<" << instance_cout << "> in file<" + VLOG(0) << "error: there is a wrong with the number of ids."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; return false; } @@ -240,30 +240,33 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) { for (int i = 0; i < num; ++i) { strtoull(endptr, &endptr, 10); if (errno == ERANGE) { - VLOG(1) << "error: the value is out of the range of " + VLOG(0) << "error: the value is out of the range of " "representable values for uint64_t"; - VLOG(1) << "please check line<" << instance_cout << "> in file<" + VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; return false; } if (i + 1 != num && endptr - str == len) { - VLOG(1) << "error: there is a wrong with the number of ids."; - VLOG(1) << "please check line<" << instance_cout << "> in file<" + VLOG(0) << "error: there is a wrong with the number of ids."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; return false; } } } else { - VLOG(1) << "error: this type<" << all_slots_type_[i] + VLOG(0) << "error: this type<" << all_slots_type_[i] << "> is not supported"; return false; } } - if (endptr - str != len) { - VLOG(1) << "error: there is some data at the end of the line."; - VLOG(1) << "please check line<" << instance_cout << "> in file<" - << filename << ">"; - return false; + while (endptr - str != len) { + if (!isspace(*(endptr++))) { + VLOG(0) + << "error: there is some extra characters at the end of the line."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } } } VLOG(3) << "instances cout: " << instance_cout; From d62a3dd72d847d7d37031ae6560caca6a020c15c Mon Sep 17 00:00:00 2001 From: barrierye Date: Sun, 2 Dec 2018 21:28:33 +0800 Subject: [PATCH 067/719] add the comment for CheckFile function. test=develop --- paddle/fluid/framework/data_feed.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 5fb141f3c1..ae52b5dfca 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -259,6 +259,13 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) { return false; } } + // It may be added '\t' character to the end of the output of reduce + // task when processes data by Hadoop(when the output of the reduce + // task of Hadoop has only one field, it will add a '\t' at the end + // of the line by default), which does not affect the correctness of + // the data. Therefore, it should be judged that the data is not + // normal when the end of each line of data contains characters + // which are not spaces. while (endptr - str != len) { if (!isspace(*(endptr++))) { VLOG(0) From 08233beed7804f4a5e6ef17d84f919439f95a933 Mon Sep 17 00:00:00 2001 From: barrierye Date: Sun, 2 Dec 2018 21:38:22 +0800 Subject: [PATCH 068/719] add the comment for CheckFile function. test=develop --- paddle/fluid/framework/data_feed.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index ae52b5dfca..291d8ffc3c 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -262,10 +262,11 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) { // It may be added '\t' character to the end of the output of reduce // task when processes data by Hadoop(when the output of the reduce // task of Hadoop has only one field, it will add a '\t' at the end - // of the line by default), which does not affect the correctness of - // the data. Therefore, it should be judged that the data is not - // normal when the end of each line of data contains characters - // which are not spaces. + // of the line by default, and you can use this option to avoid it: + // `-D mapred.textoutputformat.ignoreseparator=true`), which does + // not affect the correctness of the data. Therefore, it should be + // judged that the data is not normal when the end of each line of + // data contains characters which are not spaces. while (endptr - str != len) { if (!isspace(*(endptr++))) { VLOG(0) From dac92e560cadfd641057662b3adc4a1c6c6f6735 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 13 Nov 2018 10:02:12 +0800 Subject: [PATCH 069/719] initial commit --- paddle/fluid/CMakeLists.txt | 1 + paddle/fluid/imperative/CMakeLists.txt | 1 + paddle/fluid/imperative/layer.cc | 19 ++++++++++ paddle/fluid/imperative/layer.h | 36 ++++++++++++++++++ paddle/fluid/operators/activation_op.cc | 2 +- paddle/fluid/pybind/CMakeLists.txt | 3 +- paddle/fluid/pybind/imperative.cc | 19 ++++++++++ paddle/fluid/pybind/imperative.h | 37 +++++++++++++++++++ paddle/fluid/pybind/pybind.cc | 6 +++ python/paddle/fluid/__init__.py | 2 + python/paddle/fluid/imperative/__init__.py | 21 +++++++++++ python/paddle/fluid/imperative/layers.py | 25 +++++++++++++ .../fluid/tests/unittests/test_imperative.py | 29 +++++++++++++++ 13 files changed, 199 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/imperative/CMakeLists.txt create mode 100644 paddle/fluid/imperative/layer.cc create mode 100644 paddle/fluid/imperative/layer.h create mode 100644 paddle/fluid/pybind/imperative.cc create mode 100644 paddle/fluid/pybind/imperative.h create mode 100644 python/paddle/fluid/imperative/__init__.py create mode 100644 python/paddle/fluid/imperative/layers.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative.py diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 6b526f0103..595454e90b 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) +add_subdirectory(imperative) add_subdirectory(operators) add_subdirectory(string) add_subdirectory(recordio) diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt new file mode 100644 index 0000000000..12dfd6f1e7 --- /dev/null +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(layer SRCS layer.cc) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc new file mode 100644 index 0000000000..e0b6ad91e7 --- /dev/null +++ b/paddle/fluid/imperative/layer.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/layer.h" + +namespace paddle { +namespace imperative {} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h new file mode 100644 index 0000000000..cf6aec60d1 --- /dev/null +++ b/paddle/fluid/imperative/layer.h @@ -0,0 +1,36 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace imperative { + +class TensorFuture { + public: +}; + +class Layer { + public: + virtual ~Layer() {} + + virtual void Forward() { LOG(ERROR) << "at cpp."; } + + virtual void Backward() {} +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 832245371e..cc5f90cd11 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -66,7 +66,7 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, const framework::OperatorWithKernel& oper, const std::string& name) { framework::LibraryType library{framework::LibraryType::kPlain}; - framework::DataLayout layout = framework::DataLayout::kAnyLayout; + framework::DataLayout layout = framework::ddle/fluid/pybind/CMakeLists.txtDataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN auto it = oper.Attrs().find("use_mkldnn"); if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() && diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index d602613fc8..7079f9fe04 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler) -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc) + if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc new file mode 100644 index 0000000000..af010f09dd --- /dev/null +++ b/paddle/fluid/pybind/imperative.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/pybind/imperative.h" + +namespace paddle { +namespace pybind {} // namespace pybind11 +} // namespace paddle diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h new file mode 100644 index 0000000000..d1b9024990 --- /dev/null +++ b/paddle/fluid/pybind/imperative.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include "paddle/fluid/imperative/layer.h" +#include "pybind11/pybind11.h" + +namespace paddle { +namespace pybind { + +class PyLayer : public paddle::imperative::Layer { + public: + using paddle::imperative::Layer::Layer; // Inherit constructors + + void Forward() override { + PYBIND11_OVERLOAD(void, Layer, Forward, ); // NOLINT + } + + void Backward() override { + PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT + } +}; + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index fc7991d297..4d8c0fdb13 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -45,6 +45,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/async_executor_py.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/recordio.h" @@ -100,6 +101,11 @@ PYBIND11_MODULE(core, m) { BindException(&m); + py::class_ layer(m, "Layer"); + layer.def(py::init<>()) + .def("forward", &imperative::Layer::Forward) + .def("backward", &imperative::Layer::Backward); + py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index a1ffbf4262..78b793c7c3 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -34,6 +34,7 @@ from . import io from . import evaluator from . import initializer from . import layers +from . import imperative from . import contrib from . import nets from . import optimizer @@ -67,6 +68,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'initializer', 'layers', 'contrib', + 'imperative', 'transpiler', 'nets', 'optimizer', diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py new file mode 100644 index 0000000000..8ce1dd7aa3 --- /dev/null +++ b/python/paddle/fluid/imperative/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import layers +from .layers import * + +__all__ = [] +__all__ += layers.__all__ diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py new file mode 100644 index 0000000000..ae96a0a6b2 --- /dev/null +++ b/python/paddle/fluid/imperative/layers.py @@ -0,0 +1,25 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid import core + +__all__ = ['PyLayer'] + + +class PyLayer(core.Layer): + def __init__(self): + pass + + def forward(self): + print("at python.") diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py new file mode 100644 index 0000000000..36432d8368 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -0,0 +1,29 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle.fluid as fluid +from paddle.fluid import core + + +class TestImperative(unittest.TestCase): + def test_layer(self): + cl = core.Layer() + cl.forward() + l = fluid.imperative.PyLayer() + l.forward() + + +if __name__ == '__main__': + unittest.main() From a6d23083f0fab8b36bc55b2081dce51c0bf1e9d7 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 13 Nov 2018 21:14:05 +0800 Subject: [PATCH 070/719] some tracing test=develop --- paddle/fluid/imperative/CMakeLists.txt | 2 + paddle/fluid/imperative/engine.cc | 53 +++++++++++++++++++ paddle/fluid/imperative/engine.h | 39 ++++++++++++++ paddle/fluid/imperative/layer.h | 4 +- paddle/fluid/imperative/tracer.cc | 19 +++++++ paddle/fluid/imperative/tracer.h | 37 +++++++++++++ paddle/fluid/pybind/imperative.cc | 12 ++++- paddle/fluid/pybind/imperative.h | 6 ++- paddle/fluid/pybind/pybind.cc | 1 + python/paddle/fluid/framework.py | 25 +++++++++ python/paddle/fluid/imperative/__init__.py | 4 ++ python/paddle/fluid/imperative/base.py | 33 ++++++++++++ .../fluid/tests/unittests/test_imperative.py | 9 ++++ 13 files changed, 239 insertions(+), 5 deletions(-) create mode 100644 paddle/fluid/imperative/engine.cc create mode 100644 paddle/fluid/imperative/engine.h create mode 100644 paddle/fluid/imperative/tracer.cc create mode 100644 paddle/fluid/imperative/tracer.h create mode 100644 python/paddle/fluid/imperative/base.py diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 12dfd6f1e7..dff80dff0b 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1 +1,3 @@ cc_library(layer SRCS layer.cc) +cc_library(tracer SRCS tracer.cc DEPS proto_desc) +cc_library(engine SRCS engine.cc) diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc new file mode 100644 index 0000000000..de7ab0e591 --- /dev/null +++ b/paddle/fluid/imperative/engine.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/engine.h" + +#include // NOLINT +#include + +#include "glog/logging.h" + +namespace paddle { +namespace imperative { + +static std::once_flag init_engine; +static Engine* engine; + +class DummyEngine : public Engine { + public: + void Enqueue(Runnable* runnable) override { + queued_runnables_.push_back(runnable); + } + + size_t Size() const override { return queued_runnables_.size(); } + + void Sync() override { + for (Runnable* l : queued_runnables_) { + LOG(INFO) << "running " << reinterpret_cast(l); + } + queued_runnables_.clear(); + } + + private: + std::vector queued_runnables_; +}; + +Engine* GetEngine() { + std::call_once(init_engine, []() { engine = new DummyEngine(); }); + return engine; +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h new file mode 100644 index 0000000000..a1dfa5bda3 --- /dev/null +++ b/paddle/fluid/imperative/engine.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace paddle { +namespace imperative { + +struct Runnable {}; + +class Engine { + public: + virtual ~Engine() {} + + virtual void Enqueue(Runnable* runnable) = 0; + + virtual size_t Size() const = 0; + + virtual void Sync() = 0; +}; + +Engine* GetEngine(); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index cf6aec60d1..42cc65ddc9 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -27,9 +27,9 @@ class Layer { public: virtual ~Layer() {} - virtual void Forward() { LOG(ERROR) << "at cpp."; } + virtual void Forward() { LOG(ERROR) << "forward at cpp."; } - virtual void Backward() {} + virtual void Backward() { LOG(ERROR) << "backward at cpp."; } }; } // namespace imperative diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc new file mode 100644 index 0000000000..f64f9e72c4 --- /dev/null +++ b/paddle/fluid/imperative/tracer.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/tracer.h" + +namespace paddle { +namespace imperative {} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h new file mode 100644 index 0000000000..91e34a4783 --- /dev/null +++ b/paddle/fluid/imperative/tracer.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/imperative/engine.h" + +namespace paddle { +namespace imperative { + +class Tracer { + public: + Tracer() {} + + void Trace(framework::OpDesc* op_desc) { + LOG(ERROR) << "tracing " << op_desc->Type(); + } + + private: + std::vector runnables_; +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index af010f09dd..cd97cd6352 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -13,7 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/imperative/tracer.h" namespace paddle { -namespace pybind {} // namespace pybind11 +namespace pybind { + +// Bind Methods +void BindTracer(pybind11::module *m) { + pybind11::class_(*m, "Tracer", "") + .def(pybind11::init<>()) + .def("trace", &imperative::Tracer::Trace); +} + +} // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index d1b9024990..bfab6bd9b9 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -20,9 +20,9 @@ limitations under the License. */ namespace paddle { namespace pybind { -class PyLayer : public paddle::imperative::Layer { +class PyLayer : public imperative::Layer { public: - using paddle::imperative::Layer::Layer; // Inherit constructors + using imperative::Layer::Layer; // Inherit constructors void Forward() override { PYBIND11_OVERLOAD(void, Layer, Forward, ); // NOLINT @@ -33,5 +33,7 @@ class PyLayer : public paddle::imperative::Layer { } }; +void BindTracer(pybind11::module *m); + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 4d8c0fdb13..fa3e383536 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -105,6 +105,7 @@ PYBIND11_MODULE(core, m) { layer.def(py::init<>()) .def("forward", &imperative::Layer::Forward) .def("backward", &imperative::Layer::Backward); + BindTracer(&m); py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer( diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b991187d42..525c36702b 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -49,6 +49,16 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() +_imperative_tracer_ = None + + +def _in_imperative_mode(): + return _imperative_tracer_ is not None + + +def _imperative_tracer(): + return _imperative_tracer_ + class NameScope(object): def __init__(self, name="", parent=None): @@ -1203,6 +1213,12 @@ class Block(object): Returns: Operator: the append Operator. """ + if _in_imperative_mode(): + op_desc = core.OpDesc() + op = Operator(block=self, desc=op_desc, *args, **kwargs) + _imperative_tracer().trace(op.desc) + return + op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) self.ops.append(op) @@ -2208,3 +2224,12 @@ def _get_var(name, program=None): assert isinstance(program, Program) return program.global_block().var(name) + + +@contextlib.contextmanager +def _imperative_guard(): + global _imperative_tracer_ + tmp_trace = _imperative_tracer_ + _imperative_tracer_ = core.Tracer() + yield + _imperative_tracer_ = tmp_trace diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py index 8ce1dd7aa3..922308b6b1 100644 --- a/python/paddle/fluid/imperative/__init__.py +++ b/python/paddle/fluid/imperative/__init__.py @@ -14,8 +14,12 @@ from __future__ import print_function +from . import base +from .base import * + from . import layers from .layers import * __all__ = [] __all__ += layers.__all__ +__all__ += base.__all__ diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py new file mode 100644 index 0000000000..900a65a3aa --- /dev/null +++ b/python/paddle/fluid/imperative/base.py @@ -0,0 +1,33 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import contextlib +from paddle.fluid import core +from paddle.fluid import framework + +__all__ = ['enabled', 'guard'] + + +def enabled(): + return framework._in_imperative_mode() + + +@contextlib.contextmanager +def guard(): + train = framework.Program() + startup = framework.Program() + with framework.program_guard(train, startup): + with framework.unique_name.guard(): + with framework._imperative_guard(): + yield + # TODO: check train, startup not changed. diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 36432d8368..cdd90accc1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -13,6 +13,7 @@ # limitations under the License. import unittest +import sys import paddle.fluid as fluid from paddle.fluid import core @@ -24,6 +25,14 @@ class TestImperative(unittest.TestCase): l = fluid.imperative.PyLayer() l.forward() + def test_imperative_trace(self): + with fluid.imperative.guard(): + self.assertTrue(fluid.imperative.enabled()) + x = fluid.layers.data(name='x', shape=[3, 4], dtype='float32') + x = fluid.layers.relu(x) + x = fluid.layers.elementwise_mul(x, x) + self.assertIsNotNone(x) + if __name__ == '__main__': unittest.main() From b1f6fda5e549d751c55ab8f26b932fd9201417a8 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 19 Nov 2018 15:07:06 +0800 Subject: [PATCH 071/719] run forward --- paddle/fluid/framework/feed_fetch_method.cc | 17 +++++++ paddle/fluid/framework/feed_fetch_method.h | 2 + paddle/fluid/framework/framework.proto | 2 +- paddle/fluid/imperative/layer.h | 15 ++++++- paddle/fluid/pybind/imperative.h | 10 +++-- paddle/fluid/pybind/pybind.cc | 17 ++++++- python/paddle/fluid/framework.py | 16 +++++-- python/paddle/fluid/imperative/layers.py | 33 +++++++++++++- python/paddle/fluid/layer_helper.py | 44 ++++++++++++++----- python/paddle/fluid/layers/nn.py | 3 +- .../fluid/tests/unittests/test_imperative.py | 30 ++++++++++--- 11 files changed, 159 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 3e9353f5cf..b13d0d3807 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -16,7 +16,9 @@ limitations under the License. */ #include #include #include "glog/logging.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { @@ -53,5 +55,20 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, return tensor; } +LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) { + Variable* var = scope.FindVar(var_name); + PADDLE_ENFORCE(var, "%s no in scope", var_name); + // TODO(panyx0718): hack, remove it once we run oprerator. + LoDTensor* tensor = var->GetMutable(); + int numel = 10; + float* data = + tensor->mutable_data(framework::make_ddim({numel}), + platform::CPUPlace(), sizeof(float) * numel); + for (size_t i = 0; i < numel; ++i) data[i] = 1; + + PADDLE_ENFORCE(var->IsType(), "Variable is not LoDTensor"); + return *var->GetMutable(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index 7f504bfd23..031f8e01aa 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, size_t index); +LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index efdabffb9b..6c60a041a1 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ syntax = "proto2"; -option optimize_for = LITE_RUNTIME; +// option optimize_for = LITE_RUNTIME; package paddle.framework.proto; // Any incompatible changes to ProgramDesc and its dependencies should diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 42cc65ddc9..a83535af9c 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -14,20 +14,31 @@ #pragma once +#include +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace imperative { -class TensorFuture { +class VariableBase { public: + VariableBase() {} + virtual ~VariableBase() {} + + framework::VarDesc* var_desc_; }; class Layer { public: virtual ~Layer() {} - virtual void Forward() { LOG(ERROR) << "forward at cpp."; } + virtual std::vector Forward( + const std::vector& inputs) { + std::vector vars; + return vars; + } virtual void Backward() { LOG(ERROR) << "backward at cpp."; } }; diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index bfab6bd9b9..9a558fbdb8 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -14,8 +14,10 @@ limitations under the License. */ #pragma once #include +#include #include "paddle/fluid/imperative/layer.h" #include "pybind11/pybind11.h" +#include "pybind11/stl.h" namespace paddle { namespace pybind { @@ -24,8 +26,10 @@ class PyLayer : public imperative::Layer { public: using imperative::Layer::Layer; // Inherit constructors - void Forward() override { - PYBIND11_OVERLOAD(void, Layer, Forward, ); // NOLINT + std::vector Forward( + const std::vector& inputs) override { + PYBIND11_OVERLOAD(std::vector, Layer, Forward, + inputs); // NOLINT } void Backward() override { @@ -33,7 +37,7 @@ class PyLayer : public imperative::Layer { } }; -void BindTracer(pybind11::module *m); +void BindTracer(pybind11::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index fa3e383536..3cf1ec34a7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -101,9 +101,23 @@ PYBIND11_MODULE(core, m) { BindException(&m); + py::class_(m, "VariableBase", + R"DOC()DOC") + .def_property( + "desc", + [](const imperative::VariableBase &self) { return self.var_desc_; }, + [](imperative::VariableBase &self, framework::VarDesc *var_desc) { + self.var_desc_ = var_desc; + }, + py::return_value_policy::reference); + py::class_ layer(m, "Layer"); layer.def(py::init<>()) - .def("forward", &imperative::Layer::Forward) + .def("forward", + [](imperative::Layer &self, + const std::vector &inputs) { + return self.Forward(inputs); + }) .def("backward", &imperative::Layer::Backward); BindTracer(&m); @@ -608,6 +622,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); + m.def("get_variable_tensor", framework::GetVariableTensor); m.def("_is_program_version_supported", IsProgramVersionSupported); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 525c36702b..235d692afe 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -18,6 +18,7 @@ import collections import contextlib import re import six +import sys import numpy as np @@ -211,7 +212,7 @@ def _debug_string_(proto, throw_on_error=True): return proto.__str__() -class Variable(object): +class Variable(core.VariableBase): """ In Fluid, every input and output of an operator is a variable. In most cases, variables are used for holding different kinds of data or training @@ -282,15 +283,20 @@ class Variable(object): name = unique_name.generate('_generated_var') is_new_var = False name = cpt.to_text(name) - self.desc = self.block.desc.find_var(cpt.to_bytes(name)) + desc = self.block.desc.find_var(cpt.to_bytes(name)) - if self.desc is None: + if desc is None: + # sys.stderr.write('desc is None\n') self.desc = self.block.desc.var(cpt.to_bytes(name)) is_new_var = True + else: + # sys.stderr.write('found var %s %s' % (name, self.desc)) + self.desc = desc if is_new_var: self.desc.set_type(type) elif self.desc.type() != type: + # sys.stderr.write('%s vs %s\n' % (self.desc.type(), type)) raise ValueError("Variable {0} has been created before. The " "previous type is {1}; the new type is {2}. They" " are not matched".format(self.name, @@ -355,6 +361,10 @@ class Variable(object): self.stop_gradient = stop_gradient self.is_data = is_data + def numpy(self, scope): + tensor = core.get_variable_tensor(scope, self.desc.name()) + return np.array(tensor) + def __str__(self): return self.to_string(True) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index ae96a0a6b2..37b36f2cd0 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -12,14 +12,43 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys +import numpy as np + from paddle.fluid import core +from paddle.fluid import framework __all__ = ['PyLayer'] class PyLayer(core.Layer): def __init__(self): - pass + self._scope = core.Scope() + + def __call__(self, inputs): + if not isinstance(inputs, list) and not isinstance(inputs, tuple): + inputs = [inputs] + + var_inputs = [] + for x in inputs: + if isinstance(x, np.ndarray): + tensor = core.LoDTensor() + tensor.set(x, core.CPUPlace()) + x = framework.Variable( + framework.default_main_program().current_block(), + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=x.shape, + dtype=x.dtype) + elif not isinstance(x, framework.Variable): + raise ValueError("not var or ndarray %s" % type(x)) + self._scope.var(x.name) + var_inputs.append(x) + outputs = self.forward(var_inputs) + for out in outputs: + self._scope.var(out.name) + return outputs - def forward(self): + def forward(self, inputs): print("at python.") + return [] diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index dc317de9ab..ceabb52215 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -17,6 +17,8 @@ from __future__ import print_function import copy import itertools import six +import sys +import numpy as np from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from . import unique_name @@ -46,23 +48,43 @@ class LayerHelper(object): def startup_program(self): return default_startup_program() + def _np_to_variable(self, x): + tensor = core.LoDTensor() + sys.stderr.write('%s %s\n' % (tensor, x)) + tensor.set(x, core.CPUPlace()) + return Variable( + self.main_program.current_block(), + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=x.shape, + dtype=x.dtype) + + def to_variable(self, x): + if isinstance(x, Variable): + return x + elif isinstance(x, np.ndarray): + return self._np_to_variable(x) + else: + raise ValueError("inputs wrong type %s\n" % x) + + def to_variables(self, inputs): + if isinstance(inputs, list) or isinstance(inputs, tuple): + return [self._to_variable(x) for x in inputs] + else: + return [self._to_variable(inputs)] + def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) def multiple_input(self, input_param_name='input'): inputs = self.kwargs.get(input_param_name, []) - type_error = TypeError( - "Input of {0} layer should be Variable or sequence of Variable". - format(self.layer_type)) - if isinstance(inputs, Variable): - inputs = [inputs] - elif not isinstance(inputs, list) and not isinstance(inputs, tuple): - raise type_error + ret = [] + if isinstance(inputs, list) or isinstance(inputs, tuple): + for inp in inputs: + ret.append(self.to_variable(inp)) else: - for each in inputs: - if not isinstance(each, Variable): - raise type_error - return inputs + ret.append(self.to_variable(inputs)) + return ret def input(self, input_param_name='input'): inputs = self.multiple_input(input_param_name) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3c2975729c..35232bd489 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6455,7 +6455,8 @@ def relu(x, name=None): helper = LayerHelper('relu', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out}) + helper.append_op( + type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out}) return out diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index cdd90accc1..a10b5b34aa 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -14,24 +14,42 @@ import unittest import sys +import numpy as np + import paddle.fluid as fluid from paddle.fluid import core +class MyLayer(fluid.imperative.PyLayer): + def __init__(self): + super(MyLayer, self).__init__() + + def forward(self, inputs): + x = fluid.layers.relu(inputs[0]) + return [fluid.layers.elementwise_mul(x, x)] + + class TestImperative(unittest.TestCase): def test_layer(self): cl = core.Layer() - cl.forward() + cl.forward([]) l = fluid.imperative.PyLayer() - l.forward() + l.forward([]) def test_imperative_trace(self): with fluid.imperative.guard(): self.assertTrue(fluid.imperative.enabled()) - x = fluid.layers.data(name='x', shape=[3, 4], dtype='float32') - x = fluid.layers.relu(x) - x = fluid.layers.elementwise_mul(x, x) - self.assertIsNotNone(x) + x = fluid.layers.data(name='abc', shape=[3, 4], dtype='float32') + for _ in xrange(2): + x = fluid.layers.relu(x) + x = fluid.layers.elementwise_mul(x, x) + self.assertIsNotNone(x) + + def test_layer_in_out(self): + l = MyLayer() + x = l(np.ones([1], np.float32))[0] + self.assertIsNotNone(x) + sys.stderr.write("%s output: %s\n" % (x, x.numpy(scope=l._scope))) if __name__ == '__main__': From aeb74af54c2428cdb4d6c541b5996a1a4ddd98a4 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 26 Nov 2018 15:25:45 +0800 Subject: [PATCH 072/719] allow operator to run imperatively --- paddle/fluid/framework/feed_fetch_method.cc | 10 +--- paddle/fluid/framework/ir/graph.cc | 5 +- paddle/fluid/imperative/tracer.h | 43 +++++++++++++- paddle/fluid/pybind/imperative.cc | 16 ++++- paddle/fluid/pybind/pybind.cc | 1 + python/paddle/fluid/imperative/layers.py | 58 ++++++++++++------- python/paddle/fluid/layers/nn.py | 1 + .../fluid/tests/unittests/test_imperative.py | 25 +++----- 8 files changed, 107 insertions(+), 52 deletions(-) diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index b13d0d3807..6338be75a4 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -58,15 +58,7 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) { Variable* var = scope.FindVar(var_name); PADDLE_ENFORCE(var, "%s no in scope", var_name); - // TODO(panyx0718): hack, remove it once we run oprerator. - LoDTensor* tensor = var->GetMutable(); - int numel = 10; - float* data = - tensor->mutable_data(framework::make_ddim({numel}), - platform::CPUPlace(), sizeof(float) * numel); - for (size_t i = 0; i < numel; ++i) data[i] = 1; - - PADDLE_ENFORCE(var->IsType(), "Variable is not LoDTensor"); + PADDLE_ENFORCE(var->IsType(), "Only support lod tensor now."); return *var->GetMutable(); } diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index fc91564bba..8679118fe2 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -38,9 +38,8 @@ void CheckProgram(const ProgramDesc &program) { switch (role_id) { case _INT(OpRole::kForward): if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) - << "Cannot add backward operator before forward operator %s." - << op->Type(); + LOG(ERROR) << "Cannot add backward operator before forward operator " + << op->Type(); } break; case _INT(OpRole::kBackward): diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 91e34a4783..8a7a2d700f 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -14,8 +14,12 @@ #pragma once +#include #include + #include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/engine.h" namespace paddle { @@ -26,10 +30,47 @@ class Tracer { Tracer() {} void Trace(framework::OpDesc* op_desc) { - LOG(ERROR) << "tracing " << op_desc->Type(); + LOG(ERROR) << "tracer tracing " << op_desc->Type(); + op_desc->InferShape(*block_); + op_desc->InferVarType(block_); + std::unique_ptr op = + framework::OpRegistry::CreateOp(*op_desc); + for (const std::string& vname : op_desc->InputArgumentNames()) { + framework::Variable* var = scope_->Var(vname); + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block_->FindVar(vname); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + } + for (const std::string& vname : op_desc->OutputArgumentNames()) { + framework::Variable* var = scope_->Var(vname); + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block_->FindVar(vname); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + } + op->Run(*scope_, platform::CPUPlace()); } + void SetScope(framework::Scope* scope) { scope_ = scope; } + + void SetBlock(framework::BlockDesc* block) { block_ = block; } + + framework::Scope* Scope() const { return scope_; } + + framework::BlockDesc* Block() const { return block_; } + private: + framework::BlockDesc* block_; + framework::Scope* scope_; std::vector runnables_; }; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index cd97cd6352..0e0f5a69a7 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/tracer.h" namespace paddle { @@ -22,7 +24,19 @@ namespace pybind { void BindTracer(pybind11::module *m) { pybind11::class_(*m, "Tracer", "") .def(pybind11::init<>()) - .def("trace", &imperative::Tracer::Trace); + .def("trace", &imperative::Tracer::Trace) + .def_property("scope", + [](const imperative::Tracer &self) { return self.Scope(); }, + [](imperative::Tracer &self, framework::Scope *scope) { + self.SetScope(scope); + }, + R"DOC()DOC") + .def_property("block", + [](const imperative::Tracer &self) { return self.Block(); }, + [](imperative::Tracer &self, framework::BlockDesc *block) { + self.SetBlock(block); + }, + R"DOC()DOC"); } } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3cf1ec34a7..03fe1402c3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -159,6 +159,7 @@ PYBIND11_MODULE(core, m) { self.mutable_data(place); }) .def("set", PyCPUTensorSetFromArray) + .def("set_float", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 37b36f2cd0..32928347af 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import contextlib import sys import numpy as np @@ -21,33 +22,46 @@ from paddle.fluid import framework __all__ = ['PyLayer'] +@contextlib.contextmanager +def trace_scope(scope, block): + tmp_scope = framework._imperative_tracer().scope + tmp_block = framework._imperative_tracer().block + framework._imperative_tracer().scope = scope + framework._imperative_tracer().block = block + yield + framework._imperative_tracer().scope = tmp_scope + framework._imperative_tracer().block = tmp_block + + class PyLayer(core.Layer): def __init__(self): self._scope = core.Scope() + self._block = framework.default_main_program().current_block() def __call__(self, inputs): - if not isinstance(inputs, list) and not isinstance(inputs, tuple): - inputs = [inputs] - - var_inputs = [] - for x in inputs: - if isinstance(x, np.ndarray): - tensor = core.LoDTensor() - tensor.set(x, core.CPUPlace()) - x = framework.Variable( - framework.default_main_program().current_block(), - type=core.VarDesc.VarType.LOD_TENSOR, - name=None, - shape=x.shape, - dtype=x.dtype) - elif not isinstance(x, framework.Variable): - raise ValueError("not var or ndarray %s" % type(x)) - self._scope.var(x.name) - var_inputs.append(x) - outputs = self.forward(var_inputs) - for out in outputs: - self._scope.var(out.name) - return outputs + with trace_scope(self._scope, self._block.desc): + if not isinstance(inputs, list) and not isinstance(inputs, tuple): + inputs = [inputs] + + var_inputs = [] + for x in inputs: + if isinstance(x, np.ndarray): + py_var = framework.Variable( + self._block, + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=x.shape, + dtype=x.dtype) + var = self._scope.var(py_var.name) + tensor = var.get_tensor() + tensor.set_float(x, core.CPUPlace()) + var_inputs.append(py_var) + elif isinstance(x, framework.Variable): + var_inputs.append(x) + else: + raise ValueError("not var or ndarray %s" % type(x)) + outputs = self.forward(var_inputs) + return outputs def forward(self, inputs): print("at python.") diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 35232bd489..69ac968966 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -17,6 +17,7 @@ All layers just related to the neural network. from __future__ import print_function +import sys import numpy as np import os from ..layer_helper import LayerHelper diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index a10b5b34aa..af6a2167ce 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -31,25 +31,18 @@ class MyLayer(fluid.imperative.PyLayer): class TestImperative(unittest.TestCase): def test_layer(self): - cl = core.Layer() - cl.forward([]) - l = fluid.imperative.PyLayer() - l.forward([]) - - def test_imperative_trace(self): with fluid.imperative.guard(): - self.assertTrue(fluid.imperative.enabled()) - x = fluid.layers.data(name='abc', shape=[3, 4], dtype='float32') - for _ in xrange(2): - x = fluid.layers.relu(x) - x = fluid.layers.elementwise_mul(x, x) - self.assertIsNotNone(x) + cl = core.Layer() + cl.forward([]) + l = fluid.imperative.PyLayer() + l.forward([]) def test_layer_in_out(self): - l = MyLayer() - x = l(np.ones([1], np.float32))[0] - self.assertIsNotNone(x) - sys.stderr.write("%s output: %s\n" % (x, x.numpy(scope=l._scope))) + with fluid.imperative.guard(): + l = MyLayer() + x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] + self.assertIsNotNone(x) + sys.stderr.write("%s output: %s\n" % (x, x.numpy(scope=l._scope))) if __name__ == '__main__': From 0318c95149ded321160a00f53cada4af0996505b Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 26 Nov 2018 16:21:43 +0800 Subject: [PATCH 073/719] rebase develop --- paddle/fluid/operators/activation_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index cc5f90cd11..832245371e 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -66,7 +66,7 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, const framework::OperatorWithKernel& oper, const std::string& name) { framework::LibraryType library{framework::LibraryType::kPlain}; - framework::DataLayout layout = framework::ddle/fluid/pybind/CMakeLists.txtDataLayout::kAnyLayout; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN auto it = oper.Attrs().find("use_mkldnn"); if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() && From f6f06924512553e4cc8c8eb299730565dc297006 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 26 Nov 2018 19:07:17 +0800 Subject: [PATCH 074/719] clean up test=develop --- paddle/fluid/pybind/pybind.cc | 1 - python/paddle/fluid/framework.py | 8 ++------ python/paddle/fluid/imperative/layers.py | 2 +- python/paddle/fluid/layer_helper.py | 7 ------- 4 files changed, 3 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 03fe1402c3..3cf1ec34a7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -159,7 +159,6 @@ PYBIND11_MODULE(core, m) { self.mutable_data(place); }) .def("set", PyCPUTensorSetFromArray) - .def("set_float", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 235d692afe..3d3263e7cd 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -283,15 +283,11 @@ class Variable(core.VariableBase): name = unique_name.generate('_generated_var') is_new_var = False name = cpt.to_text(name) - desc = self.block.desc.find_var(cpt.to_bytes(name)) + self.desc = self.block.desc.find_var(cpt.to_bytes(name)) - if desc is None: - # sys.stderr.write('desc is None\n') + if self.desc is None: self.desc = self.block.desc.var(cpt.to_bytes(name)) is_new_var = True - else: - # sys.stderr.write('found var %s %s' % (name, self.desc)) - self.desc = desc if is_new_var: self.desc.set_type(type) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 32928347af..15f6939846 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -54,7 +54,7 @@ class PyLayer(core.Layer): dtype=x.dtype) var = self._scope.var(py_var.name) tensor = var.get_tensor() - tensor.set_float(x, core.CPUPlace()) + tensor.set(x, core.CPUPlace()) var_inputs.append(py_var) elif isinstance(x, framework.Variable): var_inputs.append(x) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index ceabb52215..ca531f3be9 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -50,7 +50,6 @@ class LayerHelper(object): def _np_to_variable(self, x): tensor = core.LoDTensor() - sys.stderr.write('%s %s\n' % (tensor, x)) tensor.set(x, core.CPUPlace()) return Variable( self.main_program.current_block(), @@ -67,12 +66,6 @@ class LayerHelper(object): else: raise ValueError("inputs wrong type %s\n" % x) - def to_variables(self, inputs): - if isinstance(inputs, list) or isinstance(inputs, tuple): - return [self._to_variable(x) for x in inputs] - else: - return [self._to_variable(inputs)] - def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) From 8138391631aed37b2832b66ede3f70a9aff1ea0e Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 28 Nov 2018 16:13:13 +0800 Subject: [PATCH 075/719] add OpBase and unify with VarBase test=develop --- paddle/fluid/imperative/CMakeLists.txt | 2 +- paddle/fluid/imperative/layer.h | 20 ++++++--- paddle/fluid/imperative/tracer.h | 10 +++-- paddle/fluid/pybind/imperative.h | 6 +-- paddle/fluid/pybind/pybind.cc | 19 ++++++--- python/paddle/fluid/framework.py | 58 +++++++++++++------------- 6 files changed, 69 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index dff80dff0b..fb57eca658 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,3 +1,3 @@ -cc_library(layer SRCS layer.cc) +cc_library(layer SRCS layer.cc DEPS proto_desc) cc_library(tracer SRCS tracer.cc DEPS proto_desc) cc_library(engine SRCS engine.cc) diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index a83535af9c..cf352ebe53 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -15,6 +15,7 @@ #pragma once #include +#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" @@ -22,21 +23,28 @@ namespace paddle { namespace imperative { -class VariableBase { +class VarBase { public: - VariableBase() {} - virtual ~VariableBase() {} + VarBase() {} + virtual ~VarBase() {} framework::VarDesc* var_desc_; }; +class OpBase { + public: + OpBase() {} + virtual ~OpBase() {} + + framework::OpDesc* op_desc_; +}; + class Layer { public: virtual ~Layer() {} - virtual std::vector Forward( - const std::vector& inputs) { - std::vector vars; + virtual std::vector Forward(const std::vector& inputs) { + std::vector vars; return vars; } diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 8a7a2d700f..c5dea96863 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include @@ -21,6 +22,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/engine.h" +#include "paddle/fluid/imperative/layer.h" namespace paddle { namespace imperative { @@ -29,11 +31,13 @@ class Tracer { public: Tracer() {} - void Trace(framework::OpDesc* op_desc) { + void Trace(OpBase* op, const std::map& inputs, + const std::map& outputs) { + framework::OpDesc* op_desc = op->op_desc_; LOG(ERROR) << "tracer tracing " << op_desc->Type(); op_desc->InferShape(*block_); op_desc->InferVarType(block_); - std::unique_ptr op = + std::unique_ptr op_base = framework::OpRegistry::CreateOp(*op_desc); for (const std::string& vname : op_desc->InputArgumentNames()) { framework::Variable* var = scope_->Var(vname); @@ -57,7 +61,7 @@ class Tracer { } } } - op->Run(*scope_, platform::CPUPlace()); + op_base->Run(*scope_, platform::CPUPlace()); } void SetScope(framework::Scope* scope) { scope_ = scope; } diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index 9a558fbdb8..bf01ed3a25 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -26,9 +26,9 @@ class PyLayer : public imperative::Layer { public: using imperative::Layer::Layer; // Inherit constructors - std::vector Forward( - const std::vector& inputs) override { - PYBIND11_OVERLOAD(std::vector, Layer, Forward, + std::vector Forward( + const std::vector& inputs) override { + PYBIND11_OVERLOAD(std::vector, Layer, Forward, inputs); // NOLINT } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3cf1ec34a7..cd0e550b90 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -101,21 +101,30 @@ PYBIND11_MODULE(core, m) { BindException(&m); - py::class_(m, "VariableBase", - R"DOC()DOC") + py::class_(m, "VarBase", + R"DOC()DOC") .def_property( "desc", - [](const imperative::VariableBase &self) { return self.var_desc_; }, - [](imperative::VariableBase &self, framework::VarDesc *var_desc) { + [](const imperative::VarBase &self) { return self.var_desc_; }, + [](imperative::VarBase &self, framework::VarDesc *var_desc) { self.var_desc_ = var_desc; }, py::return_value_policy::reference); + py::class_(m, "OpBase", + R"DOC()DOC") + .def_property( + "desc", [](const imperative::OpBase &self) { return self.op_desc_; }, + [](imperative::OpBase &self, framework::OpDesc *op_desc) { + self.op_desc_ = op_desc; + }, + py::return_value_policy::reference); + py::class_ layer(m, "Layer"); layer.def(py::init<>()) .def("forward", [](imperative::Layer &self, - const std::vector &inputs) { + const std::vector &inputs) { return self.Forward(inputs); }) .def("backward", &imperative::Layer::Backward); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3d3263e7cd..51da8260e2 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -212,7 +212,7 @@ def _debug_string_(proto, throw_on_error=True): return proto.__str__() -class Variable(core.VariableBase): +class Variable(core.VarBase): """ In Fluid, every input and output of an operator is a variable. In most cases, variables are used for holding different kinds of data or training @@ -507,7 +507,7 @@ class OpProtoHolder(object): } -class Operator(object): +class Operator(core.OpBase): """ In Fluid, all the operation are represented by Operator, and Operator is regarded as a build in an instruction of a Block. Users can use the @@ -602,32 +602,33 @@ class Operator(object): return True return False - if inputs is not None: - for in_proto in proto.inputs: - found = find_name(inputs, in_proto.name) - assert found or in_proto.dispensable, "Input {} not found".format( - in_proto.name) - - if found: - in_args = inputs[in_proto.name] - if not isinstance(in_args, list): - in_args = [in_args] - if not in_proto.duplicable and len(in_args) > 1: - raise ValueError( - "Input %s expects only one input, but %d are given." - % (in_proto.name, len(in_args))) - in_arg_names = [] - for arg in in_args: - if isinstance(arg, six.string_types): - in_arg_names.append(arg) - elif isinstance(arg, six.binary_type): - in_arg_names.append(arg.decode()) - else: - in_arg_names.append(cpt.to_text(arg.name)) - self.desc.set_input(in_proto.name, in_arg_names) - else: - self.desc.set_input(in_proto.name, []) + self.inputs = [] if not inputs else inputs + for in_proto in proto.inputs: + found = find_name(self.inputs, in_proto.name) + assert found or in_proto.dispensable, "Input {} not found".format( + in_proto.name) + + if found: + in_args = self.inputs[in_proto.name] + if not isinstance(in_args, list): + in_args = [in_args] + if not in_proto.duplicable and len(in_args) > 1: + raise ValueError( + "Input %s expects only one input, but %d are given." % + (in_proto.name, len(in_args))) + in_arg_names = [] + for arg in in_args: + if isinstance(arg, six.string_types): + in_arg_names.append(arg) + elif isinstance(arg, six.binary_type): + in_arg_names.append(arg.decode()) + else: + in_arg_names.append(cpt.to_text(arg.name)) + self.desc.set_input(in_proto.name, in_arg_names) + else: + self.desc.set_input(in_proto.name, []) + self.outputs = [] if not outputs else outputs if outputs is not None: given = set() need = set() @@ -1222,7 +1223,8 @@ class Block(object): if _in_imperative_mode(): op_desc = core.OpDesc() op = Operator(block=self, desc=op_desc, *args, **kwargs) - _imperative_tracer().trace(op.desc) + sys.stderr.write('%s %s!!!\n' % (type(op.inputs), type(op.outputs))) + _imperative_tracer().trace(op, op.inputs, op.outputs) return op_desc = self.desc.append_op() From 4d0df1fea7c8e4b480d52c01080381e2e9e88221 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 28 Nov 2018 17:59:33 +0800 Subject: [PATCH 076/719] add fields for autograd test=develop --- paddle/fluid/imperative/layer.h | 14 ++++++- paddle/fluid/imperative/tracer.h | 15 +++++-- paddle/fluid/pybind/imperative.h | 5 +++ paddle/fluid/pybind/pybind.cc | 5 ++- python/paddle/fluid/framework.py | 67 +++++++++++++++++++------------- 5 files changed, 72 insertions(+), 34 deletions(-) diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index cf352ebe53..c400b19700 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -23,19 +23,29 @@ namespace paddle { namespace imperative { +class OpBase; + class VarBase { public: VarBase() {} virtual ~VarBase() {} + OpBase* pre_op_; framework::VarDesc* var_desc_; }; class OpBase { public: - OpBase() {} - virtual ~OpBase() {} + OpBase() + : input_vars_(new std::vector()), + output_vars_(new std::vector()) {} + virtual ~OpBase() { + delete input_vars_; + delete output_vars_; + } + std::vector* input_vars_; + std::vector* output_vars_; framework::OpDesc* op_desc_; }; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index c5dea96863..9d7bdda8cc 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -31,15 +31,18 @@ class Tracer { public: Tracer() {} - void Trace(OpBase* op, const std::map& inputs, - const std::map& outputs) { + void Trace(OpBase* op, const std::vector& inputs, + const std::vector& outputs) { framework::OpDesc* op_desc = op->op_desc_; LOG(ERROR) << "tracer tracing " << op_desc->Type(); op_desc->InferShape(*block_); op_desc->InferVarType(block_); std::unique_ptr op_base = framework::OpRegistry::CreateOp(*op_desc); - for (const std::string& vname : op_desc->InputArgumentNames()) { + + *op->input_vars_ = inputs; + for (VarBase* input : inputs) { + const std::string vname = input->var_desc_->Name(); framework::Variable* var = scope_->Var(vname); if (!var->IsInitialized()) { framework::VarDesc* var_desc = block_->FindVar(vname); @@ -50,7 +53,10 @@ class Tracer { } } } - for (const std::string& vname : op_desc->OutputArgumentNames()) { + + *op->output_vars_ = outputs; + for (auto output : outputs) { + const std::string vname = output->var_desc_->Name(); framework::Variable* var = scope_->Var(vname); if (!var->IsInitialized()) { framework::VarDesc* var_desc = block_->FindVar(vname); @@ -60,6 +66,7 @@ class Tracer { LOG(ERROR) << "tracer doesn't support yet"; } } + output->pre_op_ = op; } op_base->Run(*scope_, platform::CPUPlace()); } diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index bf01ed3a25..5834b83df9 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -37,6 +37,11 @@ class PyLayer : public imperative::Layer { } }; +class PyOpBase : public imperative::OpBase { + public: + using imperative::OpBase::OpBase; // Inherit constructors +}; + void BindTracer(pybind11::module* m); } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index cd0e550b90..656d28eb2a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -111,8 +111,9 @@ PYBIND11_MODULE(core, m) { }, py::return_value_policy::reference); - py::class_(m, "OpBase", - R"DOC()DOC") + py::class_(m, "OpBase", + R"DOC()DOC") + .def(py::init<>()) .def_property( "desc", [](const imperative::OpBase &self) { return self.op_desc_; }, [](imperative::OpBase &self, framework::OpDesc *op_desc) { diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 51da8260e2..d4ca6901d5 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -563,6 +563,7 @@ class Operator(core.OpBase): inputs=None, outputs=None, attrs=None): + core.OpBase.__init__(self) self.block = block self.desc = desc # note: not add self.attrs here: @@ -602,33 +603,32 @@ class Operator(core.OpBase): return True return False - self.inputs = [] if not inputs else inputs - for in_proto in proto.inputs: - found = find_name(self.inputs, in_proto.name) - assert found or in_proto.dispensable, "Input {} not found".format( - in_proto.name) - - if found: - in_args = self.inputs[in_proto.name] - if not isinstance(in_args, list): - in_args = [in_args] - if not in_proto.duplicable and len(in_args) > 1: - raise ValueError( - "Input %s expects only one input, but %d are given." % - (in_proto.name, len(in_args))) - in_arg_names = [] - for arg in in_args: - if isinstance(arg, six.string_types): - in_arg_names.append(arg) - elif isinstance(arg, six.binary_type): - in_arg_names.append(arg.decode()) - else: - in_arg_names.append(cpt.to_text(arg.name)) - self.desc.set_input(in_proto.name, in_arg_names) - else: - self.desc.set_input(in_proto.name, []) + if inputs is not None: + for in_proto in proto.inputs: + found = find_name(inputs, in_proto.name) + assert found or in_proto.dispensable, "Input {} not found".format( + in_proto.name) + + if found: + in_args = inputs[in_proto.name] + if not isinstance(in_args, list): + in_args = [in_args] + if not in_proto.duplicable and len(in_args) > 1: + raise ValueError( + "Input %s expects only one input, but %d are given." + % (in_proto.name, len(in_args))) + in_arg_names = [] + for arg in in_args: + if isinstance(arg, six.string_types): + in_arg_names.append(arg) + elif isinstance(arg, six.binary_type): + in_arg_names.append(arg.decode()) + else: + in_arg_names.append(cpt.to_text(arg.name)) + self.desc.set_input(in_proto.name, in_arg_names) + else: + self.desc.set_input(in_proto.name, []) - self.outputs = [] if not outputs else outputs if outputs is not None: given = set() need = set() @@ -657,6 +657,21 @@ class Operator(core.OpBase): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) + input_vars = [] + for inp in inputs.values(): + if isinstance(inp, Variable): + input_vars.append(inp) + elif isinstance(inp, list): + input_vars.extend(inp[:]) + self.inputs = input_vars + output_vars = [] + for out in outputs.values(): + if isinstance(out, Variable): + output_vars.append(out) + elif isinstance(inp, list): + output_vars.extend(out[:]) + self.outputs = output_vars + if op_attrs is not None: if not isinstance(op_attrs, dict): raise TypeError("'attrs' should be a dict.") From e5d64fd4d114830b855cda8a740c435cffc277f2 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 2 Dec 2018 16:39:21 +0800 Subject: [PATCH 077/719] initial imperative test=develop --- paddle/fluid/imperative/CMakeLists.txt | 2 +- paddle/fluid/imperative/layer.cc | 252 +++++++++++++++++- paddle/fluid/imperative/layer.h | 48 +++- paddle/fluid/imperative/tracer.h | 36 ++- paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/imperative.h | 5 + paddle/fluid/pybind/pybind.cc | 17 +- python/paddle/fluid/framework.py | 17 +- .../fluid/tests/unittests/test_imperative.py | 3 + 9 files changed, 362 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index fb57eca658..373d292b44 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,3 +1,3 @@ -cc_library(layer SRCS layer.cc DEPS proto_desc) +cc_library(layer SRCS layer.cc DEPS proto_desc operator) cc_library(tracer SRCS tracer.cc DEPS proto_desc) cc_library(engine SRCS engine.cc) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index e0b6ad91e7..08379a7ed5 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -13,7 +13,257 @@ // limitations under the License. #include "paddle/fluid/imperative/layer.h" +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/string/printf.h" namespace paddle { -namespace imperative {} // namespace imperative +namespace imperative { + +using framework::Variable; + +void AddTo(Variable* src, Variable* dst) { + framework::LoDTensor* dst_tensor = dst->GetMutable(); + framework::LoDTensor* src_tensor = src->GetMutable(); + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", + dst_tensor->numel(), src_tensor->numel()); + float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); + const float* src_data = src_tensor->data(); + for (size_t i = 0; i < src_tensor->numel(); ++i) { + dst_data[i] += src_data[i]; + } +} + +class Autograd { + public: + explicit Autograd(framework::Scope* scope) : scope_(scope) {} + + void RunBackward(VarBase* var, framework::Variable* grad) { + if (!var->pre_op_) { + var->ApplyGrad(scope_, grad); + return; + } + PADDLE_ENFORCE(var->pre_op_->op_desc_); + // TODO(panyx0718): Only create vars that "require_grad" + std::vector op_grads = + CreateOpGrads(var->pre_op_->output_vars_->size()); + op_grads[var->pre_op_out_idx_] = grad; + + std::deque>> ready; + ready.push_back(std::make_pair(var->pre_op_, op_grads)); + + std::map dep_counts = ComputeDepCounts(var->pre_op_); + std::map> visited; + + while (!ready.empty()) { + OpBase* ready_op = ready.front().first; + std::vector ready_op_grads = ready.front().second; + ready.pop_front(); + std::vector input_grads = ready_op->ApplyGrad(scope_); + + for (size_t i = 0; i < input_grads.size(); ++i) { + if (!input_grads[i]) continue; + OpBase* pre_op = ready_op->pre_ops_->at(i); + if (!pre_op) continue; + int pre_op_out_idx = ready_op->pre_ops_out_idx_->at(i); + + dep_counts[pre_op] -= 1; + PADDLE_ENFORCE(dep_counts[pre_op] >= 0); + bool pre_op_ready = dep_counts[pre_op] == 0; + + if (pre_op_ready) { + if (visited.find(pre_op) == visited.end()) { + PADDLE_ENFORCE(pre_op->output_vars_->size() == 1); + visited[pre_op] = {input_grads[i]}; + } else { + std::vector& pre_op_grads = visited[pre_op]; + AccumGrads(pre_op_out_idx, input_grads[i], &pre_op_grads); + } + ready.push_back(std::make_pair(pre_op, visited[pre_op])); + } else { + if (visited.find(pre_op) == visited.end()) { + // TODO(panyx0718): Only create vars that "require_grad" + visited[pre_op] = CreateOpGrads(var->pre_op_->output_vars_->size()); + } else { + } + std::vector& pre_op_grads = visited[pre_op]; + AccumGrads(pre_op_out_idx, input_grads[i], &pre_op_grads); + } + } + } + } + + private: + void AccumGrads(int grad_idx, Variable* grad, + std::vector* op_grads) { + if (!(*op_grads)[grad_idx]) { + // FIXME(panyx0718): This should be a deep copy. + (*op_grads)[grad_idx] = grad; + return; + } + AddTo(grad, (*op_grads)[grad_idx]); + } + + std::map ComputeDepCounts(OpBase* op) { + std::map ret; + + std::deque queue; + queue.push_back(op); + std::unordered_set visited; + visited.insert(op); + while (!queue.empty()) { + OpBase* candidate = queue.front(); + queue.pop_front(); + for (OpBase* pre_op : *(candidate->pre_ops_)) { + if (!pre_op) continue; + if (visited.find(pre_op) == visited.end()) { + visited.insert(pre_op); + queue.push_back(pre_op); + } + ret[pre_op] += 1; + } + } + + return ret; + } + + std::vector CreateOpGrads(size_t count) { + std::vector op_grads; + for (size_t i = 0; i < count; ++i) { + op_grads.push_back(nullptr); + } + return op_grads; + } + + framework::Scope* scope_; +}; + +framework::Variable* CreateVariable(const std::string& name, + const framework::DDim& dim, float val, + framework::Scope* scope, + bool random_name = true) { + std::string varname = name; + if (random_name) { + std::mt19937 rng; + rng.seed(std::random_device()()); + std::uniform_int_distribution dist6( + 1, std::numeric_limits::max()); + int id = dist6(rng); + varname = string::Sprintf("%s@%d", varname, id); + } + + LOG(ERROR) << "creating var " << varname; + framework::Variable* var = scope->Var(varname); + framework::LoDTensor* tensor = var->GetMutable(); + + float* data = tensor->mutable_data(dim, platform::CPUPlace()); + std::fill(data, data + tensor->numel(), val); + return var; +} + +framework::LoDTensor& VarBase::Grad() { + VLOG(3) << "get var grad " << var_desc_->Name(); + return *grads_->GetMutable(); +} + +void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { + VLOG(3) << "apply var grad " << var_desc_->Name() << " " + << grad->Get().data()[0]; + if (!grads_) { + grads_ = + CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()), + var_->Get().dims(), 0.0, scope); + } + AddTo(grad, grads_); + VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " " + << grads_->Get().data()[0]; +} + +std::vector OpBase::ApplyGrad(framework::Scope* scope) { + VLOG(3) << "op grad " << grad_op_desc_->Type(); + + for (const std::string& invar : grad_op_desc_->InputArgumentNames()) { + block_->FindRecursiveOrCreateVar(invar); + framework::Variable* var = scope->Var(invar); + LOG(ERROR) << "op grad in var " << invar; + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block_->FindVar(invar); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + LOG(ERROR) << "grad op invar init " << invar; + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } else { + var->GetMutable()->type(); + } + } + + std::vector ret; + for (size_t i = 0; i < input_vars_->size(); ++i) { + ret.push_back(nullptr); + } + for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { + LOG(ERROR) << "grad outvar " << outvar; + block_->FindRecursiveOrCreateVar(outvar); + framework::Variable* var = scope->Var(outvar); + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block_->FindVar(outvar); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + } + grad_op_desc_->InferShape(*block_); + grad_op_desc_->InferVarType(block_); + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc_); + + opbase->Run(*scope, platform::CPUPlace()); + + for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { + if (grad_to_var_->find(outvar) != grad_to_var_->end()) { + std::string origin_var = (*grad_to_var_)[outvar]; + for (size_t i = 0; i < input_vars_->size(); ++i) { + VarBase* origin_in_var = (*input_vars_)[i]; + if (origin_in_var->var_desc_->Name() == origin_var) { + framework::Variable* var = scope->FindVar(outvar); + LOG(ERROR) << "apply grad " << outvar << " with origin " + << origin_var; + // TODO(panyx0718): Accumulate. + // origin_in_var->grads_ = var; + origin_in_var->ApplyGrad(scope, var); + ret[i] = var; + // TODO(panyx0718): There might be 2 var with the same name. We + // currently assume the are the same Variable*. So it doesn't matter + // which one is used. + break; + } + } + } + } + return ret; +} + +void VarBase::RunBackward(framework::Scope* scope) { + // TODO(panyx0718): Might not be 0th, need to detect. + grads_ = CreateVariable(pre_op_->grad_op_desc_->InputArgumentNames()[0], + var_->Get().dims(), 1.0, scope, + false); + framework::Variable* grad = + CreateVariable("init@imperative_grad", + var_->Get().dims(), 1.0, scope); + + Autograd(scope).RunBackward(this, grad); +} + +} // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index c400b19700..0cf0c27a6a 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -14,8 +14,10 @@ #pragma once +#include #include #include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" @@ -27,26 +29,64 @@ class OpBase; class VarBase { public: - VarBase() {} - virtual ~VarBase() {} + VarBase() + : pre_op_(nullptr), + pre_op_out_idx_(-1), + var_desc_(nullptr), + var_(nullptr), + grads_(nullptr) {} + + virtual ~VarBase() { + LOG(ERROR) << "deleting var"; + LOG(ERROR) << "done deleting var"; + } + + void ApplyGrad(framework::Scope* scope, framework::Variable* grad); + + void RunBackward(framework::Scope* scope); + + framework::LoDTensor& Grad(); OpBase* pre_op_; + int pre_op_out_idx_; + framework::VarDesc* var_desc_; + framework::Variable* var_; + framework::Variable* grads_; }; class OpBase { public: OpBase() : input_vars_(new std::vector()), - output_vars_(new std::vector()) {} + output_vars_(new std::vector()), + pre_ops_(new std::vector()), + pre_ops_out_idx_(new std::vector()), + op_desc_(nullptr), + grad_op_desc_(nullptr) {} + virtual ~OpBase() { delete input_vars_; delete output_vars_; + + delete pre_ops_; + delete pre_ops_out_idx_; + + if (grad_op_desc_) delete grad_op_desc_; + if (grad_to_var_) delete grad_to_var_; } + std::vector ApplyGrad(framework::Scope* scope); + std::vector* input_vars_; std::vector* output_vars_; + std::vector* pre_ops_; + std::vector* pre_ops_out_idx_; framework::OpDesc* op_desc_; + + framework::OpDesc* grad_op_desc_; + std::unordered_map* grad_to_var_; + framework::BlockDesc* block_; }; class Layer { @@ -58,7 +98,7 @@ class Layer { return vars; } - virtual void Backward() { LOG(ERROR) << "backward at cpp."; } + virtual void Backward() { LOG(ERROR) << "To support customize"; } }; } // namespace imperative diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 9d7bdda8cc..c3cc0cb33e 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -27,6 +27,20 @@ namespace paddle { namespace imperative { +void CreateGradOp(const framework::OpDesc& op_desc, + const std::unordered_set& no_grad_set, + const std::vector& grad_sub_block, + framework::OpDesc** grad_op_desc, + std::unordered_map* grad_to_var) { + std::vector> grad_op_descs = + framework::OpInfoMap::Instance() + .Get(op_desc.Type()) + .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); + PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now."); + // TODO(panyx0718): Leak? + *grad_op_desc = grad_op_descs[0].release(); +} + class Tracer { public: Tracer() {} @@ -44,6 +58,7 @@ class Tracer { for (VarBase* input : inputs) { const std::string vname = input->var_desc_->Name(); framework::Variable* var = scope_->Var(vname); + input->var_ = var; if (!var->IsInitialized()) { framework::VarDesc* var_desc = block_->FindVar(vname); if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { @@ -52,11 +67,17 @@ class Tracer { LOG(ERROR) << "tracer doesn't support yet"; } } + if (input->pre_op_) { + op->pre_ops_->push_back(input->pre_op_); + op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_); + } else { + op->pre_ops_->push_back(nullptr); + } } *op->output_vars_ = outputs; - for (auto output : outputs) { - const std::string vname = output->var_desc_->Name(); + for (size_t i = 0; i < outputs.size(); ++i) { + const std::string vname = outputs[i]->var_desc_->Name(); framework::Variable* var = scope_->Var(vname); if (!var->IsInitialized()) { framework::VarDesc* var_desc = block_->FindVar(vname); @@ -66,9 +87,18 @@ class Tracer { LOG(ERROR) << "tracer doesn't support yet"; } } - output->pre_op_ = op; + outputs[i]->var_ = var; + outputs[i]->pre_op_ = op; + outputs[i]->pre_op_out_idx_ = i; } op_base->Run(*scope_, platform::CPUPlace()); + + framework::OpDesc* grad_op_desc; + auto grad_to_var = new std::unordered_map(); + CreateGradOp(*op_desc, {}, {block_}, &grad_op_desc, grad_to_var); + op->grad_op_desc_ = grad_op_desc; + op->grad_to_var_ = grad_to_var; + op->block_ = block_; } void SetScope(framework::Scope* scope) { scope_ = scope; } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 7079f9fe04..b8954cb126 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,5 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc) if(WITH_PYTHON) diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index 5834b83df9..7a9d3a01ea 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -42,6 +42,11 @@ class PyOpBase : public imperative::OpBase { using imperative::OpBase::OpBase; // Inherit constructors }; +class PyVarBase : public imperative::VarBase { + public: + using imperative::VarBase::VarBase; // Inherit constructors +}; + void BindTracer(pybind11::module* m); } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 656d28eb2a..ea07372a28 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" @@ -101,8 +102,13 @@ PYBIND11_MODULE(core, m) { BindException(&m); - py::class_(m, "VarBase", - R"DOC()DOC") + py::class_(m, "VarBase", R"DOC()DOC") + .def(py::init<>()) + .def("_run_backward", + [](imperative::VarBase &self, framework::Scope *scope) { + self.RunBackward(scope); + }) + .def("_grad", &imperative::VarBase::Grad) .def_property( "desc", [](const imperative::VarBase &self) { return self.var_desc_; }, @@ -111,13 +117,14 @@ PYBIND11_MODULE(core, m) { }, py::return_value_policy::reference); - py::class_(m, "OpBase", - R"DOC()DOC") + py::class_(m, "OpBase", R"DOC()DOC") .def(py::init<>()) .def_property( "desc", [](const imperative::OpBase &self) { return self.op_desc_; }, [](imperative::OpBase &self, framework::OpDesc *op_desc) { - self.op_desc_ = op_desc; + if (op_desc) { + self.op_desc_ = op_desc; + } }, py::return_value_policy::reference); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index d4ca6901d5..ba3ffafc85 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -276,6 +276,7 @@ class Variable(core.VarBase): stop_gradient=False, is_data=False, **kwargs): + core.VarBase.__init__(self) self.block = block self.error_clip = error_clip @@ -361,6 +362,12 @@ class Variable(core.VarBase): tensor = core.get_variable_tensor(scope, self.desc.name()) return np.array(tensor) + def backward(self, scope): + self._run_backward(scope) + + def grad(self): + return np.array(self._grad()) + def __str__(self): return self.to_string(True) @@ -983,6 +990,7 @@ class Block(object): self.desc = program.desc.block(idx) self.vars = collections.OrderedDict() # var_name --> var self.ops = list() # operator list + self._op_descs = list() self.program = program self.removed_vars = collections.OrderedDict() @@ -1238,13 +1246,12 @@ class Block(object): if _in_imperative_mode(): op_desc = core.OpDesc() op = Operator(block=self, desc=op_desc, *args, **kwargs) - sys.stderr.write('%s %s!!!\n' % (type(op.inputs), type(op.outputs))) _imperative_tracer().trace(op, op.inputs, op.outputs) - return - - op_desc = self.desc.append_op() - op = Operator(block=self, desc=op_desc, *args, **kwargs) + else: + op_desc = self.desc.append_op() + op = Operator(block=self, desc=op_desc, *args, **kwargs) self.ops.append(op) + self._op_descs.append(op_desc) return op def _insert_op(self, index, *args, **kwargs): diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index af6a2167ce..18fe8f7c09 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -26,6 +26,7 @@ class MyLayer(fluid.imperative.PyLayer): def forward(self, inputs): x = fluid.layers.relu(inputs[0]) + self._x_for_debug = x return [fluid.layers.elementwise_mul(x, x)] @@ -43,6 +44,8 @@ class TestImperative(unittest.TestCase): x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] self.assertIsNotNone(x) sys.stderr.write("%s output: %s\n" % (x, x.numpy(scope=l._scope))) + x.backward(l._scope) + sys.stderr.write("grad %s\n" % l._x_for_debug.grad()) if __name__ == '__main__': From c3236f82d698a017e4dffa6d2a0c912a594a43f8 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 2 Dec 2018 17:11:58 +0800 Subject: [PATCH 078/719] polish --- paddle/fluid/imperative/layer.cc | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 08379a7ed5..c5a8d9c6b6 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -44,16 +44,12 @@ class Autograd { public: explicit Autograd(framework::Scope* scope) : scope_(scope) {} - void RunBackward(VarBase* var, framework::Variable* grad) { - if (!var->pre_op_) { - var->ApplyGrad(scope_, grad); - return; - } + void RunBackward(VarBase* var) { PADDLE_ENFORCE(var->pre_op_->op_desc_); // TODO(panyx0718): Only create vars that "require_grad" std::vector op_grads = CreateOpGrads(var->pre_op_->output_vars_->size()); - op_grads[var->pre_op_out_idx_] = grad; + op_grads[var->pre_op_out_idx_] = var->grads_; std::deque>> ready; ready.push_back(std::make_pair(var->pre_op_, op_grads)); @@ -238,8 +234,6 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { framework::Variable* var = scope->FindVar(outvar); LOG(ERROR) << "apply grad " << outvar << " with origin " << origin_var; - // TODO(panyx0718): Accumulate. - // origin_in_var->grads_ = var; origin_in_var->ApplyGrad(scope, var); ret[i] = var; // TODO(panyx0718): There might be 2 var with the same name. We @@ -254,15 +248,11 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { } void VarBase::RunBackward(framework::Scope* scope) { - // TODO(panyx0718): Might not be 0th, need to detect. - grads_ = CreateVariable(pre_op_->grad_op_desc_->InputArgumentNames()[0], + grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()), var_->Get().dims(), 1.0, scope, false); - framework::Variable* grad = - CreateVariable("init@imperative_grad", - var_->Get().dims(), 1.0, scope); - - Autograd(scope).RunBackward(this, grad); + if (!pre_op_) return; + Autograd(scope).RunBackward(this); } } // namespace imperative From 93c16d96289e2805c01fb0bc36f4eecb854fb920 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 2 Dec 2018 22:23:12 +0800 Subject: [PATCH 079/719] polish the autograd (need to verify correctness) test=develop --- paddle/fluid/imperative/layer.cc | 98 ++++++++++++-------------------- 1 file changed, 37 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index c5a8d9c6b6..2176ac78bb 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -46,20 +46,16 @@ class Autograd { void RunBackward(VarBase* var) { PADDLE_ENFORCE(var->pre_op_->op_desc_); - // TODO(panyx0718): Only create vars that "require_grad" - std::vector op_grads = - CreateOpGrads(var->pre_op_->output_vars_->size()); - op_grads[var->pre_op_out_idx_] = var->grads_; + // TODO(panyx0718): Only create for vars that "require_grad" + (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_; - std::deque>> ready; - ready.push_back(std::make_pair(var->pre_op_, op_grads)); + std::deque ready; + ready.push_back(var->pre_op_); std::map dep_counts = ComputeDepCounts(var->pre_op_); - std::map> visited; while (!ready.empty()) { - OpBase* ready_op = ready.front().first; - std::vector ready_op_grads = ready.front().second; + OpBase* ready_op = ready.front(); ready.pop_front(); std::vector input_grads = ready_op->ApplyGrad(scope_); @@ -67,29 +63,12 @@ class Autograd { if (!input_grads[i]) continue; OpBase* pre_op = ready_op->pre_ops_->at(i); if (!pre_op) continue; - int pre_op_out_idx = ready_op->pre_ops_out_idx_->at(i); dep_counts[pre_op] -= 1; PADDLE_ENFORCE(dep_counts[pre_op] >= 0); bool pre_op_ready = dep_counts[pre_op] == 0; - if (pre_op_ready) { - if (visited.find(pre_op) == visited.end()) { - PADDLE_ENFORCE(pre_op->output_vars_->size() == 1); - visited[pre_op] = {input_grads[i]}; - } else { - std::vector& pre_op_grads = visited[pre_op]; - AccumGrads(pre_op_out_idx, input_grads[i], &pre_op_grads); - } - ready.push_back(std::make_pair(pre_op, visited[pre_op])); - } else { - if (visited.find(pre_op) == visited.end()) { - // TODO(panyx0718): Only create vars that "require_grad" - visited[pre_op] = CreateOpGrads(var->pre_op_->output_vars_->size()); - } else { - } - std::vector& pre_op_grads = visited[pre_op]; - AccumGrads(pre_op_out_idx, input_grads[i], &pre_op_grads); + ready.push_back(pre_op); } } } @@ -184,27 +163,22 @@ void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { std::vector OpBase::ApplyGrad(framework::Scope* scope) { VLOG(3) << "op grad " << grad_op_desc_->Type(); - for (const std::string& invar : grad_op_desc_->InputArgumentNames()) { - block_->FindRecursiveOrCreateVar(invar); - framework::Variable* var = scope->Var(invar); - LOG(ERROR) << "op grad in var " << invar; - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block_->FindVar(invar); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - LOG(ERROR) << "grad op invar init " << invar; - var->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; + for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { + if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { + continue; + } + LOG(ERROR) << "op grad in var " << grad_invar; + block_->FindRecursiveOrCreateVar(grad_invar); + framework::Variable* var = scope->Var(grad_invar); + const std::string& invar = grad_to_var_->at(grad_invar); + for (VarBase* varbase : *output_vars_) { + if (varbase->var_desc_->Name() == invar) { + var->GetMutable()->ShareDataWith( + varbase->grads_->Get()); } - } else { - var->GetMutable()->type(); } } - std::vector ret; - for (size_t i = 0; i < input_vars_->size(); ++i) { - ret.push_back(nullptr); - } for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { LOG(ERROR) << "grad outvar " << outvar; block_->FindRecursiveOrCreateVar(outvar); @@ -225,23 +199,25 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { opbase->Run(*scope, platform::CPUPlace()); - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - if (grad_to_var_->find(outvar) != grad_to_var_->end()) { - std::string origin_var = (*grad_to_var_)[outvar]; - for (size_t i = 0; i < input_vars_->size(); ++i) { - VarBase* origin_in_var = (*input_vars_)[i]; - if (origin_in_var->var_desc_->Name() == origin_var) { - framework::Variable* var = scope->FindVar(outvar); - LOG(ERROR) << "apply grad " << outvar << " with origin " - << origin_var; - origin_in_var->ApplyGrad(scope, var); - ret[i] = var; - // TODO(panyx0718): There might be 2 var with the same name. We - // currently assume the are the same Variable*. So it doesn't matter - // which one is used. - break; - } - } + std::vector ret; + for (size_t i = 0; i < input_vars_->size(); ++i) { + bool found = false; + for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { + Variable* var = scope->FindVar(outvar); + VarBase* origin_var = (*input_vars_)[i]; + std::string orig_var = grad_to_var_->at(outvar); + PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); + LOG(ERROR) << "apply grad " << outvar << " with origin " << orig_var; + origin_var->ApplyGrad(scope, var); + found = true; + ret.push_back(var); + // TODO(panyx0718): There might be another outvar with the same name. + // In that case, it doesn't matter the first one or the second one is + // used. + break; + } + if (!found) { + ret.push_back(nullptr); } } return ret; From c583fd34acc9e02362fd2ddd4bf7adb53d8321e6 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 3 Dec 2018 09:53:24 +0800 Subject: [PATCH 080/719] add downpour sgd wrapper for pslib --- python/paddle/fluid/distributed/downpour.py | 34 ++++++++++++ python/paddle/fluid/distributed/node.py | 61 +++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 python/paddle/fluid/distributed/downpour.py create mode 100644 python/paddle/fluid/distributed/node.py diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py new file mode 100644 index 0000000000..523f686668 --- /dev/null +++ b/python/paddle/fluid/distributed/downpour.py @@ -0,0 +1,34 @@ +import paddle.fluid as fluid +import pslib_pb2 as pslib +from .node import DownpourServer +from .node import DownpourWorker +from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table + +class DownpourSGD(object): + def __init__(self, optimizer=opt, learning_rate=0.001, window=1): + # todo(guru4elephant): if optimizer is not None, will warning here + self.learning_rate_ = opt.learning_rate + self.window_ = window + + def minimize(self, loss, startup_program=None, + parameter_list=None, no_grad_set=None, + prefetch_slots=None, prefetch_slots_emb=None): + params_grads = sorted(append_backward(loss), key=lambda x:x[0].name) + table_name = fluid_distributed_lookup_table(loss.block.program) + server = DownpourServer() + worker = DownpourWorker() + server.add_sparse_table(0, learning_rate, + prefetch_slots, prefetch_slots_emb) + server.add_dense_table(1, learning_rate, params, grads) + worker.add_sparse_table(0, learning_rate, + prefetch_slots, prefetch_slots_emb) + worker.add_dense_table(1, learning_rate, params, grads) + + ps_param = pslib.PSParameter() + ps_param.server_param.CopyFrom(server.get_desc()) + ps_param.worker_param.CopyFrom(worker.get_desc()) + worker_skipped_ops = ["lookup_table", "lookup_table_grad"] + + return [solver_desc, parallel_desc] + + diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py new file mode 100644 index 0000000000..fc62d7220c --- /dev/null +++ b/python/paddle/fluid/distributed/node.py @@ -0,0 +1,61 @@ +import paddle.fluid as fluid +import pslib_pb2 as pslib + +class Server(object): + def __init__(self): + pass + + +class Worker(object): + def __init__(self): + pass + + +class DownpourServer(Server): + def __init__(self): + self.server_ = pslib.ServerParameter().downpour_server_param + + def add_sparse_table(self, table_id, learning_rate, + slot_key, slot_value_var, slot_grad_var): + table = self.server_.downpour_table_param.add() + table.table_id = table_id + table.type = PS_SPARSE_TABLE + table.accessor.accessor_class = "DownpourFeatureValueAccessor" + table.accessor.dense_sgd_param.adam.learning_rate = learning_rate + table.accessor.fea_dim = slot_value_var[0].shape[1] + + def add_dense_table(self, table_id, learning_rate, + param_var, grad_var): + table = self.server_.downpour_table_param.add() + table.table_id = table_id + table.type = PS_DENSE_TABLE + table.accessor.accessor_class = "DownpourDenseValueAccessor" + table.accessor.sparse_sgd_param.learning_rate = learning_rate + table.accessor.fea_dim = reduce(lambda x, y: x.shape, 1 for x in param_var) + + def get_desc(self): + return self.server_ + + +class DownpourWorker(Worker): + def __init__(self, window): + self.window = window + self.worker_ = pslib.WorkerParameter().downpour_worker_param + self.worker_.pull_dense_per_batch = window + self.worker_.push_dense_per_batch = window + + def add_sparse_table(self, table_id, + slot_keys, slot_value_vars, slot_grad_vars): + table = self.worker_.sparse_table.add() + table.table_id = table_id + table.slot.extend(slot_keys) + self.worker_.extend([grad.name for grad in slot_grad_vars]) + + def add_dense_table(self, table_id, param_vars, grad_vars): + table = self.worker_.dense_table.add() + table.table_id = table_id + table.dense_variable_name.extend([p.name for p in param_vars]) + table.dense_gradient_variable_name.extend([g.name for g in grad_vars]) + + def get_desc(self): + return self.worker_ From c7382df80f2e320adbd0f76c3d0daa5fb4958868 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Mon, 3 Dec 2018 10:59:36 +0800 Subject: [PATCH 081/719] Print assert failure id in lookup_table_op (#14698) --- paddle/fluid/operators/lookup_table_op.cu | 10 +++++----- paddle/fluid/platform/assert.h | 10 ++++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 36156a1f61..6a0d6bad51 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -31,8 +31,8 @@ __global__ void LookupTable(T *output, const T *table, const int64_t *ids, while (idy < K) { int64_t id = ids[idy]; - PADDLE_ASSERT(id >= 0); - PADDLE_ASSERT(id < N); + PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id); + PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id); T *out = output + idy * D; const T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { @@ -57,9 +57,9 @@ __global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids, int idy = blockIdx.x + threadIdx.y * GridDimX; while (idy < K) { - int id = ids[idy]; - PADDLE_ASSERT(id >= 0); - PADDLE_ASSERT(id < N); + int64_t id = ids[idy]; + PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id); + PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id); const T *out = output + idy * D; T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h index 2ce9b31bb8..2e8fa7c1b8 100644 --- a/paddle/fluid/platform/assert.h +++ b/paddle/fluid/platform/assert.h @@ -36,6 +36,15 @@ limitations under the License. */ asm("trap;"); \ } \ } while (0) + +#define PADDLE_ASSERT_MSG_CODE(e, m, c) \ + do { \ + if (!(e)) { \ + printf("%s:%d Assertion `%s` failed (%s %d).\n", __FILE__, __LINE__, \ + TOSTRING(e), m, c); \ + asm("trap;"); \ + } \ + } while (0) #else #include // For cuda, the assertions can affect performance and it is therefore @@ -43,4 +52,5 @@ limitations under the License. */ // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion #define PADDLE_ASSERT(e) assert((e)) #define PADDLE_ASSERT_MSG(e, m) assert((e) && (m)) +#define PADDLE_ASSERT_MSG_CODE(e, m, c) assert((e) && (m) && (c || 1)) #endif From 669191c9cced3fdc25f63a551fd3741af55b302e Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Mon, 3 Dec 2018 11:54:00 +0800 Subject: [PATCH 082/719] Implement conv3d with mkldnn library (test=develop) --- paddle/fluid/operators/conv_mkldnn_op.cc | 145 ++++++++++++++---- paddle/fluid/operators/conv_op.cc | 16 ++ paddle/fluid/platform/mkldnn_helper.h | 12 ++ .../tests/unittests/test_conv3d_mkldnn_op.py | 59 +++++++ .../fluid/tests/unittests/test_conv3d_op.py | 9 +- 5 files changed, 211 insertions(+), 30 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 05e268bf6a..a0fa6d4e53 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -52,10 +52,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && filter->format() != memory::format::format_undef, "Wrong layout/format set for Filter tensor"); - PADDLE_ENFORCE(input->dims().size() == 4, - "Input must be with 4 dimensions, i.e. NCHW"); - PADDLE_ENFORCE(filter->dims().size() == 4, - "Filter must be with 4 dimensions, i.e. OIHW"); + PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, + "Input must be with 4 or 5dimensions, i.e. NCHW or NCDHW"); + PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, + "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW"); if (bias) { PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && bias->format() != memory::format::format_undef, @@ -71,9 +71,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); int groups = ctx.Attr("groups"); + bool is_conv3d = strides.size() == 3U; // TODO(tpatejko): add support for dilation PADDLE_ENFORCE( - dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + is_conv3d + ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && + dilations[2] == 1 + : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, "dilation in convolution is not implemented yet"); const T* input_data = input->data(); @@ -84,16 +88,31 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(filter->dims()); int g = std::max(groups, 1); if (g > 1) { - int o = weights_tz[0]; - int i = weights_tz[1]; - int h = weights_tz[2]; - int w = weights_tz[3]; - weights_tz.resize(5); - weights_tz[0] = g; - weights_tz[1] = o / g; - weights_tz[2] = i; - weights_tz[3] = h; - weights_tz[4] = w; + if (is_conv3d) { + int o = weights_tz[0]; + int i = weights_tz[1]; + int d = weights_tz[2]; + int h = weights_tz[3]; + int w = weights_tz[4]; + weights_tz.resize(6); + weights_tz[0] = g; + weights_tz[1] = o / g; + weights_tz[2] = i; + weights_tz[3] = d; + weights_tz[4] = h; + weights_tz[5] = w; + } else { + int o = weights_tz[0]; + int i = weights_tz[1]; + int h = weights_tz[2]; + int w = weights_tz[3]; + weights_tz.resize(5); + weights_tz[0] = g; + weights_tz[1] = o / g; + weights_tz[2] = i; + weights_tz[3] = h; + weights_tz[4] = w; + } } std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); @@ -105,11 +124,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector pipeline; + auto src_format = input->format(); + mkldnn::memory::format weights_format = + (g == 1) ? filter->format() : mkldnn::memory::format::goihw; + + if (is_conv3d) { + weights_format = + (g == 1) ? filter->format() : mkldnn::memory::format::goidhw; + } + auto user_src_md = platform::MKLDNNMemDesc( - {src_tz}, platform::MKLDNNGetDataType(), input->format()); + {src_tz}, platform::MKLDNNGetDataType(), src_format); auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), - (g == 1) ? filter->format() : mkldnn::memory::format::goihw); + {weights_tz}, platform::MKLDNNGetDataType(), weights_format); /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -119,10 +146,20 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); + weights_format = + (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw; + + if (is_conv3d) { + chosen_memory_format = + platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); + weights_format = + (g == 1) ? chosen_memory_format : mkldnn::memory::format::goidhw; + } + auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + weights_tz, platform::MKLDNNGetDataType(), weights_format); std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. // Currently used whenever bias is != nullptr. auto dst_md = platform::MKLDNNMemDesc( @@ -263,8 +300,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const mkldnn::engine& engine, const bool fuse_relu, const bool fuse_residual_conn, mkldnn::prop_kind fwd_prop_kind) const { - memory::dims stride_dims = {strides[0], strides[1]}; - memory::dims padding_dims = {paddings[0], paddings[1]}; + memory::dims stride_dims = strides; + memory::dims padding_dims = paddings; auto conv_desc = mkldnn::convolution_forward::desc( fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst, @@ -288,8 +325,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const mkldnn::engine& engine, const bool fuse_relu, const bool fuse_residual_conn, mkldnn::prop_kind fwd_prop_kind) const { - memory::dims stride_dims = {strides[0], strides[1]}; - memory::dims padding_dims = {paddings[0], paddings[1]}; + memory::dims stride_dims = strides; + memory::dims padding_dims = paddings; auto conv_desc = mkldnn::convolution_forward::desc( fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst, @@ -349,6 +386,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); + bool is_conv3d = strides.size() == 3U; const T* input_data = input->data(); const T* filter_data = filter->data(); const T* output_grad_data = output_grad->data(); @@ -358,8 +396,45 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector weights_tz = paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + if (g > 1) { + if (is_conv3d) { + int o = weights_tz[0]; + int i = weights_tz[1]; + int d = weights_tz[2]; + int h = weights_tz[3]; + int w = weights_tz[4]; + weights_tz.resize(6); + weights_tz[0] = g; + weights_tz[1] = o / g; + weights_tz[2] = i; + weights_tz[3] = d; + weights_tz[4] = h; + weights_tz[5] = w; + } else { + int o = weights_tz[0]; + int i = weights_tz[1]; + int h = weights_tz[2]; + int w = weights_tz[3]; + weights_tz.resize(5); + weights_tz[0] = g; + weights_tz[1] = o / g; + weights_tz[2] = i; + weights_tz[3] = h; + weights_tz[4] = w; + } + } std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + auto src_format = input->format(); + mkldnn::memory::format weights_format = + (g == 1) ? filter->format() : mkldnn::memory::format::goihw; + + if (is_conv3d) { + weights_format = + (g == 1) ? filter->format() : mkldnn::memory::format::goidhw; + } + // Get an unique name from "argument" name of "Output" variable // as well as attributes of primitive to be created // This name will be used as key when saving info into device context @@ -372,9 +447,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // Create user memory descriptors auto user_src_md = platform::MKLDNNMemDesc( - {src_tz}, platform::MKLDNNGetDataType(), input->format()); + {src_tz}, platform::MKLDNNGetDataType(), src_format); auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), filter->format()); + {weights_tz}, platform::MKLDNNGetDataType(), weights_format); auto user_diff_dst_md = platform::MKLDNNMemDesc( {dst_tz}, platform::MKLDNNGetDataType(), output_grad->format()); @@ -386,14 +461,24 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); + weights_format = + (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw; + + if (is_conv3d) { + chosen_memory_format = + platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); + weights_format = + (g == 1) ? chosen_memory_format : mkldnn::memory::format::goidhw; + } + auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto diff_src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + weights_tz, platform::MKLDNNGetDataType(), weights_format); auto diff_weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + weights_tz, platform::MKLDNNGetDataType(), weights_format); auto diff_dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); @@ -496,3 +581,9 @@ REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace, REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::ConvMKLDNNGradOpKernel); + +REGISTER_OP_KERNEL(conv3d, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL(conv3d_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConvMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 342525be49..ca6ca2a5ae 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -229,6 +229,10 @@ $$ } void Conv3DOpMaker::Make() { + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); AddInput( "Input", "(Tensor) The input tensor of convolution operator. " @@ -247,6 +251,11 @@ void Conv3DOpMaker::Make() { AddOutput("Output", "(Tensor) The output tensor of convolution operator." "The format of output tensor is also NCDHW."); + AddInput("ResidualData", + "(Tensor) Tensor with residual data " + "to which convolution output will be added." + "Used with fuse_residual_connection fusion.") + .AsDispensable(); AddAttr>("strides", "(vector, default:{1, 1, 1}), the " "strides(d_stride, h_stride, w_stride) of " @@ -277,6 +286,13 @@ void Conv3DOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("fuse_residual_connection", + "(bool, default false) Only used in mkldnn kernel. Used " + "whenever convolution output is as an input to residual " + "connection.") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 167bd4e81d..e53064893e 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -113,6 +113,18 @@ inline mkldnn::memory::format MKLDNNFormatForSize( return mkldnn::memory::format::x; } else if (dims_size == 2) { return mkldnn::memory::format::nc; + } else if (dims_size == 3) { + if (data_format == mkldnn::memory::format::nchw) { + return mkldnn::memory::format::ncw; + } else if (data_format == mkldnn::memory::format::nhwc) { + return mkldnn::memory::format::nwc; + } + } else if (dims_size == 5) { + if (data_format == mkldnn::memory::format::nchw) { + return mkldnn::memory::format::ncdhw; + } else if (data_format == mkldnn::memory::format::nhwc) { + return mkldnn::memory::format::ndhwc; + } } return data_format; } diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py new file mode 100644 index 0000000000..f0e1265e14 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py @@ -0,0 +1,59 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +from test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1 + + +class TestMKLDNN(TestConv3dOp): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +class TestMKLDNNCase1(TestCase1): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +class TestMKLDNNGroup1(TestWithGroup1): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +class TestMKLDNNGroup2(TestWithGroup2): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +class TestMKLDNNWith1x1(TestWith1x1): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 69c5ab7a4a..216d3ad2d0 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -74,6 +74,8 @@ class TestConv3dOp(OpTest): def setUp(self): self.op_type = "conv3d" self.use_cudnn = False + self.use_mkldnn = False + self.data_format = "AnyLayout" self.dtype = np.float32 self.init_kernel_type() self.init_group() @@ -83,8 +85,7 @@ class TestConv3dOp(OpTest): conv3d_param = { 'stride': self.stride, 'pad': self.pad, - 'dilations': self.dilations, - 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter + 'dilations': self.dilations } input = np.random.random(self.input_size).astype(self.dtype) @@ -101,7 +102,9 @@ class TestConv3dOp(OpTest): 'paddings': self.pad, 'groups': self.groups, 'dilations': self.dilations, - 'use_cudnn': self.use_cudnn + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format } self.outputs = {'Output': output} From 8d6984eb9b901e99e6204abe74d10edd63bee935 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 3 Dec 2018 13:40:53 +0800 Subject: [PATCH 083/719] change OpHasAttr to RuntimeHasAttr, add some comments test=develop --- paddle/fluid/framework/ir/is_test_pass.cc | 2 +- paddle/fluid/framework/ir/is_test_pass_tester.cc | 4 ++-- paddle/fluid/framework/ir/mkldnn_placement_pass.cc | 2 +- paddle/fluid/framework/ir/node.cc | 2 +- paddle/fluid/framework/ir/node.h | 12 +++++++++++- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index a61bd5f291..6d8f020918 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -38,7 +38,7 @@ std::unique_ptr IsTestPass::ApplyImpl( for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); - if (n->OpHasAttr("is_test")) { + if (n->RuntimeHasAttr("is_test")) { op->SetAttr("is_test", true); } else if (std::find(begin(op_list), end(op_list), op->Type()) != end(op_list)) { diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index a5fb0abb3c..d9a68c7f1d 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -104,9 +104,9 @@ TEST(IsTestPass, basic) { auto* op = node->Op(); auto op_name = boost::get(op->GetAttr("name")); if (op_name == "conv3") { - ASSERT_FALSE(node->OpHasAttr("is_test")); + ASSERT_FALSE(node->RuntimeHasAttr("is_test")); } else { - ASSERT_TRUE(node->OpHasAttr("is_test")); + ASSERT_TRUE(node->RuntimeHasAttr("is_test")); EXPECT_TRUE(boost::get(op->GetAttr("is_test"))); } } diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc index 366057b01e..1cf1315d3d 100644 --- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc @@ -22,7 +22,7 @@ std::unique_ptr MKLDNNPlacementPass::ApplyImpl( std::unique_ptr graph) const { VLOG(3) << "Aplies MKL-DNN placement strategy."; for (const Node* n : graph->Nodes()) { - if (n->IsOp() && n->OpHasAttr("use_mkldnn")) { + if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) { n->Op()->SetAttr("use_mkldnn", true); } } diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 4c4da10b04..7a88cb2b68 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -30,7 +30,7 @@ std::unique_ptr CreateNodeForTest(const std::string &name, return std::unique_ptr(new Node(name, type)); } -bool Node::OpHasAttr(const std::string &name) const { +bool Node::RuntimeHasAttr(const std::string &name) const { if (Op()->HasAttr(name)) { return true; } else { diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index ac08006a49..1044a96430 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -108,7 +108,17 @@ class Node { Name().find(ir::Node::kControlDepVarName) != std::string::npos; } - bool OpHasAttr(const std::string& name) const; + // RuntimeHasAttr is different with HasAttr now. + // 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr, + // thus, if stored program_desc_ are old which don't have an attr, a new + // library which adds the attr already will fail on this function. + // Details: + // https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087 + // 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above + // problem. + // TODO(luotao): Maybe we should enhance HasAttr later, instead of adding + // RuntimeHasAttr. + bool RuntimeHasAttr(const std::string& name) const; std::vector inputs; std::vector outputs; From 64e261c6cdaae175dcd1c2ec78b67be3bb0ae36d Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Mon, 3 Dec 2018 13:57:58 +0800 Subject: [PATCH 084/719] Implement the fusion of convolution and bias for mkldnn (test=develop) --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../ir/conv3d_bias_mkldnn_fuse_pass.cc | 18 ++++++++++++ .../ir/conv3d_bias_mkldnn_fuse_pass.h | 29 +++++++++++++++++++ .../ir/conv_bias_mkldnn_fuse_pass.cc | 8 +++-- .../framework/ir/conv_bias_mkldnn_fuse_pass.h | 1 + .../framework/ir/graph_pattern_detector.cc | 25 +++++++++------- .../framework/ir/graph_pattern_detector.h | 2 +- .../fluid/inference/api/paddle_pass_builder.h | 7 +++-- .../tests/api/analyzer_dam_tester.cc | 12 ++++++-- 9 files changed, 83 insertions(+), 20 deletions(-) create mode 100644 paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 883575e41d..0bbfe3c0e2 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -46,6 +46,7 @@ if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) pass_library(conv_bias_mkldnn_fuse_pass inference) + pass_library(conv3d_bias_mkldnn_fuse_pass inference) pass_library(conv_relu_mkldnn_fuse_pass inference) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference) endif() diff --git a/paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000..e2968ddf60 --- /dev/null +++ b/paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.cc @@ -0,0 +1,18 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.h" + +REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass, + paddle::framework::ir::Conv3DBiasFusePass); diff --git a/paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.h new file mode 100644 index 0000000000..5afe2cc61e --- /dev/null +++ b/paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.h @@ -0,0 +1,29 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { +/* +* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp. +*/ +class Conv3DBiasFusePass : public ConvBiasFusePass { + public: + bool is_conv3d() const override { return true; } +}; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc index 449cc78be1..c3ad3a0f18 100644 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc @@ -46,14 +46,16 @@ std::unique_ptr ConvBiasFusePass::ApplyImpl( auto* scope = param_scope(); PADDLE_ENFORCE(scope); + std::string type = is_conv3d() ? "conv3d" : "conv2d"; + GraphPatternDetector gpd; auto* conv_input = gpd.mutable_pattern() ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) ->AsInput() - ->assert_is_op_input("conv2d", "Input"); + ->assert_is_op_input(type, "Input"); patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_); - conv_bias_pattern(conv_input); + conv_bias_pattern(conv_input, is_conv3d()); int found_conv_bias_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -109,7 +111,7 @@ std::unique_ptr ConvBiasFusePass::ApplyImpl( desc.SetInput("Filter", std::vector({conv_weight->Name()})); desc.SetInput("Bias", std::vector({eltwise_bias->Name()})); desc.SetOutput("Output", std::vector({eltwise_out->Name()})); - desc.SetType("conv2d"); + desc.SetType(type); for (auto& attr : conv->Op()->GetAttrMap()) { desc.SetAttr(attr.first, attr.second); diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h index 5775b83b88..c3b58bf582 100644 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h @@ -26,6 +26,7 @@ namespace ir { class ConvBiasFusePass : public FusePassBase { public: virtual ~ConvBiasFusePass() {} + virtual bool is_conv3d() const { return false; } protected: std::unique_ptr ApplyImpl(std::unique_ptr graph) const; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 258182b25a..ed99d9883b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1030,23 +1030,26 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( } PDNode *patterns::ConvBias::operator()( - paddle::framework::ir::PDNode *conv_input) { + paddle::framework::ir::PDNode *conv_input, bool is_conv3d) { // Create Operators - conv_input->assert_is_op_input("conv2d", "Input"); - auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + conv_input->assert_is_op_input(is_conv3d ? "conv3d" : "conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr()) + ->assert_is_op(is_conv3d ? "conv3d" : "conv2d"); auto *eltiwse_op = pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); // Create variables // Filter - auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) - ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input("conv2d", "Filter"); + auto *conv_weight_var = + pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input(is_conv3d ? "conv3d" : "conv2d", "Filter"); // intermediate variable, will be removed in the IR after fuse. - auto *conv_out_var = pattern->NewNode(conv_out_repr()) - ->AsIntermediate() - ->assert_is_only_output_of_op("conv2d") - ->assert_is_op_input("elementwise_add"); + auto *conv_out_var = + pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op(is_conv3d ? "conv3d" : "conv2d") + ->assert_is_op_input("elementwise_add"); // Bias stored in elementwise_add auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr()) ->AsInput() diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index c12b9503fd..d044802f22 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -623,7 +623,7 @@ struct ElewiseAddActInplaceGrad : public PatternBase { struct ConvBias : public PatternBase { ConvBias(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "conv_bias") {} - PDNode* operator()(PDNode* conv_input); + PDNode* operator()(PDNode* conv_input, bool is_conv3d = false); // declare operator node's name PATTERN_DECL_NODE(conv); PATTERN_DECL_NODE(eltwise); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 825bee833b..bc5139a7e5 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -98,9 +98,10 @@ class CpuPassStrategy : public PassStrategy { passes_.insert(passes_.begin(), "mkldnn_placement_pass"); for (auto &pass : - std::vector({"depthwise_conv_mkldnn_pass", // - "conv_bias_mkldnn_fuse_pass", // - "conv_relu_mkldnn_fuse_pass", // + std::vector({"depthwise_conv_mkldnn_pass", // + "conv_bias_mkldnn_fuse_pass", // + "conv3d_bias_mkldnn_fuse_pass", // + "conv_relu_mkldnn_fuse_pass", // "conv_elementwise_add_mkldnn_fuse_pass"})) { passes_.push_back(pass); } diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index a3a6130db7..6186c6a42a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -222,9 +222,12 @@ TEST(Analyzer_dam, fuse_statis) { } // Compare result of NativeConfig and AnalysisConfig -TEST(Analyzer_dam, compare) { - contrib::AnalysisConfig cfg; +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } std::vector> input_slots_all; SetInput(&input_slots_all); @@ -233,5 +236,10 @@ TEST(Analyzer_dam, compare) { reinterpret_cast(&cfg), input_slots_all); } +TEST(Analyzer_dam, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + } // namespace inference } // namespace paddle From b80fe8264ae636565fff118b2b2fdf71bbf39541 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 3 Dec 2018 14:07:36 +0800 Subject: [PATCH 085/719] polish test=develop --- paddle/fluid/framework/framework.proto | 2 +- paddle/fluid/imperative/tracer.h | 53 ++++++++++-------- paddle/fluid/pybind/imperative.cc | 19 ++----- python/paddle/fluid/framework.py | 16 +++--- python/paddle/fluid/imperative/base.py | 29 +++++++++- python/paddle/fluid/imperative/layers.py | 55 ++++++------------- python/paddle/fluid/layer_helper.py | 3 +- python/paddle/fluid/layers/nn.py | 1 - .../fluid/tests/unittests/test_imperative.py | 4 +- python/setup.py.in | 1 + 10 files changed, 96 insertions(+), 87 deletions(-) diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 6c60a041a1..efdabffb9b 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ syntax = "proto2"; -// option optimize_for = LITE_RUNTIME; +option optimize_for = LITE_RUNTIME; package paddle.framework.proto; // Any incompatible changes to ProgramDesc and its dependencies should diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index c3cc0cb33e..ff87993ffc 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -43,24 +43,31 @@ void CreateGradOp(const framework::OpDesc& op_desc, class Tracer { public: - Tracer() {} + explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { + root_scope_ = new framework::Scope(); + scopes_[root_block_] = root_scope_; + } + + virtual ~Tracer() { delete root_scope_; } void Trace(OpBase* op, const std::vector& inputs, - const std::vector& outputs) { + const std::vector& outputs, + framework::BlockDesc* block) { + framework::Scope* scope = GetScope(block); framework::OpDesc* op_desc = op->op_desc_; LOG(ERROR) << "tracer tracing " << op_desc->Type(); - op_desc->InferShape(*block_); - op_desc->InferVarType(block_); + op_desc->InferShape(*block); + op_desc->InferVarType(block); std::unique_ptr op_base = framework::OpRegistry::CreateOp(*op_desc); *op->input_vars_ = inputs; for (VarBase* input : inputs) { const std::string vname = input->var_desc_->Name(); - framework::Variable* var = scope_->Var(vname); + framework::Variable* var = scope->Var(vname); input->var_ = var; if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block_->FindVar(vname); + framework::VarDesc* var_desc = block->FindVar(vname); if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { var->GetMutable(); } else { @@ -78,9 +85,9 @@ class Tracer { *op->output_vars_ = outputs; for (size_t i = 0; i < outputs.size(); ++i) { const std::string vname = outputs[i]->var_desc_->Name(); - framework::Variable* var = scope_->Var(vname); + framework::Variable* var = scope->Var(vname); if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block_->FindVar(vname); + framework::VarDesc* var_desc = block->FindVar(vname); if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { var->GetMutable(); } else { @@ -91,28 +98,30 @@ class Tracer { outputs[i]->pre_op_ = op; outputs[i]->pre_op_out_idx_ = i; } - op_base->Run(*scope_, platform::CPUPlace()); - + op_base->Run(*scope, platform::CPUPlace()); framework::OpDesc* grad_op_desc; auto grad_to_var = new std::unordered_map(); - CreateGradOp(*op_desc, {}, {block_}, &grad_op_desc, grad_to_var); + CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); op->grad_op_desc_ = grad_op_desc; op->grad_to_var_ = grad_to_var; - op->block_ = block_; + op->block_ = block; } - void SetScope(framework::Scope* scope) { scope_ = scope; } - - void SetBlock(framework::BlockDesc* block) { block_ = block; } - - framework::Scope* Scope() const { return scope_; } - - framework::BlockDesc* Block() const { return block_; } + framework::Scope* GetScope(framework::BlockDesc* block) { + if (scopes_.find(block) != scopes_.end()) { + return scopes_.at(block); + } + framework::BlockDesc* parent_block = block->ParentBlock(); + PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); + framework::Scope* scope = &scopes_[parent_block]->NewScope(); + scopes_[block] = scope; + return scope; + } private: - framework::BlockDesc* block_; - framework::Scope* scope_; - std::vector runnables_; + std::map scopes_; + framework::BlockDesc* root_block_; + framework::Scope* root_scope_; }; } // namespace imperative diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 0e0f5a69a7..34e9c897d9 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -23,20 +23,13 @@ namespace pybind { // Bind Methods void BindTracer(pybind11::module *m) { pybind11::class_(*m, "Tracer", "") - .def(pybind11::init<>()) + .def("__init__", + [](imperative::Tracer &self, framework::BlockDesc *root_block) { + new (&self) imperative::Tracer(root_block); + }) .def("trace", &imperative::Tracer::Trace) - .def_property("scope", - [](const imperative::Tracer &self) { return self.Scope(); }, - [](imperative::Tracer &self, framework::Scope *scope) { - self.SetScope(scope); - }, - R"DOC()DOC") - .def_property("block", - [](const imperative::Tracer &self) { return self.Block(); }, - [](imperative::Tracer &self, framework::BlockDesc *block) { - self.SetBlock(block); - }, - R"DOC()DOC"); + .def("get_scope", &imperative::Tracer::GetScope, + pybind11::return_value_policy::reference); } } // namespace pybind diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index ba3ffafc85..9d1b86abf9 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -358,11 +358,13 @@ class Variable(core.VarBase): self.stop_gradient = stop_gradient self.is_data = is_data - def numpy(self, scope): + def numpy(self): + scope = _imperative_tracer().get_scope(self.block.desc) tensor = core.get_variable_tensor(scope, self.desc.name()) return np.array(tensor) - def backward(self, scope): + def backward(self): + scope = _imperative_tracer().get_scope(self.block.desc) self._run_backward(scope) def grad(self): @@ -668,14 +670,14 @@ class Operator(core.OpBase): for inp in inputs.values(): if isinstance(inp, Variable): input_vars.append(inp) - elif isinstance(inp, list): + elif isinstance(inp, list) or isinstance(inp, tuple): input_vars.extend(inp[:]) self.inputs = input_vars output_vars = [] for out in outputs.values(): if isinstance(out, Variable): output_vars.append(out) - elif isinstance(inp, list): + elif isinstance(out, list) or isinstance(out, tuple): output_vars.extend(out[:]) self.outputs = output_vars @@ -1246,7 +1248,7 @@ class Block(object): if _in_imperative_mode(): op_desc = core.OpDesc() op = Operator(block=self, desc=op_desc, *args, **kwargs) - _imperative_tracer().trace(op, op.inputs, op.outputs) + _imperative_tracer().trace(op, op.inputs, op.outputs, self.desc) else: op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) @@ -2257,9 +2259,9 @@ def _get_var(name, program=None): @contextlib.contextmanager -def _imperative_guard(): +def _imperative_guard(tracer): global _imperative_tracer_ tmp_trace = _imperative_tracer_ - _imperative_tracer_ = core.Tracer() + _imperative_tracer_ = tracer yield _imperative_tracer_ = tmp_trace diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index 900a65a3aa..15d38ddb56 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. import contextlib +import numpy as np + from paddle.fluid import core from paddle.fluid import framework -__all__ = ['enabled', 'guard'] +__all__ = ['enabled', 'guard', 'to_variable'] def enabled(): @@ -26,8 +28,29 @@ def enabled(): def guard(): train = framework.Program() startup = framework.Program() + tracer = core.Tracer(train.current_block().desc) with framework.program_guard(train, startup): with framework.unique_name.guard(): - with framework._imperative_guard(): + with framework._imperative_guard(tracer): yield - # TODO: check train, startup not changed. + + +def to_variable(value, block=None): + if isinstance(value, np.ndarray): + if not block: + block = framework.default_main_program().current_block() + py_var = framework.Variable( + block, + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=value.shape, + dtype=value.dtype) + scope = framework._imperative_tracer().get_scope(block.desc) + var = scope.var(py_var.name) + tensor = var.get_tensor() + tensor.set(value, core.CPUPlace()) + return py_var + elif isinstance(value, framework.Variable): + return value + else: + raise ValueError("Unsupported type %s" % type(value)) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 15f6939846..cb54a36a5e 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -18,51 +18,32 @@ import numpy as np from paddle.fluid import core from paddle.fluid import framework +from paddle.fluid.imperative import base __all__ = ['PyLayer'] -@contextlib.contextmanager -def trace_scope(scope, block): - tmp_scope = framework._imperative_tracer().scope - tmp_block = framework._imperative_tracer().block - framework._imperative_tracer().scope = scope - framework._imperative_tracer().block = block - yield - framework._imperative_tracer().scope = tmp_scope - framework._imperative_tracer().block = tmp_block - - class PyLayer(core.Layer): def __init__(self): - self._scope = core.Scope() - self._block = framework.default_main_program().current_block() + pass def __call__(self, inputs): - with trace_scope(self._scope, self._block.desc): - if not isinstance(inputs, list) and not isinstance(inputs, tuple): - inputs = [inputs] - - var_inputs = [] - for x in inputs: - if isinstance(x, np.ndarray): - py_var = framework.Variable( - self._block, - type=core.VarDesc.VarType.LOD_TENSOR, - name=None, - shape=x.shape, - dtype=x.dtype) - var = self._scope.var(py_var.name) - tensor = var.get_tensor() - tensor.set(x, core.CPUPlace()) - var_inputs.append(py_var) - elif isinstance(x, framework.Variable): - var_inputs.append(x) - else: - raise ValueError("not var or ndarray %s" % type(x)) - outputs = self.forward(var_inputs) - return outputs + # TODO(panyx0718): Support declarative mode as well. + assert base.enabled() + if not isinstance(inputs, list) and not isinstance(inputs, tuple): + inputs = [inputs] + + var_inputs = [] + for x in inputs: + if isinstance(x, np.ndarray): + py_var = base.to_variable(x) + var_inputs.append(py_var) + elif isinstance(x, framework.Variable): + var_inputs.append(x) + else: + raise ValueError("not var or ndarray %s" % type(x)) + outputs = self.forward(var_inputs) + return outputs def forward(self, inputs): - print("at python.") return [] diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index ca531f3be9..25fc843bf5 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -23,6 +23,7 @@ import numpy as np from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from . import unique_name from paddle.fluid.initializer import Constant, Xavier +from paddle.fluid.imperative import base from .param_attr import ParamAttr, WeightNormParamAttr from . import core from six.moves import zip @@ -62,7 +63,7 @@ class LayerHelper(object): if isinstance(x, Variable): return x elif isinstance(x, np.ndarray): - return self._np_to_variable(x) + return base.to_variable(x, self.main_program.current_block()) else: raise ValueError("inputs wrong type %s\n" % x) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 69ac968966..35232bd489 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -17,7 +17,6 @@ All layers just related to the neural network. from __future__ import print_function -import sys import numpy as np import os from ..layer_helper import LayerHelper diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 18fe8f7c09..5413bdc24e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -43,8 +43,8 @@ class TestImperative(unittest.TestCase): l = MyLayer() x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] self.assertIsNotNone(x) - sys.stderr.write("%s output: %s\n" % (x, x.numpy(scope=l._scope))) - x.backward(l._scope) + sys.stderr.write("%s output: %s\n" % (x, x.numpy())) + x.backward() sys.stderr.write("grad %s\n" % l._x_for_debug.grad()) diff --git a/python/setup.py.in b/python/setup.py.in index 200b96ec54..69c656a9cb 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -101,6 +101,7 @@ packages=['paddle', 'paddle.dataset', 'paddle.reader', 'paddle.fluid', + 'paddle.fluid.imperative', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', From ea00270fe8cf1130d0d6741811acaccaad50c0e3 Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Mon, 3 Dec 2018 14:16:07 +0800 Subject: [PATCH 086/719] Remove the dims checking when the dim is 3 (test=develop) --- paddle/fluid/operators/activation_mkldnn_op.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc index 64649b1a5e..7fa81e185e 100644 --- a/paddle/fluid/operators/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/activation_mkldnn_op.cc @@ -100,9 +100,6 @@ void eltwise_forward(const framework::ExecutionContext &ctx, const T *x_data = x->data(); T *y_data = y->mutable_data(ctx.GetPlace()); - PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4, - "Input dim must be with 2 or 4"); - std::vector src_tz = framework::vectorize2int(x->dims()); auto src_format = From 35e6b5e16ac16a3317a447d504f35a6a2a369729 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 3 Dec 2018 14:52:26 +0800 Subject: [PATCH 087/719] polish test=develop --- paddle/fluid/imperative/layer.cc | 30 ++++++++---------------------- paddle/fluid/imperative/layer.h | 5 +---- paddle/fluid/imperative/tracer.h | 2 +- tools/print_signatures.py | 4 ++++ 4 files changed, 14 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 2176ac78bb..6125037680 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -75,16 +75,6 @@ class Autograd { } private: - void AccumGrads(int grad_idx, Variable* grad, - std::vector* op_grads) { - if (!(*op_grads)[grad_idx]) { - // FIXME(panyx0718): This should be a deep copy. - (*op_grads)[grad_idx] = grad; - return; - } - AddTo(grad, (*op_grads)[grad_idx]); - } - std::map ComputeDepCounts(OpBase* op) { std::map ret; @@ -108,14 +98,6 @@ class Autograd { return ret; } - std::vector CreateOpGrads(size_t count) { - std::vector op_grads; - for (size_t i = 0; i < count; ++i) { - op_grads.push_back(nullptr); - } - return op_grads; - } - framework::Scope* scope_; }; @@ -133,7 +115,7 @@ framework::Variable* CreateVariable(const std::string& name, varname = string::Sprintf("%s@%d", varname, id); } - LOG(ERROR) << "creating var " << varname; + VLOG(3) << "creating var " << varname; framework::Variable* var = scope->Var(varname); framework::LoDTensor* tensor = var->GetMutable(); @@ -165,22 +147,25 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { + // grad op inputs can be forward inputs, so not in grad_to_var. continue; } - LOG(ERROR) << "op grad in var " << grad_invar; + VLOG(3) << "op grad in var " << grad_invar; block_->FindRecursiveOrCreateVar(grad_invar); framework::Variable* var = scope->Var(grad_invar); const std::string& invar = grad_to_var_->at(grad_invar); for (VarBase* varbase : *output_vars_) { + // Use the accumulated grads_ by sharing the input with grads_. if (varbase->var_desc_->Name() == invar) { var->GetMutable()->ShareDataWith( varbase->grads_->Get()); + break; } } } for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - LOG(ERROR) << "grad outvar " << outvar; + VLOG(3) << "grad outvar " << outvar; block_->FindRecursiveOrCreateVar(outvar); framework::Variable* var = scope->Var(outvar); if (!var->IsInitialized()) { @@ -199,6 +184,7 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { opbase->Run(*scope, platform::CPUPlace()); + // `ret` matches exactly with `input_vars_` of forward op. std::vector ret; for (size_t i = 0; i < input_vars_->size(); ++i) { bool found = false; @@ -207,7 +193,7 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { VarBase* origin_var = (*input_vars_)[i]; std::string orig_var = grad_to_var_->at(outvar); PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); - LOG(ERROR) << "apply grad " << outvar << " with origin " << orig_var; + VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; origin_var->ApplyGrad(scope, var); found = true; ret.push_back(var); diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 0cf0c27a6a..85a71ca83d 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -36,10 +36,7 @@ class VarBase { var_(nullptr), grads_(nullptr) {} - virtual ~VarBase() { - LOG(ERROR) << "deleting var"; - LOG(ERROR) << "done deleting var"; - } + virtual ~VarBase() {} void ApplyGrad(framework::Scope* scope, framework::Variable* grad); diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index ff87993ffc..433d07c0e5 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -55,7 +55,7 @@ class Tracer { framework::BlockDesc* block) { framework::Scope* scope = GetScope(block); framework::OpDesc* op_desc = op->op_desc_; - LOG(ERROR) << "tracer tracing " << op_desc->Type(); + VLOG(3) << "tracer tracing " << op_desc->Type(); op_desc->InferShape(*block); op_desc->InferVarType(block); std::unique_ptr op_base = diff --git a/tools/print_signatures.py b/tools/print_signatures.py index e2805c4e7e..90c6626ecd 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -27,6 +27,8 @@ import pydoc member_dict = collections.OrderedDict() +experimental_namespace = {"paddle.fluid.imperative"} + def visit_member(parent_name, member): cur_name = ".".join([parent_name, member.__name__]) @@ -50,6 +52,8 @@ def visit_member(parent_name, member): def visit_all_module(mod): + if (mod.__name__ in experimental_namespace): + return for member_name in ( name for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod)) From 82eefceabedfbed60d9edb6ec219cfc4920d27aa Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Mon, 3 Dec 2018 15:05:55 +0800 Subject: [PATCH 088/719] Add the profile_mkldnn flag for profile function(test=develop) --- .../fluid/inference/tests/api/analyzer_dam_tester.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 6186c6a42a..e8abcfce05 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -188,10 +188,14 @@ void SetInput(std::vector> *inputs) { } // Easy for profiling independently. -TEST(Analyzer_dam, profile) { +void profile(bool use_mkldnn = false) { contrib::AnalysisConfig cfg; SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } + std::vector outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -209,6 +213,11 @@ TEST(Analyzer_dam, profile) { } } +TEST(Analyzer_dam, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_dam, profile_mkldnn) { profile(true /* use_mkldnn */); } +#endif + // Check the fuse status TEST(Analyzer_dam, fuse_statis) { contrib::AnalysisConfig cfg; From 7464bd29e865499972043e634a9f23d71ee38b05 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 3 Dec 2018 15:44:20 +0800 Subject: [PATCH 089/719] polish test=develop --- python/paddle/fluid/framework.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 9d1b86abf9..f7c86d19f7 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -612,6 +612,7 @@ class Operator(core.OpBase): return True return False + self.inputs = [] if inputs is not None: for in_proto in proto.inputs: found = find_name(inputs, in_proto.name) @@ -638,6 +639,13 @@ class Operator(core.OpBase): else: self.desc.set_input(in_proto.name, []) + for inp in inputs.values(): + if isinstance(inp, Variable): + self.inputs.append(inp) + elif isinstance(inp, list) or isinstance(inp, tuple): + self.inputs.extend(inp[:]) + + self.outputs = [] if outputs is not None: given = set() need = set() @@ -666,20 +674,11 @@ class Operator(core.OpBase): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) - input_vars = [] - for inp in inputs.values(): - if isinstance(inp, Variable): - input_vars.append(inp) - elif isinstance(inp, list) or isinstance(inp, tuple): - input_vars.extend(inp[:]) - self.inputs = input_vars - output_vars = [] - for out in outputs.values(): - if isinstance(out, Variable): - output_vars.append(out) - elif isinstance(out, list) or isinstance(out, tuple): - output_vars.extend(out[:]) - self.outputs = output_vars + for out in outputs.values(): + if isinstance(out, Variable): + self.outputs.append(out) + elif isinstance(out, list) or isinstance(out, tuple): + self.outputs.extend(out[:]) if op_attrs is not None: if not isinstance(op_attrs, dict): From bcf36d8401d1d3ebaaaf7121dd88ab8eeeae3e36 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 3 Dec 2018 15:53:56 +0800 Subject: [PATCH 090/719] add more files to protected file list test=develop --- paddle/fluid/framework/selected_rows.h | 3 +-- paddle/scripts/paddle_build.sh | 12 +++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 44384082db..e1bdba9b46 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -32,8 +32,7 @@ namespace framework { class SelectedRows { /* * @brief We can use the SelectedRows structure to reproduce a sparse table. - * A sparse table is a key-value structure that the key is an `int64_t` - * number, + * A sparse table is a key-value structure that the key is an `int64_t`, * and the value is a Tensor which the first dimension is 0. * You can use the following interface to operate the sparse table, and you * can find diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index dbb73f7a27..4175ee47b2 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -487,7 +487,17 @@ function assert_api_spec_approvals() { BRANCH="develop" fi - API_FILES=("paddle/fluid/API.spec" "paddle/fluid/framework/operator.h") + API_FILES=("paddle/fluid/API.spec" + "paddle/fluid/framework/operator.h" + "paddle/fluid/framework/tensor.h" + "paddle/fluid/framework/lod_tensor.h" + "paddle/fluid/framework/selected_rows.h" + "paddle/fluid/framework/op_desc.h" + "paddle/fluid/framework/block_desc.h" + "paddle/fluid/framework/var_desc.h" + "paddle/fluid/framework/scope.h" + "paddle/fluid/framework/ir/node.h" + "paddle/fluid/framework/ir/graph.h") for API_FILE in ${API_FILES[*]}; do API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true` echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" From f75815b78c5a90c71f88e297c6ba23fb065862e4 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Mon, 3 Dec 2018 08:52:18 +0000 Subject: [PATCH 091/719] add prelu gpu inference --- .../tensorrt/convert/test_prelu_op.cc | 3 +- .../inference/tensorrt/plugin/CMakeLists.txt | 2 +- .../tensorrt/plugin/prelu_op_plugin.cu | 100 +++--------- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/operators/math/CMakeLists.txt | 1 + paddle/fluid/operators/math/prelu.cu | 148 ++++++++++++++++++ paddle/fluid/operators/math/prelu.h | 49 ++++++ paddle/fluid/operators/prelu_op.cc | 2 +- paddle/fluid/operators/prelu_op.cu | 64 ++++++++ 9 files changed, 284 insertions(+), 87 deletions(-) create mode 100644 paddle/fluid/operators/math/prelu.cu create mode 100644 paddle/fluid/operators/math/prelu.h create mode 100644 paddle/fluid/operators/prelu_op.cu diff --git a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc index 453f222f1f..b086c910d3 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc @@ -90,5 +90,4 @@ TEST(prelu_op, test_scalar) { } // namespace inference } // namespace paddle -// USE_OP(prelu); -USE_CPU_ONLY_OP(prelu); +USE_OP(prelu); diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index e822785ad6..95443e8133 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,4 +1,4 @@ nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu avg_pool_op_plugin.cu - DEPS enforce tensorrt_engine) + DEPS enforce tensorrt_engine prelu) diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu index e8f4254402..3075e87ea6 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -14,92 +14,16 @@ #include #include +#include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h" +#include "paddle/fluid/operators/math/prelu.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { -static const int CUDA_NUM_THREADS = 1024; -static const int CUDA_MAX_NUM_BLOCKS = 65535; -inline static int GET_NUM_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -__global__ void PReluChannelWiseKernel(const float *input, const float *alpha, - float *output, int channel, - size_t spatial_size) { - size_t offset = blockIdx.x * spatial_size; - const float *in = input + offset; - float *out = output + offset; - float scale = alpha[blockIdx.x % channel]; - - for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { - float x = in[i]; - out[i] = (x > 0) ? x : scale * x; - } -} - -__global__ void PReluElementWiseKernel(const float *input, const float *alpha, - float *output, size_t spatial_size) { - size_t offset = blockIdx.x * spatial_size; - const float *in = input + offset; - const float *scale = alpha + offset; - float *out = output + offset; - - for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { - float x = in[i]; - out[i] = (x > 0) ? x : scale[i] * x; - } -} - -__global__ void PReluScalarKernel(const float *input, const float *alpha, - float *output, size_t spatial_size) { - size_t offset = blockIdx.x * spatial_size; - const float *in = input + offset; - float scale = *alpha; - float *out = output + offset; - - for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { - float x = in[i]; - out[i] = (x > 0) ? x : scale * x; - } -} - -static inline void PReluChannelWise(cudaStream_t stream, const float *input, - const float *alpha, float *output, - int batch_size, - const nvinfer1::Dims &dims) { - size_t unroll = batch_size * dims.d[0]; - size_t spatial_size = dims.d[1] * dims.d[2]; - CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); - PReluChannelWiseKernel<<>>( - input, alpha, output, dims.d[0], spatial_size); -} - -static inline void PReluElementWise(cudaStream_t stream, const float *input, - const float *alpha, float *output, - int batch_size, - const nvinfer1::Dims &dims) { - size_t unroll = batch_size * dims.d[0]; - size_t spatial_size = dims.d[1] * dims.d[2]; - CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); - PReluElementWiseKernel<<>>( - input, alpha, output, spatial_size); -} - -static inline void PReluScalar(cudaStream_t stream, const float *input, - const float *alpha, float *output, - int batch_size, const nvinfer1::Dims &dims) { - size_t unroll = batch_size * dims.d[0]; - size_t spatial_size = dims.d[1] * dims.d[2]; - CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); - PReluScalarKernel<<>>( - input, alpha, output, spatial_size); -} - nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, int nbInputs) { @@ -110,19 +34,31 @@ nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, return output_dims; } -int PReluPlugin::enqueue(int batchSize, const void *const *inputs, +int PReluPlugin::enqueue(int batch_size, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) { // input dims is CHW. const auto &input_dims = this->getInputDims(0); const float *input = reinterpret_cast(inputs[0]); const float *alpha = reinterpret_cast(alpha_.get().values); float *output = reinterpret_cast(outputs)[0]; + + std::vector input_shape; + input_shape.push_back(batch_size); + for (int i = 0; i < input_dims.nbDims; i++) { + input_shape.push_back(input_dims.d[i]); + } + if (mode_ == "channel") { - PReluChannelWise(stream, input, alpha, output, batchSize, input_dims); + operators::math::PreluChannelWiseDirectCUDAFunctor + prelu_channel_wise; + prelu_channel_wise(stream, input, alpha, output, input_shape); } else if (mode_ == "element") { - PReluElementWise(stream, input, alpha, output, batchSize, input_dims); + operators::math::PreluElementWiseDirectCUDAFunctor + prelu_element_wise; + prelu_element_wise(stream, input, alpha, output, input_shape); } else { - PReluScalar(stream, input, alpha, output, batchSize, input_dims); + operators::math::PreluScalarDirectCUDAFunctor prelu_scalar; + prelu_scalar(stream, input, alpha, output, input_shape); } return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 8c8dc7026e..257bfc0a3f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -70,7 +70,7 @@ endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) endif() # FIXME(typhoonzero): operator deps may not needed. diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 63363086ad..b3d2ea38eb 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -59,6 +59,7 @@ math_library(matrix_bit_code) math_library(unpooling) math_library(vol2col) +math_library(prelu) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) diff --git a/paddle/fluid/operators/math/prelu.cu b/paddle/fluid/operators/math/prelu.cu new file mode 100644 index 0000000000..701a802080 --- /dev/null +++ b/paddle/fluid/operators/math/prelu.cu @@ -0,0 +1,148 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/prelu.h" + +namespace paddle { +namespace operators { +namespace math { + +static const int CUDA_NUM_THREADS = 1024; +static const int CUDA_MAX_NUM_BLOCKS = 65535; +inline static int GET_NUM_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__global__ void PReluChannelWiseKernel(const T *input, const T *alpha, + T *output, int channel, + size_t spatial_size) { + size_t offset = blockIdx.x * spatial_size; + const T *in = input + offset; + T *out = output + offset; + T scale = alpha[blockIdx.x % channel]; + + for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { + T x = in[i]; + out[i] = (x > 0) ? x : scale * x; + } +} + +template +__global__ void PReluElementWiseKernel(const T *input, const T *alpha, + T *output, size_t spatial_size) { + size_t offset = blockIdx.x * spatial_size; + const T *in = input + offset; + const T *scale = alpha + offset; + T *out = output + offset; + + for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { + T x = in[i]; + out[i] = (x > 0) ? x : scale[i] * x; + } +} + +template +__global__ void PReluScalarKernel(const T *input, const T *alpha, T *output, + size_t spatial_size) { + size_t offset = blockIdx.x * spatial_size; + const T *in = input + offset; + T scale = *alpha; + T *out = output + offset; + + for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { + T x = in[i]; + out[i] = (x > 0) ? x : scale * x; + } +} + +template +static inline void PReluChannelWise(cudaStream_t stream, const T *input, + const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluChannelWiseKernel<<>>( + input, alpha, output, input_shape[1], spatial_size); +} + +template +static inline void PReluElementWise(cudaStream_t stream, const T *input, + const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluElementWiseKernel<<>>( + input, alpha, output, spatial_size); +} + +template +static inline void PReluScalar(cudaStream_t stream, const T *input, + const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluScalarKernel<<>>( + input, alpha, output, spatial_size); +} + +template +void PreluChannelWiseDirectCUDAFunctor::operator()( + cudaStream_t stream, const T *input, const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluChannelWiseKernel<<>>( + input, alpha, output, input_shape[1], spatial_size); +} + +template +void PreluElementWiseDirectCUDAFunctor::operator()( + cudaStream_t stream, const T *input, const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluElementWiseKernel<<>>( + input, alpha, output, spatial_size); +} + +template +void PreluScalarDirectCUDAFunctor::operator()(cudaStream_t stream, + const T *input, const T *alpha, + T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluScalarKernel<<>>( + input, alpha, output, spatial_size); +} + +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; + +template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; + +template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h new file mode 100644 index 0000000000..3237c6d4cb --- /dev/null +++ b/paddle/fluid/operators/math/prelu.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +#ifdef PADDLE_WITH_CUDA +template +class PreluChannelWiseDirectCUDAFunctor { + public: + void operator()(cudaStream_t stream, const T *input, const T *alpha, + T *output, std::vector input_shape); +}; + +template +class PreluElementWiseDirectCUDAFunctor { + public: + void operator()(cudaStream_t stream, const T *input, const T *alpha, + T *output, std::vector input_shape); +}; + +template +class PreluScalarDirectCUDAFunctor { + public: + void operator()(cudaStream_t stream, const T *input, const T *alpha, + T *output, std::vector input_shape); +}; +#endif + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 58cfbb76e9..64d94ab604 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -58,7 +58,7 @@ class PReluOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu new file mode 100644 index 0000000000..36b5259ae5 --- /dev/null +++ b/paddle/fluid/operators/prelu_op.cu @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/prelu.h" +#include "paddle/fluid/operators/prelu_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CUDAPReluKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* alpha = context.Input("Alpha"); + auto* out = context.Output("Out"); + + const T* x_ptr = x->data(); + T* o_ptr = out->mutable_data(context.GetPlace()); + + const T* alpha_ptr = alpha->data(); + auto& mode = context.Attr("mode"); + + int numel = x->numel(); + auto dim = x->dims(); + std::vector input_shape = framework::vectorize2int(dim); + + if (mode == "channel") { + math::PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; + prelu_channel_wise(context.cuda_device_context().stream(), x_ptr, + alpha_ptr, o_ptr, input_shape); + } else if (mode == "element") { + math::PreluElementWiseDirectCUDAFunctor prelu_element_wise; + prelu_element_wise(context.cuda_device_context().stream(), x_ptr, + alpha_ptr, o_ptr, input_shape); + } else { + math::PreluScalarDirectCUDAFunctor prelu_scalar; + prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr, + o_ptr, input_shape); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + prelu, ops::CUDAPReluKernel, + ops::CUDAPReluKernel); From a77fa67bbd11131f0c8c3683b903b2ceeeca41a0 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Mon, 3 Dec 2018 17:44:18 +0800 Subject: [PATCH 092/719] async_thread_trainer & libmct & pslib.cmake --- cmake/external/libmct.cmake | 17 +- cmake/external/pslib.cmake | 2 +- paddle/fluid/framework/async_executor.cc | 83 +++- paddle/fluid/framework/async_executor.h | 41 +- .../fluid/framework/executor_thread_worker.cc | 456 ++++++++++++++++++ .../fluid/framework/executor_thread_worker.h | 150 +++++- paddle/fluid/pybind/async_executor_py.cc | 6 +- python/paddle/fluid/async_executor.py | 13 + 8 files changed, 745 insertions(+), 23 deletions(-) diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake index 351806f6e1..239183cb6d 100644 --- a/cmake/external/libmct.cmake +++ b/cmake/external/libmct.cmake @@ -40,9 +40,6 @@ SET(LIBMCT_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(LIBMCT_INSTALL_DIR ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR}) SET(LIBMCT_ROOT ${LIBMCT_INSTALL_DIR}) SET(LIBMCT_INC_DIR ${LIBMCT_ROOT}/include) -SET(LIBMCT_LIB_DIR ${LIBMCT_ROOT}/lib) -SET(LIBMCT_LIB ${LIBMCT_LIB_DIR}/libps.so) -SET(LIBMCT_IOMP_LIB ${LIBMCT_LIB_DIR}/libiomp5.so) #todo what is this SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib") INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR}) @@ -66,11 +63,15 @@ ExternalProject_Add( CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT} ) -ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET libmct PROPERTY IMPORTED_LOCATION ${LIBMCT_LIB}) +if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(libmct STATIC ${dummyfile}) +else() + add_library(libmct INTERFACE) +endif() + +#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL) ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT}) LIST(APPEND external_project_dependencies libmct) -IF(WITH_C_API) - INSTALL(FILES ${LIBMCT_LIB} ${LIBMCT_IOMP_LIB} DESTINATION lib) -ENDIF() diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index 812af5efa2..586f66d6fd 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -66,7 +66,7 @@ ExternalProject_Add( CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT} ) -ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) +ADD_LIBRARY(pslib STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) LIST(APPEND external_project_dependencies pslib) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index aa76e03e83..94ed8c2fca 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -48,6 +48,10 @@ void AsyncExecutor::CreateThreads( worker->SetDataFeed(reader); worker->SetFetchVarNames(fetch_var_names); worker->BindingDataFeedMemory(); + worker->SetPSlibPtr(_pslib_ptr); + worker->SetPullDenseThread(_pull_dense_thread); + worker->BindingSlotVariableMemory(); + worker->SetParamConfig(&_param_config); } void PrepareReaders(std::vector>& readers, // NOLINT @@ -61,6 +65,77 @@ void PrepareReaders(std::vector>& readers, // NOLINT readers[0]->SetFileList(filelist); } +void AsyncExecutor::ConfigPslib(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index) { + _pslib_ptr = std::shared_ptr(new paddle::distributed::PSlib()); + _pslib_ptr->init_and_config(dist_desc, host_sign_list, node_num, index);//TODO +} + +void AsyncExecutor::StartServer() { + _pslib_ptr->run_server(); +} + +void AsyncExecutor::InitModel() { + //TODO only rank = 0 do this + std::vector all_dense_table_id; //TODO + all_dense_table_id.push_back(0); + for (auto table_id: all_dense_table_id) { + std::vector regions; + std::vector variables; //TODO + for (auto& t : variables) { + Variable* var = root_scope_->FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + + float* g = tensor->data(); + CHECK(g != nullptr) << "var[" << t << "] value not initialized"; + + float init_range = 0.2; + int rown = tensor->dims()[0]; + init_range /= sqrt(rown); + + std::normal_distribution ndistr(0.0, 1.0); + for (auto i = 0u; i < tensor->numel(); ++i) { + g[i] = ndistr(local_random_engine()) * init_range; + } + + paddle::ps::Region reg(g, tensor->numel()); + regions.emplace_back(std::move(reg)); + } + + auto push_status = _pslib_ptr->_worker_ptr->push_dense_param(regions.data(), regions.size(), table_id); + push_status.wait(); + auto status = push_status.get(); + if (status != 0) { + LOG(FATAL) << "push dense param failed, status[" << status << "]"; + exit(-1); + } + } +} + +void AsyncExecutor::SaveModel(const std::string& path) { + auto ret = _pslib_ptr->_worker_ptr->flush(); + ret.wait(); + ret = _pslib_ptr->_worker_ptr->save(path, 0); + ret.wait(); + int32_t feasign_cnt = ret.get(); + if (feasign_cnt == -1) { // TODO should be feasign_cnt < 0, because server bug + LOG(FATAL) << "save model failed"; + exit(-1); + } +} + +void AsyncExecutor::PrepareDenseThread() { + DensePullThreadParam param; + param.ps_client = _pslib_ptr->_worker_ptr;; + param.threshold = 1;//GlobalConfig::instance().pull_dense_per_batch; //TODO + param.training_thread_num = actual_thread_num; + param.root_scope = root_scope_; + //param.dense_params = &GlobalConfig::instance().dense_variable_name; //TODO + + _pull_dense_thread = std::shared_ptr(new DensePullThread(param)); + +} + void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, const std::string& data_feed_desc_str, const std::vector& filelist, @@ -83,7 +158,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, google::protobuf::TextFormat::ParseFromString(data_feed_desc_str, &data_feed_desc); - int actual_thread_num = thread_num; + actual_thread_num = thread_num; int file_cnt = filelist.size(); PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty"); @@ -107,11 +182,11 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, // todo: should be factory method for creating datafeed std::vector> readers; PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist); - + PrepareDenseThread(); std::vector> workers; workers.resize(actual_thread_num); for (auto& worker : workers) { - worker.reset(new ExecutorThreadWorker); + worker.reset(new AsyncExecutorThreadWorker); } // prepare thread resource here @@ -129,7 +204,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, for (auto& th : threads) { th.join(); } - + _pull_dense_thread->stop(); root_scope_->DropKids(); return; diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 6aa59c89dc..67f4e5deee 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -22,6 +22,8 @@ limitations under the License. */ #include // NOLINT #include #include +#include //local_random_engine +#include //local_random_engine #include "paddle/fluid/framework/data_feed.pb.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor_thread_worker.h" @@ -30,6 +32,26 @@ limitations under the License. */ namespace paddle { namespace framework { + +inline double current_realtime() { + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + return tp.tv_sec + tp.tv_nsec * 1e-9; +} + +inline std::default_random_engine& local_random_engine() { + struct engine_wrapper_t { + std::default_random_engine engine; + engine_wrapper_t() { + static std::atomic x(0); + std::seed_seq sseq = {x++, x++, x++, (unsigned long)(current_realtime() * 1000)}; + engine.seed(sseq); + } + }; + thread_local engine_wrapper_t r; + return r.engine; +} + class AsyncExecutor { public: AsyncExecutor(Scope* scope, const platform::Place& place); @@ -40,9 +62,12 @@ class AsyncExecutor { const int thread_num, const std::vector& fetch_names, const bool debug = false); - void ConfigServer() {} - void ConfigWorker() {} - void StartServer() {} + //void ConfigPslib(const char* dist_desc, uint64_t* host_sign_list, int node_num, int index); + void ConfigPslib(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index); + //void ConfigWorker() {} + void StartServer(); + void InitModel(); + void SaveModel(const std::string& path); private: void CreateThreads(ExecutorThreadWorker* worker, @@ -51,11 +76,19 @@ class AsyncExecutor { const std::vector& fetch_var_names, Scope* root_scope, const int thread_index, const bool debug); - + void PrepareDenseThread(); public: + std::shared_ptr _pslib_ptr; + std::shared_ptr _pull_dense_thread; Scope* root_scope_; platform::Place place_; + + AsyncWorkerParamConfig _param_config; + private: + int actual_thread_num; }; + + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 4e4001e979..19d8818be7 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -31,6 +31,85 @@ limitations under the License. */ namespace paddle { namespace framework { +int DensePullThread::start() { + _running = true; + _t = std::thread(&DensePullThread::run, this); + return 0; +} + +void DensePullThread::run() { + while (_running) { + _pull_dense_status.resize(0); + for (auto& t : _dense_variable_name) { + if (check_update_param(t.first)) { + auto status = pull_dense(t.first); + _pull_dense_status.emplace_back(std::move(status)); + reset_thread_version(t.first); + } + } + if (_pull_dense_status.size() != 0) { + wait_all(); + } + + usleep(_sleep_time_ms * 1000); + } +} +bool DensePullThread::check_update_param(uint64_t table_id) { + { + std::lock_guard lock(_mutex_for_version); + auto& version = _training_versions[table_id]; + _current_version[table_id] = *(std::min_element(version.begin(), version.end())); + } + if (_current_version[table_id] - _last_versions[table_id] < _threshold) { + return false; + } + return true; +} + +void DensePullThread::reset_thread_version(uint64_t table_id) { + std::lock_guard lock(_mutex_for_version); + _last_versions[table_id] = _current_version[table_id]; +} +std::future DensePullThread::pull_dense(uint64_t table_id) { + auto& regions = _regions[table_id]; + regions.clear(); + auto& variables = _dense_variable_name[table_id]; + regions.resize(variables.size()); + + for (auto i = 0u; i < variables.size(); ++i) { + auto& t = variables[i]; + Variable* var = _root_scope->FindVar(t); + LoDTensor* tensor = var->GetMutable(); + + float* w = tensor->data(); + paddle::ps::Region reg(w, tensor->numel()); + regions[i] = std::move(reg); + } + return _ps_client->pull_dense(regions.data(), regions.size(), table_id); +} + +void DensePullThread::wait_all() { + for (auto& t : _pull_dense_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times; + } + } + + if (_pull_dense_fail_times > 20) { + LOG(FATAL) << "pull dense failed times more than 20 times"; + exit(-1); + } + + _pull_dense_status.resize(0); +} + +void DensePullThread::increase_thread_version(int thread_id, uint64_t table_id) { + std::lock_guard lock(_mutex_for_version); + _training_versions[table_id][thread_id]++; +} + void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) { auto& block = program.Block(0); op_names_.clear(); @@ -90,6 +169,11 @@ void ExecutorThreadWorker::SetFetchVarNames( fetch_var_names.end()); } +void ExecutorThreadWorker::SetPSlibPtr(std::shared_ptr pslib_ptr) { + +} + + void ExecutorThreadWorker::SetDevice() { #if defined _WIN32 || defined __APPLE__ return; @@ -219,5 +303,377 @@ void ExecutorThreadWorker::SetRootScope(Scope* g_scope) { root_scope_ = g_scope; } +//AsyncExecutor +void AsyncExecutorThreadWorker::TrainFiles() { + SetDevice(); + + int fetch_var_num = fetch_var_names_.size(); + fetch_values_.clear(); + fetch_values_.resize(fetch_var_num); + + thread_reader_->Start(); + + int cur_batch; + int batch_cnt = 0; + while ((cur_batch = thread_reader_->Next()) > 0) { + // executor run here + TrainOneNetwork(); + + ++batch_cnt; + thread_scope_->DropKids(); + + if (debug_ == false || thread_id_ != 0) { + continue; + } + + for (int i = 0; i < fetch_var_num; ++i) { + print_fetch_var(thread_scope_, fetch_var_names_[i]); + } // end for (int i = 0...) + } // end while () +} + +void AsyncExecutorThreadWorker::SetPSlibPtr(std::shared_ptr pslib_ptr) { + _pslib_ptr = pslib_ptr; +} +void AsyncExecutorThreadWorker::SetPullDenseThread(std::shared_ptr dpt) { + _pull_dense_thread = dpt; +} +void AsyncExecutorThreadWorker::TrainOneNetwork() { + PrepareParams(); + + for (auto& op : ops_) { + if (op->Type().find("sgd") != std::string::npos) { + continue; + } + op->Run(*thread_scope_, place_); + } + + UpdateParams(); +} + +void AsyncExecutorThreadWorker::BindingSlotVariableMemory() { + /* + std::vector ins_slot_offset(batch_size + 1, 0); + for (auto i = 1u; i <= batch_size; ++i) { + ins_slot_offset[i] += ins_slot_offset[i - 1] + slot_dim; + } + + std::vector tensor_lod(batch_size + 1, 0); + for (auto i = 1u; i <= batch_size; ++i) { + tensor_lod[i] += tensor_lod[i - 1] + 1; + } + + auto& used_slots = reader->get_use_slot_alias(); + slot_input_vec.resize(used_slots.size() - 1); + for (auto slot_idx = 1u; slot_idx < used_slots.size(); ++slot_idx) { + auto var = slot_input_variable_name[slot_idx]; + + auto v = thread_scope->FindVar(var); + CHECK(v != nullptr) << "var[" << var << "] not found"; + + LoDTensor* tensor = v->GetMutable(); + float* tensor_ptr = tensor->mutable_data({batch_size, slot_dim}, platform::CPUPlace()); + memset(tensor_ptr, 0, sizeof(float) * ins_slot_offset.back()); + + LoD data_lod{tensor_lod}; + tensor->set_lod(data_lod); + + slot_input_vec[slot_idx - 1].reset(tensor); + } + */ +} +void AsyncExecutorThreadWorker::SetParamConfig(AsyncWorkerParamConfig* pc) { + _param_config = pc; +} + +void AsyncExecutorThreadWorker::PrepareParams() { + int table_id = 0; //TODO + PullSparse(table_id); + for (auto& t : _pull_sparse_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(ERROR) << "pull sparse failed, status[" << status << "]"; + exit(-1); + } + } + _pull_sparse_status.resize(0); + + FillSparse(table_id); +} + +void AsyncExecutorThreadWorker::UpdateParams() { + //for (auto i = 0u; i < GlobalConfig::instance().dense_table_id.size(); ++i) {//TODO + for (int i = 0; i < 1; ++i) { + PushSparse(i); + } + //for (auto i = 0u; i < GlobalConfig::instance().dense_table_id.size(); ++i) {//TODO + for (int i = 1; i < 2; ++i) { + PushDense(i); + } + int32_t tmp_push_dense_wait_times = _param_config->tmp_push_dense_wait_times; //TODO + int32_t tmp_push_sparse_wait_times = _param_config->tmp_push_sparse_wait_times; //TODO + static uint32_t push_dense_wait_times = static_cast(tmp_push_dense_wait_times); + static uint32_t push_sparse_wait_times = static_cast(tmp_push_sparse_wait_times); + + if (_push_dense_status.size() >= push_dense_wait_times) { + for (auto& t : _push_dense_status) { + t.wait(); + } + _push_dense_status.resize(0); + } + if (tmp_push_dense_wait_times == -1) { + _push_dense_status.resize(0); + } + + if (_push_sparse_status.size() >= push_sparse_wait_times) { + for (auto& t : _push_sparse_status) { + t.wait(); + } + _push_sparse_status.resize(0); + } + if (tmp_push_sparse_wait_times == -1) { + _push_sparse_status.resize(0); + } + + //for (auto dense_table_id : GlobalConfig::instance().dense_table_id) {//TODO + int dense_table_id = 1; + _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id); + //} +} + +void AsyncExecutorThreadWorker::PushDense(int table_id) { + //auto table_id = GlobalConfig::instance().dense_table_id[table_id_index]; TODO + + std::vector regions; + //auto& variables = GlobalConfig::instance().dense_gradient_variable_name[table_id]; + std::vector variables; + for (auto& t : variables) { + Variable* var = thread_scope_->FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + int count = tensor->numel(); + float* g = tensor->data(); + paddle::ps::Region reg(g, count); + regions.emplace_back(std::move(reg)); + } + + auto status = _pslib_ptr->_worker_ptr->push_dense(regions.data(), regions.size(), table_id); + _push_dense_status.push_back(std::move(status)); + +} + +void AsyncExecutorThreadWorker::PullSparse(int table_id) { + + + auto& features = _features[table_id]; + auto& feature_value = _feature_value[table_id]; + auto fea_dim = _param_config->fea_dim; //TODO + // slot id starts from 1 + features.clear(); + features.resize(0); + features.reserve(MAX_FEASIGN_NUM); + + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + for (auto i = 0u; i < len; ++i) { + //todo: current trick - filter feasign=use_slot_mod(bug: datafeed fill use_slot_mod for empty slot) + if (ids[i] == 0u) { + continue; + } + features.push_back(static_cast(ids[i])); + } + } + + check_pull_push_memory(features, feature_value, fea_dim); + + std::vector pull_feature_value; + for (auto i = 0u; i < features.size(); ++i) { + pull_feature_value.push_back(feature_value[i].data()); + } + + auto status = _pslib_ptr->_worker_ptr->pull_sparse( + pull_feature_value.data(), table_id, features.data(), features.size()); + _pull_sparse_status.push_back(std::move(status)); + + //to save time + auto& push_g = _feature_push_value[table_id]; + check_pull_push_memory(features, push_g, fea_dim); + + //binding_slot_embed_with_concat(); TODO + collect_feasign_info(table_id); //TODO +} + +void AsyncExecutorThreadWorker::FillSparse(int table_id) { + auto slot_dim = _param_config->slot_dim; // TODO + auto fea_dim = _param_config->fea_dim; //TODO + auto& features = _features[table_id]; + auto& fea_value = _feature_value[table_id]; + + CHECK(features.size() > 0) << "feature size check failed"; + + auto fea_idx = 0u; + + std::vector init_value(fea_dim); + + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + + Variable* var_emb = thread_scope_->FindVar(_param_config->slot_input_vec[slot_idx - 1]); + LoDTensor* tensor_emb = var_emb->GetMutable(); + float* ptr = tensor_emb->data(); + + for (auto index = 0u; index < len; ++index){ + //if (_current_train_job.use_cvm_feature()) { + // if (ids[index] == 0u) { + // memcpy(ptr + slot_dim * index, init_value.data(), sizeof(float) * slot_dim); + // continue; + // } + // memcpy(ptr + slot_dim * index, fea_value[fea_idx].data(), sizeof(float) * slot_dim); + // (ptr + slot_dim * index)[0] = log((ptr + slot_dim * index)[0] + 1); + // (ptr + slot_dim * index)[1] = log((ptr + slot_dim * index)[1] + 1) - (ptr + slot_dim * index)[0]; + // fea_idx++; + //} else { + if (ids[index] == 0u) { + memcpy(ptr + slot_dim * index, init_value.data() + 2, sizeof(float) * slot_dim); + continue; + } + memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2, sizeof(float) * slot_dim); + fea_idx++; + //} + } + } +} + +void AsyncExecutorThreadWorker::PushSparse(int table_id) { + + auto slot_dim = _param_config->slot_dim; //TODO + auto fea_dim = _param_config->fea_dim;//_current_train_job.fea_dim();TODO + auto& features = _features[table_id]; + //std::vector gradient_var; + //auto& gradient_var = GlobalConfig::instance().input_gradient_variable_name; //TODO + auto& push_g = _feature_push_value[table_id]; + check_pull_push_memory(features, push_g, fea_dim); + uint64_t fea_idx = 0u; + auto& fea_info = _fea_info[table_id]; //TODO + int offset = 0; + //if (!_current_train_job.use_cvm_feature()) { //TODO + offset = 2; + //} + + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + if (_slot_alias_to_table[feed_vec[slot_idx]] != table_id) { + continue; + } + Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[slot_idx - 1]); + LoDTensor* g_tensor = g_var->GetMutable(); + //int count = g_tensor->numel(); + float* g = g_tensor->data(); + /* + if (FLAGS_scale_sparse_gradient_with_batch_size) { + Eigen::Map g_mat(g, 1, tensor->numel()); + g_mat *= _batch_size; + } + */ + + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int len = tensor->lod()[0].back(); + //assert(slot_dim * len == count); + int64_t* ids = tensor->data(); + for (auto id_idx = 0u; id_idx < len; ++id_idx){ + if (ids[id_idx] == 0) { + g += slot_dim; + continue; + } + memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim); + push_g[fea_idx][0] = 1.0f; + push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); + g += slot_dim; + fea_idx++; + } + } + assert(fea_idx == features.size()); + CHECK(features.size() > 0); + + std::vector push_g_vec; + for (auto i = 0u; i < features.size(); ++i) { + push_g_vec.push_back(push_g[i].data()); + } + auto status = _pslib_ptr->_worker_ptr->push_sparse( + table_id, features.data(), (const float**)push_g_vec.data(), features.size()); + _push_sparse_status.push_back(std::move(status)); +} + +void AsyncExecutorThreadWorker::collect_feasign_info( + int table_id) { + auto& fea_info = _fea_info[table_id]; + auto& feature = _features[table_id]; + fea_info.resize(feature.size()); + + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + Variable* var = thread_scope_->FindVar(feed_vec[0]); + LoDTensor* tensor = var->GetMutable(); + int64_t* label = tensor->data(); + + int global_index = 0; + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + + int fea_idx = 0; + for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) { + for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) { + if (ids[fea_idx] == 0u) { + continue; + } + FeasignInfo info{slot_idx, ins_idx, label[ins_idx - 1]}; + + fea_info[global_index++] = std::move(info); + } + } + } + CHECK(global_index == feature.size()) << "expect fea info size:" << feature.size() + << " real:" << global_index; +} + +void AsyncExecutorThreadWorker::check_pull_push_memory( + std::vector& features, + std::vector>& push_g, + int dim) { + push_g.resize(features.size() + 1); + for (auto& t : push_g) { + t.resize(dim); + } +} + +void AsyncExecutorThreadWorker::check_pull_push_memory( + std::vector& features, + std::vector& push_g, + int dim) { + if (features.size() > push_g.size()) { + push_g.reserve(features.size() + 1); + auto size = features.size() - push_g.size() + 1; + for (auto i = 0u; i < size; ++i) { + float* ptr = new float[dim]; + push_g.push_back(ptr); + } + } +} + } // einit_modelnd namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 13ec2442c4..63f383cd47 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -25,16 +25,107 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "pslib.h" namespace paddle { namespace framework { + +const static uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; + void CreateTensor(Variable* var, proto::VarType::Type var_type); +struct AsyncWorkerParamConfig { + int slot_dim; + int fea_dim; + int32_t tmp_push_dense_wait_times; + int32_t tmp_push_sparse_wait_times; + + std::vector slot_input_vec; //6048slot 6050slot //name + std::vector gradient_var; //6048slot_embed +}; + +struct DensePullThreadParam { + std::shared_ptr ps_client; + int threshold; + int training_thread_num; + Scope* root_scope; + std::map>* dense_params; + int sleep_time_ms = 2; +}; + +class DensePullThread { +public: + DensePullThread(DensePullThreadParam& param) : + _running(false) { + _ps_client = param.ps_client; + _threshold = param.threshold; + _thread_num = param.training_thread_num; + _root_scope = param.root_scope; + _sleep_time_ms = param.sleep_time_ms; + + for (auto& t : *param.dense_params) { + _dense_variable_name[t.first].insert( + _dense_variable_name[t.first].end(), + t.second.begin(), t.second.end()); + _training_versions[t.first].resize(_thread_num, 0); + _last_versions[t.first] = 0; + _current_version[t.first] = 0; + } + } + + int start(); + + void stop() { + if (_running) { + _running = false; + _t.join(); + } + } + + void increase_thread_version(int thread_id, uint64_t table_id); + void reset_thread_version(uint64_t table_id); + std::future pull_dense(uint64_t table_id); + void pull_dense2(uint64_t table_id); + void wait_all(); + +private: + void run(); + bool check_update_param(uint64_t table_id); + +private: + std::shared_ptr _ps_client; + int _thread_num; + int _threshold; + int _sleep_time_ms; + Scope* _root_scope; + bool _running; + + std::map _last_versions; + std::map _current_version; + std::mutex _mutex_for_version; + std::map> _training_versions; + std::map> _dense_variable_name; + + std::thread _t; + + std::vector<::std::future> _pull_dense_status; + + std::map> _regions; + uint32_t _pull_dense_fail_times = 0; + + std::vector _base_norm_param; + std::vector _mean; + std::vector _scale; + float _squared_sum_epsilon = 1e-4; + std::mutex _mutex_for_mean_scale; + + float _total_batch_num = 0; +}; class ExecutorThreadWorker { public: ExecutorThreadWorker() : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} - ~ExecutorThreadWorker() {} + virtual ~ExecutorThreadWorker() {} void CreateThreadResource(const framework::ProgramDesc& program, const paddle::platform::Place& place); @@ -51,10 +142,13 @@ class ExecutorThreadWorker { // set data feed declared in executor void SetDataFeed(const std::shared_ptr& datafeed); // A multi-thread training function - void TrainFiles(); + virtual void TrainFiles(); // set fetch variable names from python interface assigned by users void SetFetchVarNames(const std::vector& fetch_var_names); - + virtual void SetPSlibPtr(std::shared_ptr pslib_ptr); + virtual void SetPullDenseThread(std::shared_ptr dpt) {}; + virtual void BindingSlotVariableMemory() {}; + virtual void SetParamConfig(AsyncWorkerParamConfig* pc) {}; private: void CreateThreadScope(const framework::ProgramDesc& program); void CreateThreadOperators(const framework::ProgramDesc& program); @@ -77,12 +171,58 @@ class ExecutorThreadWorker { Scope* root_scope_; // a thread scope, father scope is global score which is shared Scope* thread_scope_; - - private: + //private: std::vector fetch_var_names_; std::vector> fetch_values_; bool debug_; }; +class AsyncExecutorThreadWorker: public ExecutorThreadWorker { +public: + AsyncExecutorThreadWorker(){}; + virtual ~AsyncExecutorThreadWorker() {} + void SetPSlibPtr(std::shared_ptr pslib_ptr); + void SetPullDenseThread(std::shared_ptr dpt); + void BindingSlotVariableMemory(); + void SetParamConfig(AsyncWorkerParamConfig* pc); + void TrainFiles(); + void TrainOneNetwork(); + void PrepareParams(); + void UpdateParams(); + void PullSparse(int table_id); + void FillSparse(int table_id); + void PushSparse(int table_id); + void PushDense(int table_id); + + void check_pull_push_memory(std::vector& features, std::vector& push_g, int dim); + void check_pull_push_memory(std::vector& features, std::vector>& push_g, int dim); + void collect_feasign_info(int table_id); +private: + struct FeasignInfo { + uint32_t slot; + uint32_t ins; + int64_t label; + }; + + std::map> _features; + std::map> _fea_info; + std::map>> _feature_value; + std::map>> _feature_push_value; + + std::unordered_map _slot_alias_to_table; //TODO + + std::shared_ptr _pslib_ptr; + + std::shared_ptr _pull_dense_thread; + + std::vector<::std::future> _pull_sparse_status; + std::vector<::std::future> _pull_dense_status; + std::vector<::std::future> _push_sparse_status; + std::vector<::std::future> _push_dense_status; + + AsyncWorkerParamConfig* _param_config; + +}; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc index 470e8b0508..63fd06224f 100644 --- a/paddle/fluid/pybind/async_executor_py.cc +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -47,7 +47,11 @@ void BindAsyncExecutor(py::module* m) { return std::unique_ptr( new framework::AsyncExecutor(scope, place)); })) - .def("run_from_files", &framework::AsyncExecutor::RunFromFile); + .def("run_from_files", &framework::AsyncExecutor::RunFromFile) + .def("config_pslib", &framework::AsyncExecutor::ConfigPslib) + .def("start_server", &framework::AsyncExecutor::StartServer) + .def("init_model", &framework::AsyncExecutor::InitModel) + .def("save_model", &framework::AsyncExecutor::SaveModel); } // end BindAsyncExecutor } // end namespace pybind } // end namespace paddle diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 2664a7301d..2945e6e143 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -149,3 +149,16 @@ class AsyncExecutor(object): self.executor.run_from_files(program_desc, data_feed.desc(), filelist, thread_num, fetch_var_names, debug) + + def config_ps(self, dist_desc, host_sign_list, node_num, index): + self.executor.config_pslib(dist_desc, host_sign_list, node_num, index) + + def start_server(self): + self.executor.start_server() + + def init_model(self): + self.executor.init_model() + + def save_model(self, save_path): + self.executor.save_model(save_path) + From e650b42914eca57c8d5a9f743e10788d9cc39828 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Mon, 3 Dec 2018 17:47:13 +0800 Subject: [PATCH 093/719] async_thread_trainer & libmct & pslib.cmake --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c929396ff..6fd8dd1dfa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -217,7 +217,7 @@ include(cupti) include(external/gzstream) endif (NOT WIN32) include(external/libmct) -include(external/pslib_brpc) +#include(external/pslib_brpc) include(external/pslib) if(WITH_DISTRIBUTE) @@ -280,6 +280,8 @@ set(EXTERNAL_LIBS zlib ${PYTHON_LIBRARIES} pslib + #pslib_brpc + libmct ) if(WITH_AMD_GPU) From ee4c51a372be97076f06cc7c61f624c3b65b501e Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 3 Dec 2018 18:00:21 +0800 Subject: [PATCH 094/719] refine downpour sgd API with pslib --- python/paddle/fluid/distributed/__init__.py | 0 python/paddle/fluid/distributed/downpour.py | 30 +- python/paddle/fluid/distributed/node.py | 30 +- python/paddle/fluid/distributed/ps_pb2.py | 1491 +++++++++++++++++++ 4 files changed, 1526 insertions(+), 25 deletions(-) create mode 100644 python/paddle/fluid/distributed/__init__.py create mode 100644 python/paddle/fluid/distributed/ps_pb2.py diff --git a/python/paddle/fluid/distributed/__init__.py b/python/paddle/fluid/distributed/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 523f686668..551a471495 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -1,34 +1,32 @@ -import paddle.fluid as fluid -import pslib_pb2 as pslib from .node import DownpourServer from .node import DownpourWorker +from ..backward import append_backward +import ps_pb2 as pslib from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table +from google.protobuf import text_format class DownpourSGD(object): - def __init__(self, optimizer=opt, learning_rate=0.001, window=1): + def __init__(self, learning_rate=0.001, window=1): # todo(guru4elephant): if optimizer is not None, will warning here - self.learning_rate_ = opt.learning_rate + self.learning_rate_ = learning_rate self.window_ = window - def minimize(self, loss, startup_program=None, - parameter_list=None, no_grad_set=None, + def minimize(self, loss, startup_program=None, + parameter_list=None, no_grad_set=None, prefetch_slots=None, prefetch_slots_emb=None): params_grads = sorted(append_backward(loss), key=lambda x:x[0].name) - table_name = fluid_distributed_lookup_table(loss.block.program) + table_name = find_distributed_lookup_table(loss.block.program) server = DownpourServer() - worker = DownpourWorker() - server.add_sparse_table(0, learning_rate, + worker = DownpourWorker(self.window_) + server.add_sparse_table(0, learning_rate, prefetch_slots, prefetch_slots_emb) server.add_dense_table(1, learning_rate, params, grads) - worker.add_sparse_table(0, learning_rate, + worker.add_sparse_table(0, learning_rate, prefetch_slots, prefetch_slots_emb) worker.add_dense_table(1, learning_rate, params, grads) - ps_param = pslib.PSParameter() ps_param.server_param.CopyFrom(server.get_desc()) - ps_param.worker_param.CopyFrom(worker.get_desc()) + #ps_param.worker_param.CopyFrom(worker.get_desc()) worker_skipped_ops = ["lookup_table", "lookup_table_grad"] - - return [solver_desc, parallel_desc] - - + ps_param_str = text_format.MessageToString(ps_param) + return [ps_param_str, worker_skipped_ops] diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index fc62d7220c..3344bba137 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -1,5 +1,4 @@ -import paddle.fluid as fluid -import pslib_pb2 as pslib +import ps_pb2 as pslib class Server(object): def __init__(self): @@ -13,11 +12,13 @@ class Worker(object): class DownpourServer(Server): def __init__(self): - self.server_ = pslib.ServerParameter().downpour_server_param + #self.server_ = pslib.ServerParameter().downpour_server_param + self.server_ = pslib.ServerParameter() def add_sparse_table(self, table_id, learning_rate, slot_key, slot_value_var, slot_grad_var): - table = self.server_.downpour_table_param.add() + #table = self.server_.downpour_table_param.add() + table = self.server_.downpour_server_param.downpour_table_param.add() table.table_id = table_id table.type = PS_SPARSE_TABLE table.accessor.accessor_class = "DownpourFeatureValueAccessor" @@ -26,12 +27,14 @@ class DownpourServer(Server): def add_dense_table(self, table_id, learning_rate, param_var, grad_var): - table = self.server_.downpour_table_param.add() + #table = self.server_.downpour_table_param.add() + table = self.server_.downpour_server_param.downpour_table_param.add() table.table_id = table_id table.type = PS_DENSE_TABLE table.accessor.accessor_class = "DownpourDenseValueAccessor" table.accessor.sparse_sgd_param.learning_rate = learning_rate - table.accessor.fea_dim = reduce(lambda x, y: x.shape, 1 for x in param_var) + table.accessor.fea_dim = 1 + #table.accessor.fea_dim = reduce(lambda x, y: x.shape, 1 for x in param_var) def get_desc(self): return self.server_ @@ -40,19 +43,28 @@ class DownpourServer(Server): class DownpourWorker(Worker): def __init__(self, window): self.window = window - self.worker_ = pslib.WorkerParameter().downpour_worker_param + #self.worker_ = pslib.WorkerParameter().downpour_worker_param + #self.worker_ = pslib.WorkerParameter() + self.worker_ = pslib.DownpourTrainerParameter() + #self.worker_.pull_dense_per_batch = window + #self.worker_.push_dense_per_batch = window + #self.worker_.downpour_worker_param.pull_dense_per_batch = window + #self.worker_.downpour_worker_param.push_dense_per_batch = window self.worker_.pull_dense_per_batch = window self.worker_.push_dense_per_batch = window + print(self.worker_) def add_sparse_table(self, table_id, slot_keys, slot_value_vars, slot_grad_vars): - table = self.worker_.sparse_table.add() + #table = self.worker_.sparse_table.add() + table = self.worker_.downpour_worker_param.sparse_table.add() table.table_id = table_id table.slot.extend(slot_keys) self.worker_.extend([grad.name for grad in slot_grad_vars]) def add_dense_table(self, table_id, param_vars, grad_vars): - table = self.worker_.dense_table.add() + #table = self.worker_.dense_table.add() + table = self.worker_.downpour_worker_param.dense_table.add() table.table_id = table_id table.dense_variable_name.extend([p.name for p in param_vars]) table.dense_gradient_variable_name.extend([g.name for g in grad_vars]) diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py new file mode 100644 index 0000000000..355841aba8 --- /dev/null +++ b/python/paddle/fluid/distributed/ps_pb2.py @@ -0,0 +1,1491 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: ps.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='ps.proto', + package='paddle', + syntax='proto2', + serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\xe4\x01\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xbc\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x02 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x03 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1c\n\x14pull_dense_per_batch\x18\x04 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x05 \x01(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\x91\x01\n\x16ServerServiceParameter\x12\x14\n\x0cserver_class\x18\x01 \x01(\t\x12\x14\n\x0c\x63lient_class\x18\x02 \x01(\t\x12\x15\n\rservice_class\x18\x03 \x01(\t\x12\x19\n\x11start_server_port\x18\x04 \x01(\r\x12\x19\n\x11server_thread_num\x18\x05 \x01(\r\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') +) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +_TABLETYPE = _descriptor.EnumDescriptor( + name='TableType', + full_name='paddle.TableType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='PS_SPARSE_TABLE', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_DENSE_TABLE', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=3140, + serialized_end=3192, +) +_sym_db.RegisterEnumDescriptor(_TABLETYPE) + +TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE) +_PSCMDID = _descriptor.EnumDescriptor( + name='PsCmdID', + full_name='paddle.PsCmdID', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='PS_PULL_DENSE_TABLE', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_DENSE_TABLE', index=1, number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PULL_SPARSE_TABLE', index=2, number=2, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_SPARSE_TABLE', index=3, number=3, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SHRINK_TABLE', index=4, number=4, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SAVE_ONE_TABLE', index=5, number=5, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SAVE_ALL_TABLE', index=6, number=6, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_LOAD_ONE_TABLE', index=7, number=7, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_LOAD_ALL_TABLE', index=8, number=8, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_CLEAR_ONE_TABLE', index=9, number=9, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_CLEAR_ALL_TABLE', index=10, number=10, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_DENSE_PARAM', index=11, number=11, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_STOP_SERVER', index=12, number=12, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=3195, + serialized_end=3512, +) +_sym_db.RegisterEnumDescriptor(_PSCMDID) + +PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID) +PS_SPARSE_TABLE = 0 +PS_DENSE_TABLE = 1 +PS_PULL_DENSE_TABLE = 0 +PS_PUSH_DENSE_TABLE = 1 +PS_PULL_SPARSE_TABLE = 2 +PS_PUSH_SPARSE_TABLE = 3 +PS_SHRINK_TABLE = 4 +PS_SAVE_ONE_TABLE = 5 +PS_SAVE_ALL_TABLE = 6 +PS_LOAD_ONE_TABLE = 7 +PS_LOAD_ALL_TABLE = 8 +PS_CLEAR_ONE_TABLE = 9 +PS_CLEAR_ALL_TABLE = 10 +PS_PUSH_DENSE_PARAM = 11 +PS_STOP_SERVER = 12 + + +_FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( + name='FsApiType', + full_name='paddle.FsClientParameter.FsApiType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='HDFS', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='AFS', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=3108, + serialized_end=3138, +) +_sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) + + +_PSPARAMETER = _descriptor.Descriptor( + name='PSParameter', + full_name='paddle.PSParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='worker_class', full_name='paddle.PSParameter.worker_class', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_class', full_name='paddle.PSParameter.server_class', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='instance_class', full_name='paddle.PSParameter.instance_class', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='worker_param', full_name='paddle.PSParameter.worker_param', index=3, + number=101, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_param', full_name='paddle.PSParameter.server_param', index=4, + number=102, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fs_client_param', full_name='paddle.PSParameter.fs_client_param', index=5, + number=501, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=21, + serialized_end=249, +) + + +_WORKERPARAMETER = _descriptor.Descriptor( + name='WorkerParameter', + full_name='paddle.WorkerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_worker_param', full_name='paddle.WorkerParameter.downpour_worker_param', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=251, + serialized_end=332, +) + + +_SERVERPARAMETER = _descriptor.Descriptor( + name='ServerParameter', + full_name='paddle.ServerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_server_param', full_name='paddle.ServerParameter.downpour_server_param', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=334, + serialized_end=415, +) + + +_DOWNPOURWORKERPARAMETER = _descriptor.Descriptor( + name='DownpourWorkerParameter', + full_name='paddle.DownpourWorkerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_table_param', full_name='paddle.DownpourWorkerParameter.downpour_table_param', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=417, + serialized_end=496, +) + + +_DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( + name='DownpourTrainerParameter', + full_name='paddle.DownpourTrainerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='dense_table', full_name='paddle.DownpourTrainerParameter.dense_table', index=0, + number=2, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='sparse_table', full_name='paddle.DownpourTrainerParameter.sparse_table', index=1, + number=3, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='pull_dense_per_batch', full_name='paddle.DownpourTrainerParameter.pull_dense_per_batch', index=2, + number=4, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='push_dense_per_batch', full_name='paddle.DownpourTrainerParameter.push_dense_per_batch', index=3, + number=5, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=499, + serialized_end=687, +) + + +_DENSETABLEPARAMETER = _descriptor.Descriptor( + name='DenseTableParameter', + full_name='paddle.DenseTableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', full_name='paddle.DenseTableParameter.table_id', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_variable_name', full_name='paddle.DenseTableParameter.dense_variable_name', index=1, + number=2, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_gradient_variable_name', full_name='paddle.DenseTableParameter.dense_gradient_variable_name', index=2, + number=3, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fea_dim', full_name='paddle.DenseTableParameter.fea_dim', index=3, + number=4, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=689, + serialized_end=812, +) + + +_SPARSETABLEPARAMETER = _descriptor.Descriptor( + name='SparseTableParameter', + full_name='paddle.SparseTableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', full_name='paddle.SparseTableParameter.table_id', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='feature_dim', full_name='paddle.SparseTableParameter.feature_dim', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_key', full_name='paddle.SparseTableParameter.slot_key', index=2, + number=3, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_value', full_name='paddle.SparseTableParameter.slot_value', index=3, + number=4, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_gradient', full_name='paddle.SparseTableParameter.slot_gradient', index=4, + number=5, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=814, + serialized_end=936, +) + + +_DOWNPOURSERVERPARAMETER = _descriptor.Descriptor( + name='DownpourServerParameter', + full_name='paddle.DownpourServerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_table_param', full_name='paddle.DownpourServerParameter.downpour_table_param', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='service_param', full_name='paddle.DownpourServerParameter.service_param', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=939, + serialized_end=1073, +) + + +_SERVERSERVICEPARAMETER = _descriptor.Descriptor( + name='ServerServiceParameter', + full_name='paddle.ServerServiceParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='server_class', full_name='paddle.ServerServiceParameter.server_class', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='client_class', full_name='paddle.ServerServiceParameter.client_class', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='service_class', full_name='paddle.ServerServiceParameter.service_class', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='start_server_port', full_name='paddle.ServerServiceParameter.start_server_port', index=3, + number=4, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_thread_num', full_name='paddle.ServerServiceParameter.server_thread_num', index=4, + number=5, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1076, + serialized_end=1221, +) + + +_TABLEPARAMETER = _descriptor.Descriptor( + name='TableParameter', + full_name='paddle.TableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', full_name='paddle.TableParameter.table_id', index=0, + number=1, type=4, cpp_type=4, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_class', full_name='paddle.TableParameter.table_class', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='shared_num', full_name='paddle.TableParameter.shared_num', index=2, + number=3, type=4, cpp_type=4, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='accessor', full_name='paddle.TableParameter.accessor', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='type', full_name='paddle.TableParameter.type', index=4, + number=5, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='compress_in_save', full_name='paddle.TableParameter.compress_in_save', index=5, + number=6, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1224, + serialized_end=1415, +) + + +_TABLEACCESSORPARAMETER = _descriptor.Descriptor( + name='TableAccessorParameter', + full_name='paddle.TableAccessorParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='accessor_class', full_name='paddle.TableAccessorParameter.accessor_class', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='sparse_sgd_param', full_name='paddle.TableAccessorParameter.sparse_sgd_param', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_sgd_param', full_name='paddle.TableAccessorParameter.dense_sgd_param', index=2, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fea_dim', full_name='paddle.TableAccessorParameter.fea_dim', index=3, + number=4, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='embedx_dim', full_name='paddle.TableAccessorParameter.embedx_dim', index=4, + number=5, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='embedx_threshold', full_name='paddle.TableAccessorParameter.embedx_threshold', index=5, + number=6, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='downpour_accessor_param', full_name='paddle.TableAccessorParameter.downpour_accessor_param', index=6, + number=7, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_accessor_save_param', full_name='paddle.TableAccessorParameter.table_accessor_save_param', index=7, + number=8, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1418, + serialized_end=1787, +) + + +_DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( + name='DownpourTableAccessorParameter', + full_name='paddle.DownpourTableAccessorParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='nonclk_coeff', full_name='paddle.DownpourTableAccessorParameter.nonclk_coeff', index=0, + number=1, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='click_coeff', full_name='paddle.DownpourTableAccessorParameter.click_coeff', index=1, + number=2, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='base_threshold', full_name='paddle.DownpourTableAccessorParameter.base_threshold', index=2, + number=3, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delta_threshold', full_name='paddle.DownpourTableAccessorParameter.delta_threshold', index=3, + number=4, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delta_keep_days', full_name='paddle.DownpourTableAccessorParameter.delta_keep_days', index=4, + number=5, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='show_click_decay_rate', full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate', index=5, + number=6, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delete_threshold', full_name='paddle.DownpourTableAccessorParameter.delete_threshold', index=6, + number=7, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1790, + serialized_end=1996, +) + + +_TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( + name='TableAccessorSaveParameter', + full_name='paddle.TableAccessorSaveParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='param', full_name='paddle.TableAccessorSaveParameter.param', index=0, + number=1, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='converter', full_name='paddle.TableAccessorSaveParameter.converter', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='deconverter', full_name='paddle.TableAccessorSaveParameter.deconverter', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1998, + serialized_end=2081, +) + + +_PSREQUESTMESSAGE = _descriptor.Descriptor( + name='PsRequestMessage', + full_name='paddle.PsRequestMessage', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='cmd_id', full_name='paddle.PsRequestMessage.cmd_id', index=0, + number=1, type=13, cpp_type=3, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_id', full_name='paddle.PsRequestMessage.table_id', index=1, + number=2, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='params', full_name='paddle.PsRequestMessage.params', index=2, + number=3, type=12, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='client_id', full_name='paddle.PsRequestMessage.client_id', index=3, + number=4, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='data', full_name='paddle.PsRequestMessage.data', index=4, + number=5, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=_b(""), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2083, + serialized_end=2184, +) + + +_SPARSESGDRULEPARAMETER = _descriptor.Descriptor( + name='SparseSGDRuleParameter', + full_name='paddle.SparseSGDRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', full_name='paddle.SparseSGDRuleParameter.learning_rate', index=0, + number=1, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='initial_g2sum', full_name='paddle.SparseSGDRuleParameter.initial_g2sum', index=1, + number=2, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='initial_range', full_name='paddle.SparseSGDRuleParameter.initial_range', index=2, + number=3, type=1, cpp_type=5, label=1, + has_default_value=True, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='weight_bounds', full_name='paddle.SparseSGDRuleParameter.weight_bounds', index=3, + number=4, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2186, + serialized_end=2305, +) + + +_DENSESGDRULEPARAMETER = _descriptor.Descriptor( + name='DenseSGDRuleParameter', + full_name='paddle.DenseSGDRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='paddle.DenseSGDRuleParameter.name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='adam', full_name='paddle.DenseSGDRuleParameter.adam', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='naive', full_name='paddle.DenseSGDRuleParameter.naive', index=2, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='summary', full_name='paddle.DenseSGDRuleParameter.summary', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='moving_average', full_name='paddle.DenseSGDRuleParameter.moving_average', index=4, + number=5, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2308, + serialized_end=2533, +) + + +_ADAMSGDPARAMETER = _descriptor.Descriptor( + name='AdamSGDParameter', + full_name='paddle.AdamSGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', full_name='paddle.AdamSGDParameter.learning_rate', index=0, + number=1, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='avg_decay_rate', full_name='paddle.AdamSGDParameter.avg_decay_rate', index=1, + number=2, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ada_decay_rate', full_name='paddle.AdamSGDParameter.ada_decay_rate', index=2, + number=3, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ada_epsilon', full_name='paddle.AdamSGDParameter.ada_epsilon', index=3, + number=4, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='mom_decay_rate', full_name='paddle.AdamSGDParameter.mom_decay_rate', index=4, + number=5, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2536, + serialized_end=2670, +) + + +_NAIVESGDPARAMETER = _descriptor.Descriptor( + name='NaiveSGDParameter', + full_name='paddle.NaiveSGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', full_name='paddle.NaiveSGDParameter.learning_rate', index=0, + number=1, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='avg_decay_rate', full_name='paddle.NaiveSGDParameter.avg_decay_rate', index=1, + number=2, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2672, + serialized_end=2738, +) + + +_SUMMARYSGDPARAMETER = _descriptor.Descriptor( + name='SummarySGDParameter', + full_name='paddle.SummarySGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='summary_decay_rate', full_name='paddle.SummarySGDParameter.summary_decay_rate', index=0, + number=1, type=1, cpp_type=5, label=1, + has_default_value=True, default_value=float(0.999999), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2740, + serialized_end=2799, +) + + +_MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( + name='MovingAverageRuleParameter', + full_name='paddle.MovingAverageRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='momentum', full_name='paddle.MovingAverageRuleParameter.momentum', index=0, + number=1, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2801, + serialized_end=2847, +) + + +_PSRESPONSEMESSAGE = _descriptor.Descriptor( + name='PsResponseMessage', + full_name='paddle.PsResponseMessage', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='err_code', full_name='paddle.PsResponseMessage.err_code', index=0, + number=1, type=5, cpp_type=1, label=2, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='err_msg', full_name='paddle.PsResponseMessage.err_msg', index=1, + number=2, type=9, cpp_type=9, label=2, + has_default_value=True, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='data', full_name='paddle.PsResponseMessage.data', index=2, + number=3, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=_b(""), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2849, + serialized_end=2922, +) + + +_FSCLIENTPARAMETER = _descriptor.Descriptor( + name='FsClientParameter', + full_name='paddle.FsClientParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='fs_type', full_name='paddle.FsClientParameter.fs_type', index=0, + number=1, type=14, cpp_type=8, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='uri', full_name='paddle.FsClientParameter.uri', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='user', full_name='paddle.FsClientParameter.user', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='passwd', full_name='paddle.FsClientParameter.passwd', index=3, + number=4, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='buffer_size', full_name='paddle.FsClientParameter.buffer_size', index=4, + number=5, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='hadoop_bin', full_name='paddle.FsClientParameter.hadoop_bin', index=5, + number=51, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='afs_conf', full_name='paddle.FsClientParameter.afs_conf', index=6, + number=101, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + _FSCLIENTPARAMETER_FSAPITYPE, + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2925, + serialized_end=3138, +) + +_PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER +_PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER +_PSPARAMETER.fields_by_name['fs_client_param'].message_type = _FSCLIENTPARAMETER +_WORKERPARAMETER.fields_by_name['downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER +_SERVERPARAMETER.fields_by_name['downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER +_DOWNPOURWORKERPARAMETER.fields_by_name['downpour_table_param'].message_type = _TABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name['dense_table'].message_type = _DENSETABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name['sparse_table'].message_type = _SPARSETABLEPARAMETER +_DOWNPOURSERVERPARAMETER.fields_by_name['downpour_table_param'].message_type = _TABLEPARAMETER +_DOWNPOURSERVERPARAMETER.fields_by_name['service_param'].message_type = _SERVERSERVICEPARAMETER +_TABLEPARAMETER.fields_by_name['accessor'].message_type = _TABLEACCESSORPARAMETER +_TABLEPARAMETER.fields_by_name['type'].enum_type = _TABLETYPE +_TABLEACCESSORPARAMETER.fields_by_name['sparse_sgd_param'].message_type = _SPARSESGDRULEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name['dense_sgd_param'].message_type = _DENSESGDRULEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name['downpour_accessor_param'].message_type = _DOWNPOURTABLEACCESSORPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name['table_accessor_save_param'].message_type = _TABLEACCESSORSAVEPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name['adam'].message_type = _ADAMSGDPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name['naive'].message_type = _NAIVESGDPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name['summary'].message_type = _SUMMARYSGDPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name['moving_average'].message_type = _MOVINGAVERAGERULEPARAMETER +_FSCLIENTPARAMETER.fields_by_name['fs_type'].enum_type = _FSCLIENTPARAMETER_FSAPITYPE +_FSCLIENTPARAMETER_FSAPITYPE.containing_type = _FSCLIENTPARAMETER +DESCRIPTOR.message_types_by_name['PSParameter'] = _PSPARAMETER +DESCRIPTOR.message_types_by_name['WorkerParameter'] = _WORKERPARAMETER +DESCRIPTOR.message_types_by_name['ServerParameter'] = _SERVERPARAMETER +DESCRIPTOR.message_types_by_name['DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER +DESCRIPTOR.message_types_by_name['DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER +DESCRIPTOR.message_types_by_name['DenseTableParameter'] = _DENSETABLEPARAMETER +DESCRIPTOR.message_types_by_name['SparseTableParameter'] = _SPARSETABLEPARAMETER +DESCRIPTOR.message_types_by_name['DownpourServerParameter'] = _DOWNPOURSERVERPARAMETER +DESCRIPTOR.message_types_by_name['ServerServiceParameter'] = _SERVERSERVICEPARAMETER +DESCRIPTOR.message_types_by_name['TableParameter'] = _TABLEPARAMETER +DESCRIPTOR.message_types_by_name['TableAccessorParameter'] = _TABLEACCESSORPARAMETER +DESCRIPTOR.message_types_by_name['DownpourTableAccessorParameter'] = _DOWNPOURTABLEACCESSORPARAMETER +DESCRIPTOR.message_types_by_name['TableAccessorSaveParameter'] = _TABLEACCESSORSAVEPARAMETER +DESCRIPTOR.message_types_by_name['PsRequestMessage'] = _PSREQUESTMESSAGE +DESCRIPTOR.message_types_by_name['SparseSGDRuleParameter'] = _SPARSESGDRULEPARAMETER +DESCRIPTOR.message_types_by_name['DenseSGDRuleParameter'] = _DENSESGDRULEPARAMETER +DESCRIPTOR.message_types_by_name['AdamSGDParameter'] = _ADAMSGDPARAMETER +DESCRIPTOR.message_types_by_name['NaiveSGDParameter'] = _NAIVESGDPARAMETER +DESCRIPTOR.message_types_by_name['SummarySGDParameter'] = _SUMMARYSGDPARAMETER +DESCRIPTOR.message_types_by_name['MovingAverageRuleParameter'] = _MOVINGAVERAGERULEPARAMETER +DESCRIPTOR.message_types_by_name['PsResponseMessage'] = _PSRESPONSEMESSAGE +DESCRIPTOR.message_types_by_name['FsClientParameter'] = _FSCLIENTPARAMETER +DESCRIPTOR.enum_types_by_name['TableType'] = _TABLETYPE +DESCRIPTOR.enum_types_by_name['PsCmdID'] = _PSCMDID + +PSParameter = _reflection.GeneratedProtocolMessageType('PSParameter', (_message.Message,), dict( + DESCRIPTOR = _PSPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PSParameter) + )) +_sym_db.RegisterMessage(PSParameter) + +WorkerParameter = _reflection.GeneratedProtocolMessageType('WorkerParameter', (_message.Message,), dict( + DESCRIPTOR = _WORKERPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.WorkerParameter) + )) +_sym_db.RegisterMessage(WorkerParameter) + +ServerParameter = _reflection.GeneratedProtocolMessageType('ServerParameter', (_message.Message,), dict( + DESCRIPTOR = _SERVERPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ServerParameter) + )) +_sym_db.RegisterMessage(ServerParameter) + +DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType('DownpourWorkerParameter', (_message.Message,), dict( + DESCRIPTOR = _DOWNPOURWORKERPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter) + )) +_sym_db.RegisterMessage(DownpourWorkerParameter) + +DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType('DownpourTrainerParameter', (_message.Message,), dict( + DESCRIPTOR = _DOWNPOURTRAINERPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter) + )) +_sym_db.RegisterMessage(DownpourTrainerParameter) + +DenseTableParameter = _reflection.GeneratedProtocolMessageType('DenseTableParameter', (_message.Message,), dict( + DESCRIPTOR = _DENSETABLEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter) + )) +_sym_db.RegisterMessage(DenseTableParameter) + +SparseTableParameter = _reflection.GeneratedProtocolMessageType('SparseTableParameter', (_message.Message,), dict( + DESCRIPTOR = _SPARSETABLEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter) + )) +_sym_db.RegisterMessage(SparseTableParameter) + +DownpourServerParameter = _reflection.GeneratedProtocolMessageType('DownpourServerParameter', (_message.Message,), dict( + DESCRIPTOR = _DOWNPOURSERVERPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter) + )) +_sym_db.RegisterMessage(DownpourServerParameter) + +ServerServiceParameter = _reflection.GeneratedProtocolMessageType('ServerServiceParameter', (_message.Message,), dict( + DESCRIPTOR = _SERVERSERVICEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter) + )) +_sym_db.RegisterMessage(ServerServiceParameter) + +TableParameter = _reflection.GeneratedProtocolMessageType('TableParameter', (_message.Message,), dict( + DESCRIPTOR = _TABLEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableParameter) + )) +_sym_db.RegisterMessage(TableParameter) + +TableAccessorParameter = _reflection.GeneratedProtocolMessageType('TableAccessorParameter', (_message.Message,), dict( + DESCRIPTOR = _TABLEACCESSORPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter) + )) +_sym_db.RegisterMessage(TableAccessorParameter) + +DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType('DownpourTableAccessorParameter', (_message.Message,), dict( + DESCRIPTOR = _DOWNPOURTABLEACCESSORPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourTableAccessorParameter) + )) +_sym_db.RegisterMessage(DownpourTableAccessorParameter) + +TableAccessorSaveParameter = _reflection.GeneratedProtocolMessageType('TableAccessorSaveParameter', (_message.Message,), dict( + DESCRIPTOR = _TABLEACCESSORSAVEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableAccessorSaveParameter) + )) +_sym_db.RegisterMessage(TableAccessorSaveParameter) + +PsRequestMessage = _reflection.GeneratedProtocolMessageType('PsRequestMessage', (_message.Message,), dict( + DESCRIPTOR = _PSREQUESTMESSAGE, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage) + )) +_sym_db.RegisterMessage(PsRequestMessage) + +SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType('SparseSGDRuleParameter', (_message.Message,), dict( + DESCRIPTOR = _SPARSESGDRULEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter) + )) +_sym_db.RegisterMessage(SparseSGDRuleParameter) + +DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType('DenseSGDRuleParameter', (_message.Message,), dict( + DESCRIPTOR = _DENSESGDRULEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter) + )) +_sym_db.RegisterMessage(DenseSGDRuleParameter) + +AdamSGDParameter = _reflection.GeneratedProtocolMessageType('AdamSGDParameter', (_message.Message,), dict( + DESCRIPTOR = _ADAMSGDPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter) + )) +_sym_db.RegisterMessage(AdamSGDParameter) + +NaiveSGDParameter = _reflection.GeneratedProtocolMessageType('NaiveSGDParameter', (_message.Message,), dict( + DESCRIPTOR = _NAIVESGDPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter) + )) +_sym_db.RegisterMessage(NaiveSGDParameter) + +SummarySGDParameter = _reflection.GeneratedProtocolMessageType('SummarySGDParameter', (_message.Message,), dict( + DESCRIPTOR = _SUMMARYSGDPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter) + )) +_sym_db.RegisterMessage(SummarySGDParameter) + +MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType('MovingAverageRuleParameter', (_message.Message,), dict( + DESCRIPTOR = _MOVINGAVERAGERULEPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.MovingAverageRuleParameter) + )) +_sym_db.RegisterMessage(MovingAverageRuleParameter) + +PsResponseMessage = _reflection.GeneratedProtocolMessageType('PsResponseMessage', (_message.Message,), dict( + DESCRIPTOR = _PSRESPONSEMESSAGE, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage) + )) +_sym_db.RegisterMessage(PsResponseMessage) + +FsClientParameter = _reflection.GeneratedProtocolMessageType('FsClientParameter', (_message.Message,), dict( + DESCRIPTOR = _FSCLIENTPARAMETER, + __module__ = 'ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.FsClientParameter) + )) +_sym_db.RegisterMessage(FsClientParameter) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\200\001\001')) +# @@protoc_insertion_point(module_scope) From c47c451a007f33078bfb8f38be4a6cd50922f361 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 3 Dec 2018 11:45:53 +0000 Subject: [PATCH 095/719] fix bug --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../details/computation_op_handle.cc | 2 + .../details/eager_deletion_op_handle.cc | 23 ++-- .../details/eager_deletion_op_handle.h | 8 +- .../framework/details/eager_deletion_pass.cc | 81 ++++++------ .../fluid/framework/details/op_graph_view.h | 29 +++- .../framework/details/reference_count_pass.cc | 125 ++++++++++++++++-- .../scope_buffered_ssa_graph_executor.cc | 21 ++- .../scope_buffered_ssa_graph_executor.h | 2 + paddle/fluid/framework/executor.cc | 104 +++++++++++---- paddle/fluid/framework/executor.h | 51 ++----- paddle/fluid/framework/garbage_collector.h | 44 +++--- paddle/fluid/framework/operator.cc | 2 + paddle/fluid/framework/parallel_executor.cc | 13 +- paddle/fluid/framework/scope.cc | 6 + paddle/fluid/framework/scope.h | 1 + paddle/fluid/framework/tensor.h | 2 +- .../fluid/operators/controlflow/while_op.cc | 44 +++++- paddle/fluid/operators/reader/ctr_reader.h | 12 +- paddle/fluid/platform/device_context.h | 10 +- .../fluid/platform/stream_callback_manager.cc | 67 +++++----- .../fluid/platform/stream_callback_manager.h | 20 +-- paddle/fluid/pybind/tensor_py.h | 12 +- python/paddle/fluid/__init__.py | 5 +- 24 files changed, 458 insertions(+), 228 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8cf97d667d..8049f5d3f7 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -35,7 +35,7 @@ cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_e cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) -cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7beb8c8de9..2bf43fd4e0 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -31,6 +31,8 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); + VLOG(10) << "Run Op" << Name(); + auto run_func = [this]() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index cd26203376..41f616035d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/cuda_device_guard.h" namespace paddle { namespace framework { @@ -23,28 +24,32 @@ namespace details { EagerDeletionOpHandle::EagerDeletionOpHandle( ir::Node *node, const Scope *scope, const platform::Place &place, - const std::vector &var_names, GarbageCollector *gc, - AtomicReferenceCountMap *ref_cnts) - : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { + const std::unordered_set &var_names, + GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts) + : OpHandleBase(node), + scope_(scope), + var_names_(var_names), + gc_(gc), + ref_cnts_(ref_cnts) { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { dev_ctx_ = static_cast( platform::DeviceContextPool::Instance().Get(place)); if (dynamic_cast *>(gc_)) { - platform::SetDeviceId(boost::get(place).device); + platform::CUDADeviceGuard guard( + boost::get(place).device); PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + PADDLE_ENFORCE_NOT_NULL(event_); } } #endif - - for (auto &name : var_names) AddVar(name); } EagerDeletionOpHandle::~EagerDeletionOpHandle() { #ifdef PADDLE_WITH_CUDA if (event_) { auto gpu_place = boost::get(dev_ctx_->GetPlace()); - platform::SetDeviceId(gpu_place.device); + platform::CUDADeviceGuard guard(gpu_place.device); PADDLE_ENFORCE(cudaEventDestroy(event_)); } #endif @@ -52,10 +57,6 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } -void EagerDeletionOpHandle::AddVar(const std::string &name) { - var_names_.insert(name); -} - void EagerDeletionOpHandle::RunImpl() { auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); std::vector tensors; diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index 8254f21bdf..d8de59cc4d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -25,13 +25,11 @@ class Scope; namespace details { -class EagerDeletionPass; - class EagerDeletionOpHandle : public OpHandleBase { public: EagerDeletionOpHandle(ir::Node *node, const Scope *scope, const platform::Place &place, - const std::vector &var_names, + const std::unordered_set &var_names, GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts); @@ -45,8 +43,6 @@ class EagerDeletionOpHandle : public OpHandleBase { private: void ClearTensors(const std::vector &tensors); - void AddVar(const std::string &name); - const Scope *scope_; std::unordered_set var_names_; GarbageCollector *gc_; // not own @@ -55,8 +51,6 @@ class EagerDeletionOpHandle : public OpHandleBase { platform::CUDADeviceContext *dev_ctx_{nullptr}; cudaEvent_t event_{nullptr}; #endif - - friend class EagerDeletionPass; }; } // namespace details diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index f877c2881c..3a1b37e533 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -26,62 +26,61 @@ namespace paddle { namespace framework { namespace details { -static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, - ir::Graph *graph) { - auto it = std::find_if( - in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { - return dynamic_cast(var) != nullptr; - }); - - if (it != in->Outputs().end()) { - out->AddInput(*it); - } else { - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dep_var); - in->AddOutput(dep_var); - out->AddInput(dep_var); - } - - // Add leaf node to eager_deletion_node - if (out->Outputs().empty()) { - auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dummy_leaf); - out->AddOutput(dummy_leaf); - } -} - std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { - auto &vars = graph->Get(kGraphVars); + const auto &vars = graph->Get(kGraphVars); auto &ref_cnts = Get>(kCurReferenceCount); - auto &last_live_ops = Get>(kLastLiveOpsOfVars); + const auto &last_live_ops = + Get>(kLastLiveOpsOfVars); auto &gcs = Get(kGarbageCollector); ref_cnts = std::vector(vars.size()); - std::unordered_map op_map; + std::unordered_map> + op_vars_map; + for (auto &var_ops_map : last_live_ops) { for (auto &var_ops_pair : var_ops_map) { const std::string &var_name = var_ops_pair.first; - for (ComputationOpHandle *op : var_ops_pair.second) { - auto it = op_map.find(op); - if (it != op_map.end()) { - it->second->AddVar(var_name); - } else { - auto *eager_deletion_node = graph->CreateEmptyNode( - "eager_deletion", ir::Node::Type::kOperation); - auto *eager_deletion_op = new EagerDeletionOpHandle( - eager_deletion_node, op->GetScope(), op->GetPlace(), {var_name}, - gcs[op->GetScopeIdx()].get(), &(ref_cnts[op->GetScopeIdx()])); - AddDependencyBetween(op, eager_deletion_op, graph.get()); - op_map[op] = eager_deletion_op; - } + for (auto *op : var_ops_pair.second) { + op_vars_map[op].insert(var_name); } } } - VLOG(10) << "Create " << op_map.size() << " EagerDeletionOpHandle(s)"; + + for (auto &pair : op_vars_map) { + auto *op = pair.first; + auto &var_names = pair.second; + + auto *eager_deletion_node = + graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); + auto *eager_deletion_op = new EagerDeletionOpHandle( + eager_deletion_node, op->GetScope(), op->GetPlace(), + std::move(var_names), gcs[op->GetScopeIdx()].get(), + &(ref_cnts[op->GetScopeIdx()])); + + auto it = std::find_if( + op->Outputs().begin(), op->Outputs().end(), [](VarHandleBase *var) { + return dynamic_cast(var) != nullptr; + }); + + if (it != op->Outputs().end()) { + eager_deletion_op->AddInput(*it); + } else { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + op->AddOutput(dep_var); + eager_deletion_op->AddInput(dep_var); + } + + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dummy_leaf); + eager_deletion_op->AddOutput(dummy_leaf); + } + + VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; return graph; } diff --git a/paddle/fluid/framework/details/op_graph_view.h b/paddle/fluid/framework/details/op_graph_view.h index afb3e8e594..77aa02eba5 100644 --- a/paddle/fluid/framework/details/op_graph_view.h +++ b/paddle/fluid/framework/details/op_graph_view.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include #include #include @@ -34,6 +34,11 @@ class OpGraphView { bool HasOp(OpHandleBase *op) const; + // Use a visitor to visit all pending ops of op + // Stop when callback returns false + template + bool VisitAllPendingOps(OpHandleBase *op, Callback &&callback) const; + private: void Build(const std::vector &ops); void EnforceHasOp(OpHandleBase *op) const; @@ -44,6 +49,28 @@ class OpGraphView { pending_ops_; }; +template +bool OpGraphView::VisitAllPendingOps(OpHandleBase *op, + Callback &&callback) const { + EnforceHasOp(op); + std::unordered_set visited; + std::queue q; + q.push(op); + do { + op = q.front(); + q.pop(); + for (auto &pending_op : pending_ops_.at(op)) { + if (visited.count(pending_op) == 0) { + visited.insert(pending_op); + if (!callback(pending_op)) { + return false; + } + } + } + } while (!q.empty()); + return true; +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index f094c7afa9..2320d3926a 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -14,11 +14,13 @@ #include #include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/op_graph_view.h" #include "paddle/fluid/framework/details/reference_count_pass.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -27,6 +29,89 @@ namespace paddle { namespace framework { namespace details { +struct OpConnectionDetector { + public: + enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; + + explicit OpConnectionDetector(const std::vector &all_ops) + : graph_(all_ops) {} + + template + std::unordered_set MaxNoDepOps( + const OpSet &op_set) { + using KeyType = typename OpSet::key_type; + static_assert( + std::is_base_of::type>::value, + "Key type of OpSet must be or derived of OpHandleBase"); + + std::vector ops(op_set.begin(), op_set.end()); + std::unordered_set ret; + auto rels = GetRelations(ops); + auto not_before = [](RelationShip r) { return r != kBefore; }; + for (size_t i = 0; i < rels.size(); ++i) { + if (std::all_of(rels[i].begin(), rels[i].end(), not_before)) { + ret.insert(static_cast(ops[i])); + } + } + return ret; + } + + private: + std::vector> GetRelations( + const std::vector ops) { + std::unordered_map op_to_idx; + for (size_t i = 0; i < ops.size(); ++i) { + PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); + op_to_idx[ops[i]] = i; + } + + PADDLE_ENFORCE(op_to_idx.size() == ops.size(), "Duplicate ops"); + + std::vector> ret(ops.size()); + for (auto &e : ret) { + e.assign(ops.size(), kSame); + } + + size_t found_num = ops.size(); + size_t total_num = ops.size() * ops.size(); + auto visitor = [&](OpHandleBase *op, size_t i) { + auto it = op_to_idx.find(op); + if (it != op_to_idx.end()) { + size_t j = it->second; + if (ret[i][j] != kSame) { + ret[i][j] = kBefore; + ret[j][i] = kAfter; + found_num += 2; + if (found_num == total_num) { + return false; + } + } + } + return true; + }; + + for (size_t i = 0; i < ops.size(); ++i) { + auto sub_visitor = [&, i](OpHandleBase *op) { return visitor(op, i); }; + if (!graph_.VisitAllPendingOps(ops[i], sub_visitor)) { + break; + } + } + + for (size_t i = 0; i < ops.size(); ++i) { + for (size_t j = i + 1; j < ops.size(); ++j) { + if (ret[i][j] != kSame) continue; + ret[i][j] = kNoDeps; + ret[j][i] = kNoDeps; + } + } + + return ret; + } + + const OpGraphView graph_; +}; + static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( OpHandleBase *op, size_t scope_idx) { std::queue q; @@ -59,9 +144,15 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( last_live_ops_of_vars = std::vector(vars.size()); ref_cnts = std::vector(vars.size()); + OpConnectionDetector detector(ir::FilterByNodeWrapper(*graph)); + for (size_t i = 0; i < vars.size(); ++i) { for (auto &name_var_pair : vars[i]) { - if (name_var_pair.second.empty()) continue; + if (name_var_pair.second.empty()) { + continue; + } + + const std::string &var_name = name_var_pair.first; auto *last_ver_var = name_var_pair.second.back(); VarDesc *var_desc = nullptr; @@ -83,30 +174,46 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( } std::unordered_set last_live_op; - auto add_last_live_op = [&](OpHandleBase *op) { + auto add_last_live_op = [&](OpHandleBase *op) -> bool { auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); if (compute_op) { last_live_op.insert(compute_op); + return true; + } else { + return false; } }; - const std::string &var_name = name_var_pair.first; + + bool can_delete = false; auto &pending_ops = last_ver_var->PendingOps(); if (pending_ops.empty()) { auto *generated_op = last_ver_var->GeneratedOp(); - if (generated_op) { - ref_cnts[i].emplace(var_name, 1); - add_last_live_op(generated_op); + if (generated_op && add_last_live_op(generated_op)) { + can_delete = true; } } else { - ref_cnts[i].emplace(var_name, pending_ops.size()); + can_delete = true; for (auto *pending_op : pending_ops) { - add_last_live_op(pending_op); + if (!add_last_live_op(pending_op)) { + can_delete = false; + break; + } } } - last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + if (can_delete) { + size_t original_size = last_live_op.size(); + last_live_op = detector.MaxNoDepOps(last_live_op); + if (last_live_op.size() != original_size) { + VLOG(10) << "Shrink last living op number of " << var_name << " from " + << original_size << " to " << last_live_op.size(); + } + ref_cnts[i].emplace(var_name, last_live_op.size()); + last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + } } } + return graph; } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index f1bf6542a3..0cc3ac8bfb 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -36,6 +36,15 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( } } +void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() { + if (gc_) { + for (auto &gc : *gc_) { + gc->Wait(); + gc->Reset(); + } + } +} + FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { if (drop_scope_counter_ == 0) { @@ -74,19 +83,19 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams - for (size_t i = 0; i < places_.size(); ++i) { - platform::DeviceContextPool::Instance().Get(places_[i])->Wait(); - if (gc_) { - (*gc_)[i]->Wait(); - (*gc_)[i]->Reset(); - } + for (auto &p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); } + WaitAllGarbageCollectors(); for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } + } else { + WaitAllGarbageCollectors(); } + if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index ce3061d6e6..4d52183a20 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -50,6 +50,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector& fetch_tensors) override; private: + void WaitAllGarbageCollectors(); + size_t drop_scope_counter_{0}; ExecutionStrategy strategy_; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 96132a2c18..02d1e4114e 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -37,11 +37,49 @@ namespace { int kProgramId = -1; } // namespace +static std::unordered_map GetNonPersistableReferenceCounts( + const BlockDesc& block, const std::vector& skip_var_list) { + std::unordered_map ref_cnts; + std::unordered_set skip_vars(skip_var_list.begin(), + skip_var_list.end()); + + auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + if (skip_vars.count(name)) continue; + auto* var_desc = block.FindVar(name); + if (var_desc == nullptr || var_desc->Persistable()) continue; + auto type = var_desc->Proto()->type().type(); + if (type != proto::VarType::LOD_TENSOR && + type != proto::VarType::SELECTED_ROWS && + type != proto::VarType::LOD_TENSOR_ARRAY) { + continue; + } + + auto it = ref_cnts.find(name); + if (it != ref_cnts.end()) { + ++it->second; + } else { + ref_cnts[name] = 1; + } + } + } + }; + + for (auto op_desc : block.AllOps()) { + update_ref_cnts(op_desc, op_desc->Inputs()); + update_ref_cnts(op_desc, op_desc->Outputs()); + } + return ref_cnts; +} + ExecutorPrepareContext::ExecutorPrepareContext( - const framework::ProgramDesc& prog, size_t block_id) + const framework::ProgramDesc& prog, size_t block_id, + const std::vector& skip_ref_cnt_vars) : prog_(prog), block_id_(block_id) { if (GetEagerDeletionThreshold() >= 0) { - ref_cnts_ = GetNonPersistableReferenceCount(prog_, block_id_); + ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), + skip_ref_cnt_vars); } } @@ -49,10 +87,9 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } -template -static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, - GarbageCollector* gc, - RefCntMap* ref_cnts) { +static void DeleteUnusedTensors( + const Scope& scope, const OperatorBase* op, GarbageCollector* gc, + std::unordered_map* ref_cnts) { std::unordered_set erase_tensors; auto handler = [&](const VariableNameMap& name_map) { @@ -60,7 +97,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, for (auto& name : name_pair.second) { auto it = ref_cnts->find(name); if (it == ref_cnts->end()) continue; - if ((it->second)-- == 1) { + if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { VLOG(10) << "Erase tensor \'" << name << "\'"; @@ -69,6 +106,11 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, } else if (var->IsType()) { erase_tensors.insert( var->GetMutable()->mutable_value()); + } else if (var->IsType()) { + auto* lod_tensor_arr = var->GetMutable(); + for (auto& t : *lod_tensor_arr) { + erase_tensors.insert(&t); + } } } } @@ -351,9 +393,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, } std::unique_ptr Executor::Prepare( - const ProgramDesc& program, int block_id) { + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars) { std::unique_ptr ctx( - new ExecutorPrepareContext(program, block_id)); + new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars)); PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { @@ -364,16 +407,28 @@ std::unique_ptr Executor::Prepare( } std::vector> Executor::Prepare( - const ProgramDesc& program, const std::vector& block_ids) { + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars) { + PADDLE_ENFORCE( + skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(), + "skip_ref_cnt_vars should be either empty or equals to block number %d", + block_ids.size()); std::vector> result; + size_t idx = 0; for (auto& bid : block_ids) { - auto* ctx = new ExecutorPrepareContext(program, bid); + ExecutorPrepareContext* ctx; + if (skip_ref_cnt_vars.empty()) { + ctx = new ExecutorPrepareContext(program, bid); + } else { + ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]); + } PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); auto& block = program.Block(bid); for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } result.push_back(std::shared_ptr(ctx)); + ++idx; } return result; } @@ -392,18 +447,18 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr> gc; - // WhileOp would set keep_kids to true, - // because WhileGradOp needs the scopes created in WhileOp. - // Perhaps, we should not perform eager deletion in WhileOp - // The scopes and variables created by WhileOp would be deleted - // in WhileGradOp. - if (max_memory_size >= 0 && !keep_kids) { + if (max_memory_size >= 0) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { - gc.reset(new DefaultStreamGarbageCollector( - boost::get(place_), max_memory_size)); - } else { + if (IsFastEagerDeletionModeEnabled()) { + gc.reset(new UnsafeFastGPUGarbageCollector( + boost::get(place_), max_memory_size)); + } else { + gc.reset(new DefaultStreamGarbageCollector( + boost::get(place_), max_memory_size)); + } + } else if (platform::is_cpu_place(place_)) { #endif gc.reset(new CPUGarbageCollector( boost::get(place_), max_memory_size)); @@ -415,17 +470,14 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, for (auto& op : ctx->ops_) { op->Run(*local_scope, place_); - if (gc != nullptr) { + if (gc) { DeleteUnusedTensors(*local_scope, op.get(), gc.get(), &(ctx->cur_ref_cnts_)); } } - if (gc != nullptr) { - gc->Wait(); - } else { - platform::DeviceContextPool::Instance().Get(place_)->Wait(); - } + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + if (gc) gc->Wait(); if (local_scope != scope) { scope->DeleteScope(local_scope); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 36b36d49c2..f00d4314b6 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -28,42 +28,11 @@ namespace paddle { namespace framework { extern void InitializeVariable(Variable* var, proto::VarType::Type var_type); -template -std::unordered_map GetNonPersistableReferenceCount( - const ProgramDesc& prog, size_t block_id) { - auto& block = prog.Block(block_id); - std::unordered_map ref_cnts; - - auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { - for (auto& name_pair : name_map) { - for (auto& name : name_pair.second) { - auto* var_desc = block.FindVar(name); - if (var_desc == nullptr || var_desc->Persistable()) continue; - auto type = var_desc->Proto()->type().type(); - if (type != proto::VarType::LOD_TENSOR && - type != proto::VarType::SELECTED_ROWS) { - continue; - } - - auto it = ref_cnts.find(name); - if (it != ref_cnts.end()) { - ++it->second; - } else { - ref_cnts[name] = 1; - } - } - } - }; - - for (auto op_desc : block.AllOps()) { - update_ref_cnts(op_desc, op_desc->Inputs()); - update_ref_cnts(op_desc, op_desc->Outputs()); - } - return ref_cnts; -} - struct ExecutorPrepareContext { - ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id); + ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id, + const std::vector& skip_ref_cnt_vars = + std::vector()); + ~ExecutorPrepareContext(); void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; } @@ -72,8 +41,8 @@ struct ExecutorPrepareContext { size_t block_id_; std::vector> ops_; - std::unordered_map ref_cnts_; - std::unordered_map cur_ref_cnts_; + std::unordered_map ref_cnts_; + std::unordered_map cur_ref_cnts_; }; class Executor { @@ -109,10 +78,14 @@ class Executor { const std::string& fetch_holder_name = "fetch"); static std::unique_ptr Prepare( - const ProgramDesc& program, int block_id); + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars = + std::vector()); static std::vector> Prepare( - const ProgramDesc& program, const std::vector& block_ids); + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars = + std::vector>()); void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index cbe8f606ef..1382e0d461 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -19,6 +19,9 @@ #include #include #include // NOLINT +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -36,6 +39,11 @@ class GarbageCollector { virtual ~GarbageCollector() {} + size_t NumOfGarbages() const { + std::lock_guard guard(mutex_); + return garbages_->size(); + } + void Reset() { std::lock_guard guard(mutex_); garbages_.reset(new std::deque()); @@ -49,7 +57,7 @@ class GarbageCollector { template void Add(const Container &objs, Callback &&callback) { - std::shared_ptr> clear_deque; + std::deque *clear_deque = nullptr; { std::lock_guard guard(mutex_); for (auto *obj : objs) { @@ -58,7 +66,7 @@ class GarbageCollector { } if (cur_memory_size_ >= max_memory_size_) { cur_memory_size_ = 0; - clear_deque = garbages_; + clear_deque = garbages_.release(); garbages_.reset(new std::deque()); } } @@ -67,6 +75,7 @@ class GarbageCollector { callback(); ClearCallback([clear_deque]() { for (auto *obj : *clear_deque) obj->clear(); + delete clear_deque; }); } } @@ -77,7 +86,7 @@ class GarbageCollector { virtual void ClearCallback(const std::function &callback) = 0; platform::DeviceContext *dev_ctx_; - std::shared_ptr> garbages_; + std::unique_ptr> garbages_; mutable std::mutex mutex_; const size_t max_memory_size_; size_t cur_memory_size_ = 0; @@ -96,6 +105,19 @@ class CPUGarbageCollector : public GarbageCollector { }; #ifdef PADDLE_WITH_CUDA +template +class UnsafeFastGPUGarbageCollector : public GarbageCollector { + public: + UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + + protected: + void ClearCallback(const std::function &callback) override { + callback(); + } +}; + template class DefaultStreamGarbageCollector : public GarbageCollector { public: @@ -109,7 +131,7 @@ class DefaultStreamGarbageCollector : public GarbageCollector { } void Wait() const override { - static_cast(this->dev_ctx_) + static_cast(this->dev_ctx_) ->WaitStreamCallback(); } @@ -126,31 +148,23 @@ class StreamGarbageCollector : public GarbageCollector { StreamGarbageCollector(const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { - platform::SetDeviceId(place.device); + platform::CUDADeviceGuard guard(place.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } ~StreamGarbageCollector() { auto place = boost::get(this->dev_ctx_->GetPlace()); - platform::SetDeviceId(place.device); + platform::CUDADeviceGuard guard(place.device); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); } - void Wait() const override { - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - std::lock_guard guard(this->mutex_); - callback_manager_->Wait(); - } + void Wait() const override { callback_manager_->Wait(); } cudaStream_t stream() const { return stream_; } protected: - // ClearCallback and Wait()/Reset() cannot be call in multiple threads - // But it is not important, because they would not be called in multiple - // threads - // either in Executor or ParallelExecutor void ClearCallback(const std::function &callback) override { callback_manager_->AddCallback(callback); } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8bfdf38912..a5f714fc89 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -873,6 +873,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { + PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s", + ipt_name, DebugString()); int tmp = static_cast(ToDataType(t->type())); PADDLE_ENFORCE( tmp == data_type || data_type == -1, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e71f93beef..3d466e44a1 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -158,8 +158,13 @@ ParallelExecutor::ParallelExecutor( auto &place = member_->places_[i]; #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { - member_->gcs_.emplace_back(new StreamGarbageCollector( - boost::get(place), max_memory_size)); + if (IsFastEagerDeletionModeEnabled()) { + member_->gcs_.emplace_back(new UnsafeFastGPUGarbageCollector( + boost::get(place), max_memory_size)); + } else { + member_->gcs_.emplace_back(new StreamGarbageCollector( + boost::get(place), max_memory_size)); + } VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; } else if (platform::is_cpu_place(place)) { #endif @@ -181,8 +186,8 @@ ParallelExecutor::ParallelExecutor( &(member_->rt_ref_cnts_)); ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, &last_live_ops_of_vars); - VLOG(10) << "ReferenceCountPass Applied"; graph = ref_cnt_pass->Apply(std::move(graph)); + VLOG(10) << "ReferenceCountPass Applied"; auto eager_deletion_pass = ir::PassRegistry::Instance().Get("eager_deletion_pass"); @@ -194,6 +199,8 @@ ParallelExecutor::ParallelExecutor( &last_live_ops_of_vars); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; + + graph->SetNotOwned(details::kGarbageCollector, &(member_->gcs_)); } // Step 3. Create vars in each scope. Passes may also create new vars. diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0d261dd7cc..cb3b6cdc3e 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -38,6 +38,10 @@ DEFINE_double( "Memory size threshold (GB) when the garbage collector clear tensors." "Disabled when this value is less than 0"); +DEFINE_bool(fast_eager_deletion_mode, true, + "Fast eager deletion mode. If enabled, memory would release " + "immediately without waiting GPU kernel ends."); + // When in inference scenario, the scopes will not be written by two threads in // a mean time, but a scope may be read by multiple threads concurrently, and // the mutex will cause serious performance issue. @@ -58,6 +62,8 @@ int64_t GetEagerDeletionThreshold() { (static_cast(1) << 30)); } +bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } + Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 1901ffbe57..aded1f771c 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -27,6 +27,7 @@ namespace paddle { namespace framework { int64_t GetEagerDeletionThreshold(); +bool IsFastEagerDeletionModeEnabled(); class Scope; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 71e8badd4b..3a4c52410e 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -153,7 +153,7 @@ class Tensor { void set_layout(const DataLayout layout) { layout_ = layout; } - void clear() { holder_ = nullptr; } + void clear() { holder_.reset(); } const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 6c1b2f329a..d8410b4058 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -59,7 +59,21 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr("is_test"); - auto ctx = executor.Prepare(*program, block->ID()); + auto &skip_eager_deletion_vars = + Attr>("skip_eager_deletion_vars"); + if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { + std::string debug_string = + "Skip " + std::to_string(skip_eager_deletion_vars.size()) + + " vars in eager deletion mode: "; + for (auto &var : skip_eager_deletion_vars) { + debug_string.append(var); + debug_string.push_back(' '); + } + VLOG(10) << debug_string; + } + + auto ctx = + executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); @@ -96,6 +110,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); + AddAttr>("skip_eager_deletion_vars", + "Vars that would skip eager deletion." + "Users should not set this manually.") + .SetDefault(std::vector()); AddComment(R"DOC( )DOC"); } @@ -341,6 +359,30 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); + /* The following codes are used in eager deletion mode */ + if (framework::GetEagerDeletionThreshold() >= 0) { + std::unordered_set skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + // If input var of ops inside grad_block is not from grad_block, + // it cannot be deleted when forward while_op runs + if (in_arg_name != framework::kEmptyVarName && + !grad_block->HasVar(in_arg_name)) { + skip_vars.insert(in_arg_name); + } + } + } + + if (!skip_vars.empty()) { + // FIXME(zjl): ugly const_cast here, maybe we should find a better way + // to modify forward while_op + auto &fwd_while_op = const_cast(ForwardOp()); + fwd_while_op.SetAttr( + "skip_eager_deletion_vars", + std::vector(skip_vars.begin(), skip_vars.end())); + } + } + return std::unique_ptr(while_grad); } }; diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 9b2a11bae1..7fc07efe73 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -16,6 +16,7 @@ #include +#include #include // NOLINT #include #include @@ -55,8 +56,7 @@ class CTRReader : public framework::FileReader { PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); - thread_num_ = - file_list_.size() > thread_num ? thread_num : file_list_.size(); + thread_num_ = std::min(file_list_.size(), thread_num); queue_ = queue; SplitFiles(); for (size_t i = 0; i < thread_num_; ++i) { @@ -95,10 +95,10 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (int thread_id = 0; thread_id < thread_num_; thread_id++) { - read_threads_.emplace_back(new std::thread( - std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, - thread_id, &read_thread_status_, queue_))); + for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { + read_threads_.emplace_back(new std::thread(std::bind( + &ReadThread, file_groups_[thread_id], slots_, batch_size_, + static_cast(thread_id), &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( std::bind(&MonitorThread, &read_thread_status_, queue_))); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 3edd727978..37453a8c29 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -223,14 +223,10 @@ class CUDADeviceContext : public DeviceContext { template void AddStreamCallback(Callback&& callback) const { - std::lock_guard guard(callback_mtx_); callback_manager_->AddCallback(callback); } - void WaitStreamCallback() const { - std::lock_guard guard(callback_mtx_); - callback_manager_->Wait(); - } + void WaitStreamCallback() const { callback_manager_->Wait(); } #if CUDA_VERSION >= 9000 /*! \brief CublasCall may need to change cublas's config, @@ -261,9 +257,7 @@ class CUDADeviceContext : public DeviceContext { mutable std::mutex mtx_; - // This lock is only used by callback - // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes - mutable std::mutex callback_mtx_; + // StreamCallbackManager is thread-safe std::unique_ptr callback_manager_; mutable std::mutex cublas_mtx_; diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index ae915365f8..58ec6f2f5d 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -18,52 +18,47 @@ namespace paddle { namespace platform { -struct StreamCallbackContext { - inline StreamCallbackContext(const StreamCallbackManager *manager, - std::function callback) - : manager_(manager), callback_(std::move(callback)) {} - - const StreamCallbackManager *manager_; // do not own - std::function callback_; -}; +#if CUDA_VERSION >= 10000 +static void CUDART_CB StreamCallbackFunc(void *user_data); +#else +static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, + cudaError_t status, void *user_data) +#endif +{ + std::unique_ptr> func( + reinterpret_cast *>(user_data)); + (*func)(); +} StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream) - : stream_(stream), thread_pool_(new ::ThreadPool(1)) {} + : stream_(stream), thread_pool_(1) {} void StreamCallbackManager::AddCallback(std::function callback) const { - auto *stream_callback_context = - new StreamCallbackContext(this, std::move(callback)); + auto *callback_func = new std::function(std::move(callback)); + auto *func = new std::function([this, callback_func] { + std::lock_guard lock(mtx_); + last_future_ = thread_pool_.enqueue([callback_func] { + std::unique_ptr> releaser(callback_func); + (*callback_func)(); + }); + }); #if CUDA_VERSION >= 10000 - PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context)); + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, StreamCallbackFunc, func)); #else - PADDLE_ENFORCE( - cudaStreamAddCallback(stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0)); + PADDLE_ENFORCE(cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif } -void StreamCallbackManager::Wait() const { - thread_pool_.reset(new ::ThreadPool(1)); -} +StreamCallbackManager::~StreamCallbackManager() { Wait(); } -#if CUDA_VERSION >= 10000 -void CUDART_CB StreamCallbackManager::StreamCallbackFunc(void *user_data) -#else -void CUDART_CB StreamCallbackManager::StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, - void *user_data) -#endif -{ - auto *callback_context_ptr = - reinterpret_cast(user_data); - callback_context_ptr->manager_->thread_pool_->enqueue( - [callback_context_ptr]() { - std::unique_ptr callback_context( - callback_context_ptr); - callback_context->callback_(); - }); +void StreamCallbackManager::Wait() const { + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + { + std::lock_guard lock(mtx_); + if (last_future_.valid()) { + last_future_.wait(); + } + } } } // namespace platform diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index eac4806d13..0d5d85bf46 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -18,30 +18,32 @@ #include #include #include +#include // NOLINT #include +#include // NOLINT + +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { -// NOTE(zjl): clean StreamCallback to make compilation faster +// NOTE(zjl): clean StreamCallbackManager to make compilation faster +// Make StreamCallbackManager thread-safe class StreamCallbackManager { public: explicit StreamCallbackManager(const cudaStream_t stream); + ~StreamCallbackManager(); + void AddCallback(std::function callback) const; void Wait() const; private: const cudaStream_t stream_; - mutable std::unique_ptr<::ThreadPool> thread_pool_; - -#if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void *user_data); -#else - static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, void *user_data); -#endif + mutable ::ThreadPool thread_pool_; + mutable std::mutex mtx_; + mutable std::future last_future_; }; } // namespace platform diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 02a75236f6..24800e1709 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -162,7 +162,7 @@ void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -200,7 +200,7 @@ void PyCUDATensorSetFromArray( paddle::platform::CUDAPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -221,7 +221,7 @@ inline void PyCUDATensorSetFromArray( paddle::platform::CUDAPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -240,7 +240,7 @@ void PyCUDAPinnedTensorSetFromArray( const paddle::platform::CUDAPinnedPlace &place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -260,7 +260,7 @@ inline void PyCUDAPinnedTensorSetFromArray( const paddle::platform::CUDAPinnedPlace &place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index f7fefb3e5b..2690149e9b 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -116,8 +116,9 @@ def __bootstrap__(): 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", - 'eager_delete_tensor_gb', 'allocator_strategy', - 'reader_queue_speed_test_mode', 'print_sub_graph_dir' + 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', + 'allocator_strategy', 'reader_queue_speed_test_mode', + 'print_sub_graph_dir' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') From ac803fed184859c83c5ca860f73ae92f74876a06 Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Mon, 3 Dec 2018 20:31:16 +0800 Subject: [PATCH 096/719] Fix the compile issue for cuda device (test=develop) --- .../fluid/tests/unittests/test_conv3d_op.py | 58 ++++++------------- 1 file changed, 17 insertions(+), 41 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 216d3ad2d0..c6b749fe09 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -112,59 +112,35 @@ class TestConv3dOp(OpTest): return core.is_compiled_with_cuda() and self.use_cudnn def test_check_output(self): - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-5) - else: - self.check_output() + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_output_with_place(place, atol=1e-5) def test_check_grad(self): if self.dtype == np.float16: return - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, - set(['Input', 'Filter']), - 'Output', - max_relative_error=0.03) - else: - self.check_grad( - set(['Input', 'Filter']), 'Output', max_relative_error=0.03) + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_grad_with_place( + place, {'Input', 'Filter'}, 'Output', max_relative_error=0.03) def test_check_grad_no_filter(self): if self.dtype == np.float16: return - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['Input'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Filter'])) - else: - self.check_grad( - ['Input'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Filter'])) + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): if self.dtype == np.float16: return - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['Filter'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Input'])) - else: - self.check_grad( - ['Filter'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Input'])) + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input'])) def init_test_case(self): self.pad = [0, 0, 0] From 0591ba96ec10364eb5f9129b75b667ffc82ff5dc Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 3 Dec 2018 20:35:15 +0800 Subject: [PATCH 097/719] fix hack test=develop --- paddle/fluid/framework/ir/graph.h | 7 +++---- .../analysis/ir_passes/tensorrt_subgraph_pass.cc | 11 ++++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 947c934f0f..bb2d953afb 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -177,14 +177,13 @@ class Graph { return nullptr; } - const ProgramDesc &program() const { return program_; } - std::map> InitFromProgram( - const ProgramDesc &program); - void ResolveHazard( const std::map> &var_nodes); private: + std::map> InitFromProgram( + const ProgramDesc &program); + // This method takes ownership of `node`. ir::Node *AddNode(ir::Node *node) { PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index c6b7c05f78..4ffe5f575c 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -178,11 +178,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, output_mapping.push_back(output_name_map[name]); } - *block_desc.Proto()->mutable_vars() = - const_cast(&graph->program()) - ->Proto() - ->blocks(0) - .vars(); + auto *vars = block_desc.Proto()->mutable_vars(); + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + *vars->Add() = *node->Var()->Proto(); + } + } PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); PADDLE_ENFORCE(!output_mapping.empty()); From abbe382e1e683fa183256dd0f927057b7c8d9155 Mon Sep 17 00:00:00 2001 From: zhang wenhui Date: Mon, 3 Dec 2018 20:46:36 +0800 Subject: [PATCH 098/719] Revert "Add EstiminateFlops" --- paddle/fluid/framework/details/op_registry.h | 20 +++----------------- paddle/fluid/framework/op_info.h | 7 ------- paddle/fluid/framework/type_defs.h | 2 -- 3 files changed, 3 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 1ce18c3d6b..eea7e712f8 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -32,9 +32,7 @@ enum OpInfoFillType { kOpProtoAndCheckerMaker = 1, kGradOpDescMaker = 2, kVarTypeInference = 3, - kShapeInference = 4, - kEstimateFlops = 5, - kUnknown = -1 + kShapeInference = 4 }; template @@ -50,10 +48,8 @@ struct OpInfoFillTypeID { ? kVarTypeInference : (std::is_base_of::value ? kShapeInference - : (std::is_base_of::value - ? kEstimateFlops - : kUnknown))))); + : static_cast( + -1))))); } }; @@ -143,16 +139,6 @@ struct OpInfoFiller { } }; -template -struct OpInfoFiller { - void operator()(const char* op_tpe, OpInfo* info) const { - info->estimate_flops_ = [](InferShapeContext* ctx) { - T estimate_flops; - return estimate_flops(ctx); - }; - } -}; - } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index e0bf5ed999..19e5c2c73e 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -31,12 +31,6 @@ class InferShapeBase { virtual void operator()(InferShapeContext*) const = 0; }; -class EstimateFlopsBase { - public: - virtual ~EstimateFlopsBase() = default; - virtual size_t operator()(InferShapeContext*) const = 0; -}; - struct OpInfo { OpCreator creator_; GradOpMakerFN grad_op_maker_; @@ -44,7 +38,6 @@ struct OpInfo { OpAttrChecker* checker_{nullptr}; InferVarTypeFN infer_var_type_; InferShapeFN infer_shape_; - EstimateFlopsFN estimate_flops_; bool HasOpProtoAndChecker() const { return proto_ != nullptr && checker_ != nullptr; diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index cdc5fa6862..2de6233a9e 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -54,7 +54,5 @@ using InferVarTypeFN = using InferShapeFN = std::function; -using EstimateFlopsFN = std::function; - } // namespace framework } // namespace paddle From 65867d8989743fabf04cb8e27e58d8cb47fa2b9b Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 3 Dec 2018 21:17:50 +0800 Subject: [PATCH 099/719] test=develop --- paddle/fluid/operators/sequence_ops/sequence_mask_op.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h index 18acb735ce..8fceed3558 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h @@ -36,12 +36,10 @@ class SequenceMaskOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist"); PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist"); - auto maxlen = ctx->Attrs().Get("maxlen"); - if (maxlen > 0) { // We can only infershape when maxlen > 0 - auto dim = framework::vectorize2int(ctx->GetInputDim("X")); - dim.push_back(maxlen); - ctx->SetOutputDim("Y", framework::make_ddim(dim)); - } + int maxlen = ctx->Attrs().Get("maxlen"); + auto dim = framework::vectorize2int(ctx->GetInputDim("X")); + dim.push_back(maxlen > 0 ? maxlen : -1); + ctx->SetOutputDim("Y", framework::make_ddim(dim)); } }; From 35a2578426840642acc0b2100be0b1c96c2cf1e9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 3 Dec 2018 13:21:49 +0000 Subject: [PATCH 100/719] fix bug test=develop --- .../framework/details/computation_op_handle.cc | 2 -- .../framework/details/reference_count_pass.cc | 14 +++++++++----- paddle/fluid/platform/stream_callback_manager.cc | 2 -- paddle/fluid/platform/stream_callback_manager.h | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 2bf43fd4e0..7beb8c8de9 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -31,8 +31,6 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); - VLOG(10) << "Run Op" << Name(); - auto run_func = [this]() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 2320d3926a..0c096e0980 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,7 +29,7 @@ namespace paddle { namespace framework { namespace details { -struct OpConnectionDetector { +class OpConnectionDetector { public: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; @@ -37,8 +37,8 @@ struct OpConnectionDetector { : graph_(all_ops) {} template - std::unordered_set MaxNoDepOps( - const OpSet &op_set) { + OpSet MaxNoDepOps(const OpSet &op_set) { + if (op_set.size() <= 1) return op_set; using KeyType = typename OpSet::key_type; static_assert( std::is_base_of ops(op_set.begin(), op_set.end()); - std::unordered_set ret; + OpSet ret; auto rels = GetRelations(ops); auto not_before = [](RelationShip r) { return r != kBefore; }; for (size_t i = 0; i < rels.size(); ++i) { @@ -79,7 +79,7 @@ struct OpConnectionDetector { auto it = op_to_idx.find(op); if (it != op_to_idx.end()) { size_t j = it->second; - if (ret[i][j] != kSame) { + if (i != j && ret[i][j] == kSame) { ret[i][j] = kBefore; ret[j][i] = kAfter; found_num += 2; @@ -208,6 +208,10 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( VLOG(10) << "Shrink last living op number of " << var_name << " from " << original_size << " to " << last_live_op.size(); } + + PADDLE_ENFORCE(!last_live_op.empty(), + "Last living ops of %s cannot be empty", var_name); + ref_cnts[i].emplace(var_name, last_live_op.size()); last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); } diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 58ec6f2f5d..466c77469e 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -49,8 +49,6 @@ void StreamCallbackManager::AddCallback(std::function callback) const { #endif } -StreamCallbackManager::~StreamCallbackManager() { Wait(); } - void StreamCallbackManager::Wait() const { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); { diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 0d5d85bf46..8668bcb113 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -33,7 +33,7 @@ class StreamCallbackManager { public: explicit StreamCallbackManager(const cudaStream_t stream); - ~StreamCallbackManager(); + ~StreamCallbackManager() = default; void AddCallback(std::function callback) const; From 9da5954a21ef1ad85f3fe98fdabd592e7bd9c43c Mon Sep 17 00:00:00 2001 From: lujun Date: Mon, 3 Dec 2018 21:48:48 +0800 Subject: [PATCH 101/719] fix mac ci test step, test=develop --- paddle/scripts/paddle_build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 62a2b08a8a..29dc5ddcc2 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -437,7 +437,7 @@ EOF export http_proxy= export https_proxy= # TODO: jiabin need to refine this part when these tests fixed on mac - ctest --output-on-failure -j $1 + ctest --output-on-failure -j $2 # make install should also be test when unittest make install -j 8 if [ "$1" == "cp27-cp27m" ]; then @@ -903,7 +903,7 @@ function main() { maccheck) cmake_gen ${PYTHON_ABI:-""} build_mac - run_mac_test ${PROC_RUN:-1} + run_mac_test ${PYTHON_ABI:-""} ${PROC_RUN:-1} ;; macbuild) cmake_gen ${PYTHON_ABI:-""} From da4e0bf1a1ba496fe9d9f0f05d8b8be84296ac8b Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 4 Dec 2018 09:40:39 +0800 Subject: [PATCH 102/719] add 2 more files test=develop --- paddle/scripts/paddle_build.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 4175ee47b2..aaa484a61b 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -497,7 +497,9 @@ function assert_api_spec_approvals() { "paddle/fluid/framework/var_desc.h" "paddle/fluid/framework/scope.h" "paddle/fluid/framework/ir/node.h" - "paddle/fluid/framework/ir/graph.h") + "paddle/fluid/framework/ir/graph.h" + "paddle/fluid/framework/framework.proto" + "paddle/fluid/operators/distributed/send_recv.proto.in") for API_FILE in ${API_FILES[*]}; do API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true` echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" From deb04809bdd8805b4feb2b845abbd0c5876e8b93 Mon Sep 17 00:00:00 2001 From: ZongwuYang Date: Tue, 4 Dec 2018 10:42:59 +0800 Subject: [PATCH 103/719] test=develop Fix the bug that profiler cannot trace the nccl allreduce operator --- paddle/fluid/platform/device_tracer.cc | 19 ++++++++++--------- paddle/fluid/platform/device_tracer.h | 6 ++++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index ea4564058d..86f18b7a80 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -143,7 +143,7 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { auto *kernel = reinterpret_cast(record); - tracer->AddKernelRecords(kernel->start, kernel->end, + tracer->AddKernelRecords(kernel->name, kernel->start, kernel->end, kernel->deviceId, kernel->streamId, kernel->correlationId); break; @@ -224,8 +224,9 @@ class DeviceTracerImpl : public DeviceTracer { stream_id, correlation_id, bytes}); } - void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, - int64_t stream_id, uint32_t correlation_id) { + void AddKernelRecords(std::string name, uint64_t start, uint64_t end, + int64_t device_id, int64_t stream_id, + uint32_t correlation_id) { // 0 means timestamp information could not be collected for the kernel. if (start == 0 || end == 0) { VLOG(30) << correlation_id << " cannot be traced"; @@ -233,7 +234,7 @@ class DeviceTracerImpl : public DeviceTracer { } std::lock_guard l(trace_mu_); kernel_records_.push_back( - KernelRecord{start, end, device_id, stream_id, correlation_id}); + KernelRecord{name, start, end, device_id, stream_id, correlation_id}); } bool IsEnabled() { @@ -276,13 +277,13 @@ class DeviceTracerImpl : public DeviceTracer { profile_pb.set_start_ns(start_ns_); profile_pb.set_end_ns(end_ns_); for (const KernelRecord &r : kernel_records_) { - if (correlations_.find(r.correlation_id) == correlations_.end()) { - fprintf(stderr, "cannot relate a kernel activity\n"); - continue; - } auto *event = profile_pb.add_events(); event->set_type(proto::Event::GPUKernel); - event->set_name(correlations_.at(r.correlation_id)); + if (correlations_.find(r.correlation_id) != correlations_.end()) { + event->set_name(correlations_.at(r.correlation_id)); + } else { + event->set_name(r.name); + } event->set_start_ns(r.start_ns); event->set_end_ns(r.end_ns); event->set_sub_device_id(r.stream_id); diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index eaf047d474..bf0786be2d 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -39,6 +39,7 @@ inline uint64_t PosixInNsec() { class DeviceTracer { public: struct KernelRecord { + std::string name; uint64_t start_ns; uint64_t end_ns; int64_t device_id; @@ -84,8 +85,9 @@ class DeviceTracer { // Add a cuda kernel stats. `correlation_id` will be mapped to annotation // added before for human readability. - virtual void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, - int64_t stream_id, uint32_t correlation_id) = 0; + virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end, + int64_t device_id, int64_t stream_id, + uint32_t correlation_id) = 0; // Generate a proto after done (Disabled). virtual proto::Profile GenProfile(const std::string& profile_path) = 0; From 2301abc481bbcfce1f87a102c295df8eeb4ba6c4 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Tue, 4 Dec 2018 11:32:10 +0800 Subject: [PATCH 104/719] cc libaray add pslib --- paddle/fluid/framework/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 9f5631b87c..8556dcbc36 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -180,7 +180,7 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph build_strategy fast_threaded_ssa_graph_executor variable_helper) -cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) +cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib) cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) From 6d04a9cf4779a63123c5c1254450dbf881961b1f Mon Sep 17 00:00:00 2001 From: Tink_Y <31891223+tink2123@users.noreply.github.com> Date: Tue, 4 Dec 2018 13:07:10 +0800 Subject: [PATCH 105/719] fix api format and example (#14686) * fix api format and examples test=develop * Update executor.py test=develop * Update nn.py * Update nn.py test=develop * Update nn.py test=develop --- python/paddle/fluid/clip.py | 16 ++-- python/paddle/fluid/executor.py | 17 ++-- python/paddle/fluid/framework.py | 5 +- python/paddle/fluid/layers/io.py | 13 ++- .../fluid/layers/learning_rate_scheduler.py | 15 +-- python/paddle/fluid/layers/nn.py | 96 ++++++++++--------- python/paddle/fluid/metrics.py | 31 +++--- python/paddle/fluid/param_attr.py | 3 +- .../fluid/transpiler/distribute_transpiler.py | 71 +++++++------- 9 files changed, 143 insertions(+), 124 deletions(-) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 1738afe93e..5b8ff0514e 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -134,12 +134,12 @@ class GradientClipByValue(BaseGradientClipAttr): Examples: .. code-block:: python - w_param_attrs = ParamAttr(name=None, - initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), + w_param_attrs = fluid.ParamAttr(name=None, + initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0), learning_rate=1.0, - regularizer=L1Decay(1.0), + regularizer=fluid.regularizer.L1Decay(1.0), trainable=True, - clip=GradientClipByValue(-1.0, 1.0)) + clip=fluid.clip.GradientClipByValue(-1.0, 1.0)) y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) """ @@ -185,12 +185,12 @@ class GradientClipByNorm(BaseGradientClipAttr): Examples: .. code-block:: python - w_param_attrs = ParamAttr(name=None, - initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), + w_param_attrs = flui.ParamAttr(name=None, + initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0), learning_rate=1.0, - regularizer=L1Decay(1.0), + regularizer=fluid.regularizer.L1Decay(1.0), trainable=True, - clip=GradientClipByNorm(clip_norm=2.0)) + clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)) y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) """ diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 42c2484b28..f2886090d7 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -20,7 +20,7 @@ import six from .framework import Program, default_main_program, Variable from . import core -__all__ = ['Executor', 'global_scope', 'scope_guard', '_switch_scope'] +__all__ = ['Executor', 'global_scope', 'scope_guard'] g_scope = core.Scope() @@ -407,16 +407,17 @@ class Executor(object): Examples: - >>> data = layers.data(name='X', shape=[1], dtype='float32') - >>> hidden = layers.fc(input=data, size=10) - >>> layers.assign(hidden, out) - >>> loss = layers.mean(out) + >>> data = fluid.layers.data(name='X', shape=[1], dtype='float32') + >>> out = fluid.layers.create_tensor(dtype='float32') + >>> hidden = fluid.layers.fc(input=data, size=10) + >>> fluid.layers.assign(hidden,out) + >>> loss = fluid.layers.mean(out) >>> adam = fluid.optimizer.Adam() - >>> adam.minimize(loss) + >>> adam.minimize(loss) >>> cpu = core.CPUPlace() - >>> exe = Executor(cpu) - >>> exe.run(default_startup_program()) + >>> exe = fluid.Executor(cpu) + >>> exe.run(fluid.default_startup_program()) >>> x = numpy.random.random(size=(10, 1)).astype('float32') >>> outs = exe.run( diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b991187d42..b156db53d2 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -89,12 +89,13 @@ def name_scope(prefix=None): Examples: .. code-block:: python + with name_scope("encoder"): ... with name_scope("decoder"): ... - with name_scope("attention"): - ... + with name_scope("attention"): + ... """ # TODO(panyx0718): Only [0-9a-z]. assert prefix, "namescope prefix cannot be empty." diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 3f47053961..42f4959a83 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -943,7 +943,18 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None): def shuffle(reader, buffer_size): """ - Shuffle the reader. + Creates a data reader whose data output is shuffled. + Output from the iterator that created by original reader will be + buffered into shuffle buffer, and then shuffled. The size of shuffle buffer + is determined by argument buf_size. + + Args: + param reader: the original reader whose output will be shuffled. + type reader: callable + param buf_size: shuffle buffer size. + type buf_size: int + return: the new reader whose output is shuffled. + rtype: callable """ return __create_unshared_decorated_reader__( 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 149224bb68..dde0518972 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -308,13 +308,9 @@ def piecewise_decay(boundaries, values): def append_LARS(params_grads, learning_rate, weight_decay): - """Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for - each layer. - - ```python - learning_rate *= local_gw_ratio * sqrt(sumsq(param)) - / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) - ``` + """ + Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for + each layer. Args: learning_rate: A learning rate Variable. This @@ -323,6 +319,11 @@ def append_LARS(params_grads, learning_rate, weight_decay): Returns: The decayed learning rate + Examples: + .. code-block:: python + + learning_rate *= local_gw_ratio * sqrt(sumsq(param)) + / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) """ def _balanced_weight(param_norm, grad_norm): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ac1c865775..28b8ae895a 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -928,7 +928,7 @@ def dynamic_gru(input, emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) hidden_dim = 512 x = fluid.layers.fc(input=emb, size=hidden_dim * 3) - hidden = fluid.layers.dynamic_gru(input=x, dim=hidden_dim) + hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim) """ helper = LayerHelper('gru', **locals()) @@ -3586,6 +3586,7 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None): Examples: .. code-block:: python + # Suppose `ids` and `scores` are LodTensorArray variables reserving # the selected ids and scores of all steps finished_ids, finished_scores = layers.beam_search_decode( @@ -5083,7 +5084,7 @@ def im2sequence(input, output.lod = [[4, 4]] - Examples: + Examples: .. code-block:: python @@ -5870,24 +5871,23 @@ def pad_constant_like(x, y, pad_value=0., name=None): [[38, 39, 40]], [[41, 42, 43]]]] Y.shape = (1, 3, 1, 3) + And + pad_value = -1, - And - pad_value = -1, - - Return: - Out = [[[[35, 36, 37], - [-1, -1, -1]], - [[38, 39, 40], - [-1, -1, -1]], - [[41, 42, 43], - [-1, -1, -1]]], - [[[-1, -1, -1], - [-1, -1, -1]], - [[-1, -1, -1], - [-1, -1, -1]], - [[-1, -1, -1], - [-1, -1, -1]]]] - Out.shape = (2, 3, 2, 3) + Return: + Out = [[[[35, 36, 37], + [-1, -1, -1]], + [[38, 39, 40], + [-1, -1, -1]], + [[41, 42, 43], + [-1, -1, -1]]], + [[[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]]]] + Out.shape = (2, 3, 2, 3) Args: x (Variable): The input tensor variable. @@ -6126,6 +6126,7 @@ def image_resize(input, Supporting resample methods: 'BILINEAR' : Bilinear interpolation + 'NEAREST' : Nearest neighbor interpolation Args: @@ -6781,7 +6782,7 @@ def crop(x, shape=None, offsets=None, name=None): # or z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32") - crop = fluid.layers.crop(z, shape=[2, 3]) + crop = fluid.layers.crop(z, shape=[-1, 2, 3]) """ helper = LayerHelper('crop', **locals()) @@ -7062,39 +7063,40 @@ def pad2d(input, than height-1. And the width dimension has the same condition. Example: + .. code-block:: text - Given that X is a channel of image from input: + Given that X is a channel of image from input: - X = [[1, 2, 3], - [4, 5, 6]] + X = [[1, 2, 3], + [4, 5, 6]] - Case 0: + Case 0: - paddings = [0, 1, 2, 3], - mode = 'constant' - pad_value = 0 + paddings = [0, 1, 2, 3], + mode = 'constant' + pad_value = 0 - Out = [[0, 0, 1, 2, 3, 0, 0, 0] - [0, 0, 4, 5, 6, 0, 0, 0] - [0, 0, 0, 0, 0, 0, 0, 0]] + Out = [[0, 0, 1, 2, 3, 0, 0, 0] + [0, 0, 4, 5, 6, 0, 0, 0] + [0, 0, 0, 0, 0, 0, 0, 0]] - Case 1: + Case 1: - paddings = [0, 1, 2, 1], - mode = 'reflect' + paddings = [0, 1, 2, 1], + mode = 'reflect' - Out = [[3, 2, 1, 2, 3, 2] - [6, 5, 4, 5, 6, 5] - [3, 2, 1, 2, 3, 2]] + Out = [[3, 2, 1, 2, 3, 2] + [6, 5, 4, 5, 6, 5] + [3, 2, 1, 2, 3, 2]] - Case 2: + Case 2: - paddings = [0, 1, 2, 1], - mode = 'edge' + paddings = [0, 1, 2, 1], + mode = 'edge' - Out = [[1, 1, 1, 2, 3, 3] - [4, 4, 4, 5, 6, 6] - [4, 4, 4, 5, 6, 6]] + Out = [[1, 1, 1, 2, 3, 3] + [4, 4, 4, 5, 6, 6] + [4, 4, 4, 5, 6, 6]] Args: @@ -7332,13 +7334,13 @@ def prelu(x, mode, param_attr=None, name=None): Args: x (Variable): The input tensor. param_attr(ParamAttr|None): The parameter attribute for the learnable - weight (alpha). + weight (alpha). mode (string): The mode for weight sharing. It supports all, channel - and element. all: all elements share same weight - channel:elements in a channel share same weight - element:each element has a weight + and element. all: all elements share same weight + channel:elements in a channel share same weight + element:each element has a weight name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + will be named automatically. Returns: Variable: The output tensor with the same shape as input. diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 829154f1b2..85af8fea13 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -222,13 +222,13 @@ class Precision(MetricBase): Examples: .. code-block:: python - metric = fluid.metrics.Precision() - for pass in range(PASSES): - metric.reset() - for data in train_reader(): - loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) - metric.update(preds=preds, labels=labels) - numpy_precision = metric.eval() + metric = fluid.metrics.Precision() + for pass in range(PASSES): + metric.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds=preds, labels=labels) + numpy_precision = metric.eval() """ def __init__(self, name=None): @@ -267,13 +267,13 @@ class Recall(MetricBase): Examples: .. code-block:: python - metric = fluid.metrics.Recall() - for pass in range(PASSES): - metric.reset() - for data in train_reader(): - loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) - metric.update(preds=preds, labels=labels) - numpy_recall = metric.eval() + metric = fluid.metrics.Recall() + for pass in range(PASSES): + metric.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds=preds, labels=labels) + numpy_recall = metric.eval() """ def __init__(self, name=None): @@ -449,8 +449,9 @@ class EditDistance(MetricBase): distance_evaluator.update(distances, seq_num) distance, instance_error = distance_evaluator.eval() - In the above example: + In the above example: 'distance' is the average of the edit distance in a pass. + 'instance_error' is the instance error rate in a pass. """ diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index a51607bfdb..38ddf93198 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -50,8 +50,9 @@ class ParamAttr(object): w_param_attrs = fluid.ParamAttr(name="fc_weight", learning_rate=0.5, - regularizer=fluid.L2Decay(1.0), + regularizer=fluid.regularizer.L2Decay(1.0), trainable=True) + x = fluid.layers.data(name='X', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs) """ diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 5d348f0995..1d867d9194 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -125,13 +125,14 @@ def slice_variable(var_list, slice_count, min_block_size): class DistributeTranspilerConfig(object): """ - slice_var_up (bool): Do Tensor slice for pservers, default is True. - split_method (PSDispatcher): RoundRobin or HashName can be used - try to choose the best method to balance loads for pservers. - min_block_size (int): Minimum splitted element number in block. - According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 - We can use bandwidth effiently when data size is larger than 2MB.If you - want to change it, please be sure you see the slice_variable function. + Args: + slice_var_up (bool): Do Tensor slice for pservers, default is True. + split_method (PSDispatcher): RoundRobin or HashName can be used + try to choose the best method to balance loads for pservers. + min_block_size (int): Minimum splitted element number in block. + According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 + We can use bandwidth effiently when data size is larger than 2MB.If you + want to change it, please be sure you see the slice_variable function. """ slice_var_up = True @@ -163,35 +164,35 @@ class DistributeTranspiler(object): Examples: .. code-block:: python - # for pserver mode - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - current_endpoint = "192.168.0.1:6174" - trainer_id = 0 - trainers = 4 - role = os.getenv("PADDLE_TRAINING_ROLE") - - t = fluid.DistributeTranspiler() - t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) - if role == "PSERVER": - pserver_program = t.get_pserver_program(current_endpoint) - pserver_startup_program = t.get_startup_program(current_endpoint, + # for pserver mode + pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + current_endpoint = "192.168.0.1:6174" + trainer_id = 0 + trainers = 4 + role = os.getenv("PADDLE_TRAINING_ROLE") + + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id, pservers=pserver_endpoints, trainers=trainers) + if role == "PSERVER": + pserver_program = t.get_pserver_program(current_endpoint) + pserver_startup_program = t.get_startup_program(current_endpoint, pserver_program) - elif role == "TRAINER": - trainer_program = t.get_trainer_program() - - # for nccl2 mode - config = fluid.DistributeTranspilerConfig() - config.mode = "nccl2" - t = fluid.DistributeTranspiler(config=config) - t.transpile(trainer_id, workers=workers, current_endpoint=curr_ep) - exe = fluid.ParallelExecutor( - use_cuda, - loss_name=loss_var.name, - num_trainers=len(trainers.split(",)), - trainer_id=trainer_id - ) + elif role == "TRAINER": + trainer_program = t.get_trainer_program() + + # for nccl2 mode + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + t = fluid.DistributeTranspiler(config=config) + t.transpile(trainer_id, workers=workers, current_endpoint=curr_ep) + exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss_var.name, + num_trainers=len(trainers.split(",)), + trainer_id=trainer_id + ) """ def __init__(self, config=None): From b387a194106d6b7037a82cf7d23a6c3ce92a77bc Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 29 Nov 2018 08:52:43 +0000 Subject: [PATCH 106/719] optimize op with blas --- .../operators/hierarchical_sigmoid_op.cc | 1 + .../fluid/operators/hierarchical_sigmoid_op.h | 1 - .../fluid/operators/math/matrix_bit_code.cc | 57 +++++++++++++------ paddle/fluid/operators/math/matrix_bit_code.h | 4 ++ 4 files changed, 46 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 972dcf5494..b326b58319 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -158,6 +158,7 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); } ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); } protected: diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 07ff8f947e..b73a32af89 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -185,7 +185,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("W")); w_grad->set_rows(real_rows); // Build a map of id -> row_index to speed up finding the index of one id - w_grad->SyncIndex(); w_grad->set_height(w.dims()[0]); auto* w_grad_value = w_grad->mutable_value(); framework::DDim temp_dim(w.dims()); diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 71b9293eed..5a6e64b6f8 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -89,6 +89,8 @@ template void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, const framework::Tensor& weight, const framework::Tensor& input) { + auto blas = + GetBlas(platform::CPUDeviceContext()); size_t num_samples = tmat->dims()[0]; size_t tmat_width = tmat->dims()[1]; size_t input_width = input.dims()[1]; @@ -99,13 +101,12 @@ void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, for (size_t i = 0; i < num_samples; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); + const T* input_row = input_value + input_width * i; for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); + const T* weight_row = weight_value + weight_width * index; T sum = static_cast(0.0); - for (size_t k = 0; k < input_width; ++k) { - sum += weight_value[weight_width * index + k] * - input_value[input_width * i + k]; - } + sum = blas.DOT(input_width, weight_row, input_row); tmat_value[i * tmat_width + j] += sum; } } @@ -115,6 +116,8 @@ template void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight, const framework::Tensor& input) { + auto blas = + GetBlas(platform::CPUDeviceContext()); size_t num_samples = tmat.dims()[0]; size_t input_width = input.dims()[1]; size_t tmat_width = tmat.dims()[1]; @@ -122,16 +125,25 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, auto tmat_value = tmat.data(); auto weight_value = weight->data(); auto input_value = input.data(); + + std::unordered_map>> ops; + for (size_t i = 0; i < num_samples; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); + const T* input_value_row = input_value + input_width * i; + const T* tmat_row = tmat_value + i * tmat_width; for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - - for (size_t k = 0; k < input_width; ++k) { - weight_value[weight_width * index + k] += - tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; - } + ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } + } + for (auto& op : ops) { + auto& op_in_row = op.second; + for (auto& pair : op_in_row) { + auto& scale = pair.first; + auto* input_row = pair.second; + T* weight_row = weight_value + op.first * weight_width; + blas.AXPY(input_width, scale, input_row, weight_row); } } } @@ -140,6 +152,8 @@ template void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, framework::SelectedRows* weight, const framework::Tensor& input) { + auto blas = + GetBlas(platform::CPUDeviceContext()); size_t num_samples = tmat.dims()[0]; size_t input_width = input.dims()[1]; size_t tmat_width = tmat.dims()[1]; @@ -147,17 +161,28 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, auto tmat_value = tmat.data(); auto weight_value = weight->mutable_value()->data(); auto input_value = input.data(); + + std::unordered_map>> ops; + ops.reserve(weight->rows().size()); + for (size_t i = 0; i < num_samples; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); + const T* input_value_row = input_value + input_width * i; + const T* tmat_row = tmat_value + i * tmat_width; for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - for (size_t k = 0; k < input_width; ++k) { - int64_t row_index = weight->GetIndexFromId(static_cast(index)); - weight_value[row_index * weight_width + k] += - tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; - } + ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } + } + + for (auto& row : weight->rows()) { + auto& op_in_row = ops[row]; + for (auto& pair : op_in_row) { + auto& scale = pair.first; + auto* input_row = pair.second; + blas.AXPY(input_width, scale, input_row, weight_value); } + weight_value += weight_width; } } diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index c30bb52641..35ca73802b 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -13,10 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device_context.h" #if defined(_WIN32) From 6e67d0fb78b9dd23ced460d6b2bc54fcc5bde438 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Tue, 4 Dec 2018 14:38:44 +0800 Subject: [PATCH 107/719] layer fixes (#14591) * layer fixes test=develop * follow update test=develop --- .../paddle/fluid/layers/layer_function_generator.py | 11 ++++++++++- python/paddle/fluid/layers/tensor.py | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index eea0a362a0..09b1b30216 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -20,7 +20,7 @@ import string from six.moves import cStringIO from ..proto import framework_pb2 -from ..framework import OpProtoHolder, Variable +from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype_ from ..layer_helper import LayerHelper __all__ = [ @@ -178,6 +178,15 @@ def generate_layer_fn(op_type): "operator {0} must input same dtype. {1} vs {2}".format( op_type, dtype, each.dtype)) + if dtype is None: + arg_dtype = kwargs.get("dtype") + if arg_dtype: + if not isinstance(arg_dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(arg_dtype) + else: + dtype = arg_dtype + else: + dtype = core.VarDesc.VarType.FP32 return dtype def func(*args, **kwargs): diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index ff32c00104..49a486cf0c 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -622,7 +622,7 @@ def reverse(x, axis): out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='reverse', - inputs={'Input': x}, + inputs={'X': x}, outputs={'Out': [out]}, attrs={'axis': axis}) return out From 50a698525d6baf898db43ab552fd599acc3351b2 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 4 Dec 2018 16:33:10 +0800 Subject: [PATCH 108/719] Fix log level (#14692) --- paddle/fluid/memory/allocation/legacy_allocator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 26e2038a53..05b9a2cc08 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -86,7 +86,7 @@ struct NaiveAllocator { template <> void *Alloc(const platform::CPUPlace &place, size_t size) { - VLOG(1) << "Allocate " << size << " bytes on " << platform::Place(place); + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void *p = GetCPUBuddyAllocator()->Alloc(size); if (FLAGS_init_allocated_mem) { memset(p, 0xEF, size); @@ -97,7 +97,7 @@ void *Alloc(const platform::CPUPlace &place, size_t size) { template <> void Free(const platform::CPUPlace &place, void *p) { - VLOG(1) << "Free pointer=" << p << " on " << platform::Place(place); + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } From 2d0d037d8e9e1580d38e800fd1a0d0b0056422eb Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 09:45:50 +0000 Subject: [PATCH 109/719] fix while_op eager deletion bug add unittest test=develop --- paddle/fluid/framework/executor.cc | 2 +- .../fluid/operators/controlflow/while_op.cc | 84 +++++++++++++------ .../unittests/test_eager_deletion_mnist.py | 27 ++++++ .../test_eager_deletion_seresnext.py | 27 ++++++ .../test_eager_deletion_transformer.py | 27 ++++++ 5 files changed, 140 insertions(+), 27 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5823f33034..f443c2d8cf 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -101,7 +101,7 @@ static void DeleteUnusedTensors( if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { - VLOG(10) << "Erase tensor \'" << name << "\'"; + VLOG(2) << "Erase tensor \'" << name << "\'"; if (var->IsType()) { erase_tensors.insert(var->GetMutable()); } else if (var->IsType()) { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index d8410b4058..da7cad82d8 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes"; static constexpr char kX[] = "X"; static constexpr char kXGRAD[] = "X@GRAD"; static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +namespace { // NOLINT +static std::string GetSkipEagerDeletionVarsDebugString( + const std::vector &vars) { + std::string str = "Skip " + std::to_string(vars.size()) + + " var(s) in eager deletion mode: "; + for (auto &var : vars) { + str.append(var); + str.push_back(' '); + } + return str; +} +} // NOLINT class WhileOp : public framework::OperatorBase { public: @@ -59,21 +73,12 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr("is_test"); - auto &skip_eager_deletion_vars = - Attr>("skip_eager_deletion_vars"); - if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { - std::string debug_string = - "Skip " + std::to_string(skip_eager_deletion_vars.size()) + - " vars in eager deletion mode: "; - for (auto &var : skip_eager_deletion_vars) { - debug_string.append(var); - debug_string.push_back(' '); - } - VLOG(10) << debug_string; + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); } - auto ctx = - executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); @@ -110,7 +115,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); - AddAttr>("skip_eager_deletion_vars", + AddAttr>(kSkipEagerDeletionVars, "Vars that would skip eager deletion." "Users should not set this manually.") .SetDefault(std::vector()); @@ -137,7 +142,12 @@ class WhileGradOp : public framework::OperatorBase { framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare(*program, block->ID()); + + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + } + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable(); @@ -359,29 +369,51 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ + /* The followi_ng codes are used in eager deletion mode */ + std::unordered_set bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set skip_vars; + std::unordered_set fwd_skip_vars; for (auto *op_desc : grad_block->AllOps()) { + auto skippable = [&](const std::string &name) { + return !grad_block->HasVar(name) && + (fwd_block->HasVarRecursive(name) || + parent_block->HasVarRecursive(name)); + }; for (auto &in_arg_name : op_desc->InputArgumentNames()) { - // If input var of ops inside grad_block is not from grad_block, - // it cannot be deleted when forward while_op runs - if (in_arg_name != framework::kEmptyVarName && - !grad_block->HasVar(in_arg_name)) { - skip_vars.insert(in_arg_name); + if (skippable(in_arg_name)) { + fwd_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (skippable(out_arg_name)) { + fwd_skip_vars.insert(out_arg_name); } } } - if (!skip_vars.empty()) { + if (!fwd_skip_vars.empty()) { // FIXME(zjl): ugly const_cast here, maybe we should find a better way // to modify forward while_op auto &fwd_while_op = const_cast(ForwardOp()); - fwd_while_op.SetAttr( - "skip_eager_deletion_vars", - std::vector(skip_vars.begin(), skip_vars.end())); + fwd_while_op.SetAttr(kSkipEagerDeletionVars, + std::vector(fwd_skip_vars.begin(), + fwd_skip_vars.end())); + } + + // Find backward skip vars + auto fwd_input = Input(kX); + for (size_t i = 0; i < igs.size(); ++i) { + if (igs[i] == framework::kEmptyVarName) { + continue; + } + bwd_skip_vars.insert(igs[i]); + bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); } } + while_grad->SetAttr( + kSkipEagerDeletionVars, + std::vector(bwd_skip_vars.begin(), bwd_skip_vars.end())); return std::unique_ptr(while_grad); } diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py new file mode 100644 index 0000000000..7ec1f0ae75 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_mnist import TestMNIST + + +class EagerDeletionTestMNIST(TestMNIST): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py new file mode 100644 index 0000000000..2dcdbdb8f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_seresnext import TestResnet + + +class EagerDeletionTestSEResNext(TestResnet): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py new file mode 100644 index 0000000000..754d5fd409 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_transformer import TestTransformer + + +class EagerDeletionTestTransformer(TestTransformer): + pass + + +if __name__ == '__main__': + unittest.main() From e694d0c2e487a854103e0cc4796f92af6d27ccfd Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 09:45:50 +0000 Subject: [PATCH 110/719] fix while_op eager deletion bug add unittest test=develop --- .../details/eager_deletion_op_handle.cc | 2 + paddle/fluid/framework/executor.cc | 2 +- .../fluid/operators/controlflow/while_op.cc | 84 +++++++++++++------ .../unittests/test_eager_deletion_mnist.py | 27 ++++++ .../test_eager_deletion_seresnext.py | 27 ++++++ .../test_eager_deletion_transformer.py | 27 ++++++ 6 files changed, 142 insertions(+), 27 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 41f616035d..54715fed8d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,7 +16,9 @@ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5823f33034..f443c2d8cf 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -101,7 +101,7 @@ static void DeleteUnusedTensors( if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { - VLOG(10) << "Erase tensor \'" << name << "\'"; + VLOG(2) << "Erase tensor \'" << name << "\'"; if (var->IsType()) { erase_tensors.insert(var->GetMutable()); } else if (var->IsType()) { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index d8410b4058..da7cad82d8 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes"; static constexpr char kX[] = "X"; static constexpr char kXGRAD[] = "X@GRAD"; static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +namespace { // NOLINT +static std::string GetSkipEagerDeletionVarsDebugString( + const std::vector &vars) { + std::string str = "Skip " + std::to_string(vars.size()) + + " var(s) in eager deletion mode: "; + for (auto &var : vars) { + str.append(var); + str.push_back(' '); + } + return str; +} +} // NOLINT class WhileOp : public framework::OperatorBase { public: @@ -59,21 +73,12 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr("is_test"); - auto &skip_eager_deletion_vars = - Attr>("skip_eager_deletion_vars"); - if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { - std::string debug_string = - "Skip " + std::to_string(skip_eager_deletion_vars.size()) + - " vars in eager deletion mode: "; - for (auto &var : skip_eager_deletion_vars) { - debug_string.append(var); - debug_string.push_back(' '); - } - VLOG(10) << debug_string; + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); } - auto ctx = - executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); @@ -110,7 +115,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); - AddAttr>("skip_eager_deletion_vars", + AddAttr>(kSkipEagerDeletionVars, "Vars that would skip eager deletion." "Users should not set this manually.") .SetDefault(std::vector()); @@ -137,7 +142,12 @@ class WhileGradOp : public framework::OperatorBase { framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare(*program, block->ID()); + + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + } + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable(); @@ -359,29 +369,51 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ + /* The followi_ng codes are used in eager deletion mode */ + std::unordered_set bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set skip_vars; + std::unordered_set fwd_skip_vars; for (auto *op_desc : grad_block->AllOps()) { + auto skippable = [&](const std::string &name) { + return !grad_block->HasVar(name) && + (fwd_block->HasVarRecursive(name) || + parent_block->HasVarRecursive(name)); + }; for (auto &in_arg_name : op_desc->InputArgumentNames()) { - // If input var of ops inside grad_block is not from grad_block, - // it cannot be deleted when forward while_op runs - if (in_arg_name != framework::kEmptyVarName && - !grad_block->HasVar(in_arg_name)) { - skip_vars.insert(in_arg_name); + if (skippable(in_arg_name)) { + fwd_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (skippable(out_arg_name)) { + fwd_skip_vars.insert(out_arg_name); } } } - if (!skip_vars.empty()) { + if (!fwd_skip_vars.empty()) { // FIXME(zjl): ugly const_cast here, maybe we should find a better way // to modify forward while_op auto &fwd_while_op = const_cast(ForwardOp()); - fwd_while_op.SetAttr( - "skip_eager_deletion_vars", - std::vector(skip_vars.begin(), skip_vars.end())); + fwd_while_op.SetAttr(kSkipEagerDeletionVars, + std::vector(fwd_skip_vars.begin(), + fwd_skip_vars.end())); + } + + // Find backward skip vars + auto fwd_input = Input(kX); + for (size_t i = 0; i < igs.size(); ++i) { + if (igs[i] == framework::kEmptyVarName) { + continue; + } + bwd_skip_vars.insert(igs[i]); + bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); } } + while_grad->SetAttr( + kSkipEagerDeletionVars, + std::vector(bwd_skip_vars.begin(), bwd_skip_vars.end())); return std::unique_ptr(while_grad); } diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py new file mode 100644 index 0000000000..7ec1f0ae75 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_mnist import TestMNIST + + +class EagerDeletionTestMNIST(TestMNIST): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py new file mode 100644 index 0000000000..2dcdbdb8f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_seresnext import TestResnet + + +class EagerDeletionTestSEResNext(TestResnet): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py new file mode 100644 index 0000000000..754d5fd409 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_transformer import TestTransformer + + +class EagerDeletionTestTransformer(TestTransformer): + pass + + +if __name__ == '__main__': + unittest.main() From 968dd3c078dc5b4cffde2b694384064c5993a0d7 Mon Sep 17 00:00:00 2001 From: liuhongyu Date: Tue, 4 Dec 2018 18:38:47 +0800 Subject: [PATCH 111/719] add cudnn 5 support; test=develop --- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 9 +++++++++ paddle/fluid/platform/dynload/cudnn.h | 10 ++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index e01070c7b8..2c9800b886 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -177,11 +177,20 @@ struct CudnnRNNCache { seed_)); CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + +#if CUDNN_VERSION >= 6000 CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); +#else + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_DATA_FLOAT)); +#endif CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 213cd8a9ce..d18030dd76 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -125,8 +125,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnRNNBackwardWeights); \ __macro(cudnnRNNForwardInference); \ __macro(cudnnDestroyDropoutDescriptor); \ - __macro(cudnnDestroyRNNDescriptor); \ - __macro(cudnnSetRNNDescriptor_v6); + __macro(cudnnDestroyRNNDescriptor); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) @@ -165,6 +164,13 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif +// APIs in R6 +#if CUDNN_VERSION >= 6000 +#define CUDNN_DNN_ROUTINE_EACH_R6(__macro) \ + __macro(cudnnSetRNNDescriptor_v6); +CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + #if CUDNN_VERSION >= 7001 #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ __macro(cudnnSetConvolutionGroupCount); \ From 29d9fb53fcb2ca3d64ac7209d80bb40b974f3a9a Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Tue, 4 Dec 2018 19:50:33 +0800 Subject: [PATCH 112/719] [Feature] multi process multi gpu dist training, boost v100 performance by 20% (#14661) * wip multi process multi gpu dist training * workable for p2p * update test=develop * change back env name test=develop * fix alloc init * fix cpu build test=devlop * fix mac tests test=develop * refine code * refine test=develop --- .../framework/details/all_reduce_op_handle.cc | 7 ++++ .../fluid/framework/details/build_strategy.cc | 2 + .../details/multi_devices_graph_pass.cc | 8 +++- .../memory/allocation/legacy_allocator.cc | 24 +++++++----- paddle/fluid/platform/gpu_info.cc | 28 ++++++++++++++ paddle/fluid/platform/gpu_info.h | 4 ++ paddle/fluid/platform/init.cc | 20 ++++------ paddle/fluid/platform/nccl_helper.h | 17 ++++++--- paddle/fluid/string/CMakeLists.txt | 1 + paddle/fluid/string/split.h | 37 +++++++++++++++++++ paddle/fluid/string/split_test.cc | 28 ++++++++++++++ python/paddle/fluid/__init__.py | 2 +- python/paddle/fluid/parallel_executor.py | 9 ++++- 13 files changed, 156 insertions(+), 31 deletions(-) create mode 100644 paddle/fluid/string/split.h create mode 100644 paddle/fluid/string/split_test.cc diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index a003995ae3..e8bf53e160 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -48,7 +48,14 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, void AllReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); +// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, +// this is a distributed or inter-process call, find a better way. +#ifdef PADDLE_WITH_CUDA + if (NoDummyInputSize() == 1 && + local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) { +#else if (NoDummyInputSize() == 1) { +#endif return; // No need to all reduce when GPU count = 1; } else { // Wait input done diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 523f9eadf2..1e1b945f63 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -62,6 +62,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { auto multi_devices_pass = AppendPass("multi_devices_pass"); multi_devices_pass->SetNotOwned("strategy", &strategy_); + multi_devices_pass->Set("num_trainers", + new int(strategy_.num_trainers_)); // Add a graph print pass to record a graph with device info. if (!strategy_.debug_graphviz_path_.empty()) { diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 03f5f2e73a..cbae5321d9 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -133,6 +133,7 @@ static const char kPlaces[] = "places"; static const char kParams[] = "params"; static const char kLocalScopes[] = "local_scopes"; static const char kStrategy[] = "strategy"; +static const char kNumTrainers[] = "num_trainers"; void MultiDevSSAGraphBuilder::Init() const { all_vars_.clear(); @@ -299,6 +300,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; + int num_trainers = Get(kNumTrainers); + for (auto &node : nodes) { if (node->IsVar() && node->Var()) { all_vars_.emplace(node->Name(), node->Var()); @@ -383,7 +386,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } - if (!is_forwarding && places_.size() > 1) { + if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. if (static_cast(boost::get(node->Op()->GetAttr( @@ -895,4 +898,5 @@ REGISTER_PASS(multi_devices_pass, .RequirePassAttr(paddle::framework::details::kPlaces) .RequirePassAttr(paddle::framework::details::kParams) .RequirePassAttr(paddle::framework::details::kLocalScopes) - .RequirePassAttr(paddle::framework::details::kStrategy); + .RequirePassAttr(paddle::framework::details::kStrategy) + .RequirePassAttr(paddle::framework::details::kNumTrainers); diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 05b9a2cc08..64aa63ffe9 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -14,11 +14,13 @@ #include "paddle/fluid/memory/allocation/legacy_allocator.h" #include +#include #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/string/printf.h" +#include "paddle/fluid/string/split.h" DEFINE_bool(init_allocated_mem, false, "It is a mistake that the values of the memory allocated by " @@ -110,19 +112,21 @@ size_t Used(const platform::CPUPlace &place) { BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { static std::once_flag init_flag; static detail::BuddyAllocator **a_arr = nullptr; + static std::vector devices; std::call_once(init_flag, [gpu_id]() { - int gpu_num = platform::GetCUDADeviceCount(); - PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id, - gpu_num); + devices = platform::GetSelectedDevices(); + int gpu_num = devices.size(); a_arr = new BuddyAllocator *[gpu_num]; - for (int i = 0; i < gpu_num; i++) { + for (size_t i = 0; i < devices.size(); ++i) { + int dev_id = devices[i]; a_arr[i] = nullptr; - platform::SetDeviceId(i); - a_arr[i] = new BuddyAllocator( - std::unique_ptr(new detail::GPUAllocator(i)), - platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + platform::SetDeviceId(dev_id); + a_arr[i] = new BuddyAllocator(std::unique_ptr( + new detail::GPUAllocator(dev_id)), + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); VLOG(10) << "\n\nNOTE: each GPU device use " << FLAGS_fraction_of_gpu_memory_to_use * 100 @@ -134,7 +138,9 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { }); platform::SetDeviceId(gpu_id); - return a_arr[gpu_id]; + auto pos = std::distance(devices.begin(), + std::find(devices.begin(), devices.end(), gpu_id)); + return a_arr[pos]; } #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 6954e4c6a9..ca89d91aad 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/split.h" #ifndef _WIN32 constexpr static float fraction_of_gpu_memory_to_use = 0.92f; @@ -45,6 +46,15 @@ DEFINE_bool( "input and output must be half precision) and recurrent neural networks " "(RNNs)."); +DEFINE_string(selected_gpus, "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (GPU). If you want to use " + "all visible devices, set this to empty string. NOTE: the " + "reason of doing this is that we want to use P2P communication" + "between GPU devices, use CUDA_VISIBLE_DEVICES can only use" + "share-memory only."); + namespace paddle { namespace platform { @@ -121,6 +131,24 @@ int GetCurrentDeviceId() { return device_id; } +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices() { + // use user specified GPUs in single-node multi-process mode. + std::vector devices; + if (!FLAGS_selected_gpus.empty()) { + auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index 6a0b3c8e02..1e1ab2503f 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include +#include namespace paddle { namespace platform { @@ -47,6 +48,9 @@ int GetCUDAMaxThreadsPerMultiProcessor(int i); //! Get the current GPU device id in system. int GetCurrentDeviceId(); +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices(); + //! Set the GPU device id for next execution. void SetDeviceId(int device_id); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 258779ba51..51b46450e4 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/string/split.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -82,10 +83,8 @@ void InitDevices(bool init_p2p) { std::vector devices; #ifdef PADDLE_WITH_CUDA try { - int count = platform::GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } + // use user specified GPUs in single-node multi-process mode. + devices = platform::GetSelectedDevices(); } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; } @@ -95,20 +94,15 @@ void InitDevices(bool init_p2p) { void InitDevices(bool init_p2p, const std::vector devices) { std::vector places; - int count = 0; -#ifdef PADDLE_WITH_CUDA - try { - count = platform::GetCUDADeviceCount(); - } catch (const std::exception &exp) { - LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; - } -#endif for (size_t i = 0; i < devices.size(); ++i) { - if (devices[i] >= count || devices[i] < 0) { + // In multi process multi gpu mode, we may have gpuid = 7 + // but count = 1. + if (devices[i] < 0) { LOG(WARNING) << "Invalid devices id."; continue; } + places.emplace_back(platform::CUDAPlace(devices[i])); } if (init_p2p) { diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index fc903b548c..7c539d25f6 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -97,7 +97,7 @@ struct NCCLContextMap { order_.size(), contexts_.size(), "NCCL Context Map does not support contain two or more same device"); - if (places.size() <= 1) { + if (places.size() <= 1 && num_trainers == 1) { return; } std::unique_ptr comms(new ncclComm_t[order_.size()]); @@ -111,12 +111,19 @@ struct NCCLContextMap { { int nranks = num_trainers * order_.size(); NCCLGroupGuard gurad; - for (auto &gpu_id : order_) { - int rank = trainer_id * order_.size() + gpu_id; - VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks; + for (size_t i = 0; i < order_.size(); ++i) { + int gpu_id = order_[i]; + int rank; + if (order_.size() > 1) { + rank = trainer_id * order_.size() + i; + } else { + rank = trainer_id; + } + VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks + << "gpu id: " << gpu_id; PADDLE_ENFORCE(cudaSetDevice(gpu_id)); PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( - comms.get() + gpu_id, nranks, *nccl_id, rank)); + comms.get() + i, nranks, *nccl_id, rank)); } } } diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index 8572dc1e8e..169a925d12 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -3,3 +3,4 @@ cc_library(pretty_log SRCS pretty_log.cc) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) +cc_test(split_test SRCS split_test.cc) diff --git a/paddle/fluid/string/split.h b/paddle/fluid/string/split.h new file mode 100644 index 0000000000..ccb96b8a9c --- /dev/null +++ b/paddle/fluid/string/split.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +namespace paddle { +namespace string { + +static inline std::vector Split(std::string const& original, + char separator) { + std::vector results; + std::string token; + std::istringstream is(original); + while (std::getline(is, token, separator)) { + if (!token.empty()) { + results.push_back(token); + } + } + return results; +} + +} // namespace string +} // namespace paddle diff --git a/paddle/fluid/string/split_test.cc b/paddle/fluid/string/split_test.cc new file mode 100644 index 0000000000..c85dc1eed4 --- /dev/null +++ b/paddle/fluid/string/split_test.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/string/split.h" + +#include + +#include "gtest/gtest.h" + +TEST(StringSplit, StringSplit) { + std::string to_split = "0,1,2,3,4,5"; + int i = 0; + for (auto s : paddle::string::Split(to_split, ',')) { + EXPECT_EQ(atoi(s.c_str()), i); + i++; + } +} diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index a1ffbf4262..2a53519188 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -147,7 +147,7 @@ def __bootstrap__(): read_env_flags += [ 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', - 'cudnn_exhaustive_search' + 'cudnn_exhaustive_search', 'selected_gpus' ] core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index bdcd045341..dc27a8eabb 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -95,7 +95,14 @@ class ParallelExecutor(object): self._places = [] self._act_places = [] if use_cuda: - for i in six.moves.range(core.get_cuda_device_count()): + gpus = [] + gpus_env = os.getenv("FLAGS_selected_gpus") + if gpus_env: + gpus = [int(s) for s in gpus_env.split(",")] + else: + for i in six.moves.range(core.get_cuda_device_count()): + gpus.append(i) + for i in gpus: p = core.Place() self._act_places.append(core.CUDAPlace(i)) p.set_place(self._act_places[-1]) From 208f9125121bf5e7e314de787aa0802aa2e2f2bd Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Fri, 30 Nov 2018 15:24:04 +0100 Subject: [PATCH 113/719] Implement MKL-DNN Concat test=develop --- paddle/fluid/operators/concat_mkldnn_op.cc | 217 ++++++++++++++++++ paddle/fluid/operators/concat_op.cc | 20 ++ .../tests/unittests/test_concat_mkldnn_op.py | 56 +++++ 3 files changed, 293 insertions(+) create mode 100644 paddle/fluid/operators/concat_mkldnn_op.cc create mode 100644 python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py diff --git a/paddle/fluid/operators/concat_mkldnn_op.cc b/paddle/fluid/operators/concat_mkldnn_op.cc new file mode 100644 index 0000000000..c6652b7885 --- /dev/null +++ b/paddle/fluid/operators/concat_mkldnn_op.cc @@ -0,0 +1,217 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using framework::DataLayout; +using framework::Tensor; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::concat; +using mkldnn::stream; +using platform::to_void_cast; + +// Generate keys for storing/retriving primitives for this operator +// TODO(jczaja): Make hashing function more optimial +static std::string gethash(const memory::dims& input_dims, + const std::string& pooling_type, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string& suffix) { + auto dims2str = [](const memory::dims& operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; + }; + return dims2str(input_dims) + dims2str(ksize) + dims2str(strides) + + dims2str(paddings) + pooling_type + suffix; +} + +static void EnforceLayouts(const std::vector inputs) { + for (auto* input : inputs) { + const bool is_layout_correct = input->layout() == DataLayout::kMKLDNN; + const bool is_format_defined = input->format() != + memory::format::format_undef; + PADDLE_ENFORCE(is_layout_correct && is_format_defined, + "Wrong layout/format set for Input tensor"); + } +} + +static memory::primitive_desc CreateMemPrimDesc( + const framework::Tensor& input, const mkldnn::engine& engine) { + constexpr auto data_type = mkldnn::memory::f32; + const auto dims = paddle::framework::vectorize2int(input.dims()); + const auto format = input.format(); + auto description = memory::desc(dims, data_type, format); + auto mem_prim_desc = memory::primitive_desc(description, engine); + return mem_prim_desc; +} + +static platform::CPUPlace GetCpuPlace( + const paddle::framework::ExecutionContext& ctx) { + auto place = ctx.GetPlace(); + PADDLE_ENFORCE(paddle::platform::is_cpu_place(place), + "It must use CPUPlace."); + return boost::get(place); +} + +template +class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + auto place = GetCpuPlace(ctx); + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto multi_input = ctx.MultiInput("X"); + framework::Tensor* output = ctx.Output("Out"); + int64_t concat_axis = static_cast(ctx.Attr("axis")); + + EnforceLayouts(multi_input); + + std::vector srcs_pd; + std::vector srcs; + for (size_t i = 0; i < multi_input.size(); i++) { + auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine); + srcs_pd.push_back(mem_prim_desc); + srcs.push_back(memory(mem_prim_desc, to_void_cast(multi_input[i]->data()))); + } + auto dst_dims = paddle::framework::vectorize2int(output->dims()); + auto dst_desc = memory::desc(dst_dims, mkldnn::memory::f32, memory::format::any); + auto concat_pd = concat::primitive_desc(dst_desc, static_cast(concat_axis), srcs_pd); + auto dst_mem = memory(concat_pd.dst_primitive_desc(), output->mutable_data(place)); + + std::vector inputs; //= {srcs}; + inputs.reserve(srcs.size()); + for (size_t i = 0; i < srcs.size(); i++) { + inputs.push_back(srcs[i]); + } + auto concat_prim = concat(concat_pd, inputs, dst_mem); + + std::vector pipeline; + pipeline.push_back(concat_prim); + stream(stream::kind::eager).submit(pipeline).wait(); // TODO(mgallus): When this is not workin' split into decl and def + + /* + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + auto input_format = input->format(); + memory::format output_format{memory::format::format_undef}; + + const std::string key = gethash(src_tz, pooling_type, ksize, strides, + paddings, ctx.op().Output("Out")); + const std::string key_pool_p = key + "@pool_p"; + const std::string key_pool_pd = key + "@pool_pd"; + const std::string key_pool_src_mem_p = key + "@pool_src_mem_p"; + const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p"; + const std::string key_pool_workspace_memory = + key + "@pool_workspace_memory"; + + auto pool_p = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); + if (pool_p == nullptr) { + const std::vector& padding_left_top(paddings); + std::vector padding_right_bottom(paddings); + bool ceil_mode = ctx.Attr("ceil_mode"); + if (ceil_mode) { + CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides, + padding_right_bottom); + } + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), input_format); + + auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, + mkldnn::memory::format::any); + + std::shared_ptr pool_pd = + CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top, + padding_right_bottom, ksize, pooling_type, + mkldnn_engine, ceil_mode, is_test); + + // save pool_pd into global device context to be referred in backward path + if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd); + + auto src_memory = std::make_shared(pool_pd->src_primitive_desc(), + to_void_cast(input_data)); + auto dst_memory = + std::make_shared(pool_pd->dst_primitive_desc(), output_data); + + dev_ctx.SetBlob(key_pool_src_mem_p, src_memory); + dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory); + + if (is_test) { + pool_p = std::make_shared(*pool_pd, *src_memory, + *dst_memory); + } else { + std::shared_ptr workspace_memory = + CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine); + + // save pool_workspace_memory to be referred in backward path + dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); + + pool_p = std::make_shared( + *pool_pd, *src_memory, *dst_memory, *workspace_memory); + } + + dev_ctx.SetBlob(key_pool_p, pool_p); + + output_format = + (memory::format)dst_memory->get_primitive_desc().desc().data.format; + } else { + // Primitives already exist + auto pool_src_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); + PADDLE_ENFORCE(pool_src_memory_p != nullptr, + "Fail to find pooling src mem_p in device context"); + auto pool_dst_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); + PADDLE_ENFORCE(pool_dst_memory_p != nullptr, + "Fail to find pooling dst mem_p in device context"); + pool_src_memory_p->set_data_handle(to_void_cast(input_data)); + pool_dst_memory_p->set_data_handle(output_data); + + output_format = (memory::format)pool_dst_memory_p->get_primitive_desc() + .desc() + .data.format; + } + + // push primitive to stream and wait until it's executed + std::vector pipeline{*(pool_p.get())}; + stream(stream::kind::eager).submit(pipeline).wait(); + */ + output->mutable_data(place); + output->set_layout(DataLayout::kMKLDNN); + output->set_format((memory::format)dst_mem.get_primitive_desc().desc() + .data.format); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConcatMKLDNNOpKernel) diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 57817da71a..7e58f9cde1 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include namespace paddle { namespace operators { @@ -59,6 +60,21 @@ class ConcatOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("X", /*->*/ "Out"); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]); + + #ifdef PADDLE_WITH_MKLDNN + if (platform::CanMKLDNNBeUsed(ctx)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } + #endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { @@ -66,6 +82,9 @@ class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "Input tensors of concat operator.").AsDuplicable(); AddOutput("Out", "Output tensor of concat operator."); + AddAttr("use_mkldnn", + "(bool, default false) Indicates if MKL-DNN kernel will be used") + .SetDefault(false); AddAttr("axis", "The axis along which the input tensors will be concatenated.") .SetDefault(0); @@ -82,6 +101,7 @@ Examples: [5,6]] )DOC"); + } }; diff --git a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py new file mode 100644 index 0000000000..c590687a24 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py @@ -0,0 +1,56 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +from test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3 + + +class TestMKLDNNConcatOp(TestConcatOp): + def setUp(self): + super(TestMKLDNNConcatOp, self).setUp() + self.attrs["use_mkldnn"] = True + + def test_check_grad(self): + pass + + def init_kernel_type(self): + self.use_mkldnn = True + +class TestMKLDNNConcatOp2(TestConcatOp2): + def setUp(self): + super(TestMKLDNNConcatOp2, self).setUp() + self.attrs["use_mkldnn"] = True + + def test_check_grad(self): + pass + + def init_kernel_type(self): + self.use_mkldnn = True + +class TestMKLDNNConcatOp3(TestConcatOp3): + def setUp(self): + super(TestMKLDNNConcatOp3, self).setUp() + self.attrs["use_mkldnn"] = True + + def test_check_grad(self): + pass + + def init_kernel_type(self): + self.use_mkldnn = True + + +if __name__ == '__main__': + unittest.main() From 738069e491d5649b39706aed2526622a1594332c Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Mon, 3 Dec 2018 14:45:27 +0100 Subject: [PATCH 114/719] Refactor MKL-DNN Concat test=develop --- paddle/fluid/operators/concat_mkldnn_op.cc | 209 +++++++-------------- 1 file changed, 72 insertions(+), 137 deletions(-) diff --git a/paddle/fluid/operators/concat_mkldnn_op.cc b/paddle/fluid/operators/concat_mkldnn_op.cc index c6652b7885..37b2788d63 100644 --- a/paddle/fluid/operators/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/concat_mkldnn_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -26,25 +27,6 @@ using mkldnn::concat; using mkldnn::stream; using platform::to_void_cast; -// Generate keys for storing/retriving primitives for this operator -// TODO(jczaja): Make hashing function more optimial -static std::string gethash(const memory::dims& input_dims, - const std::string& pooling_type, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string& suffix) { - auto dims2str = [](const memory::dims& operand_dims) { - std::string dstr = ""; - for (size_t i = 0; i < operand_dims.size(); ++i) { - dstr += std::to_string(operand_dims[i]) + "-"; - } - return dstr; - }; - return dims2str(input_dims) + dims2str(ksize) + dims2str(strides) + - dims2str(paddings) + pooling_type + suffix; -} - static void EnforceLayouts(const std::vector inputs) { for (auto* input : inputs) { const bool is_layout_correct = input->layout() == DataLayout::kMKLDNN; @@ -56,7 +38,7 @@ static void EnforceLayouts(const std::vector inputs) { } static memory::primitive_desc CreateMemPrimDesc( - const framework::Tensor& input, const mkldnn::engine& engine) { + const Tensor& input, const mkldnn::engine& engine) { constexpr auto data_type = mkldnn::memory::f32; const auto dims = paddle::framework::vectorize2int(input.dims()); const auto format = input.format(); @@ -65,6 +47,11 @@ static memory::primitive_desc CreateMemPrimDesc( return mem_prim_desc; } +static mkldnn::memory::format GetDstMemFormat( + const concat::primitive_desc& concat_pd) { + return (memory::format)concat_pd.dst_primitive_desc().desc().data.format; +} + static platform::CPUPlace GetCpuPlace( const paddle::framework::ExecutionContext& ctx) { auto place = ctx.GetPlace(); @@ -73,139 +60,87 @@ static platform::CPUPlace GetCpuPlace( return boost::get(place); } -template -class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto place = GetCpuPlace(ctx); +static const mkldnn::engine& GetMKLDNNEngine( + const paddle::framework::ExecutionContext& ctx) { auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); + return dev_ctx.GetEngine(); +} - auto multi_input = ctx.MultiInput("X"); - framework::Tensor* output = ctx.Output("Out"); - int64_t concat_axis = static_cast(ctx.Attr("axis")); +template +class ConcatPrimitiveFactory { + public: + concat::primitive_desc CreateConcatPrimDescriptor( + const std::vector multi_input, Tensor* output, + int concat_axis, const mkldnn::engine& mkldnn_engine) { + CreateSourcesDescriptors(multi_input, mkldnn_engine); + auto dst_desc = CreateDstMemDescriptor(output); + return concat::primitive_desc(dst_desc, concat_axis, srcs_pd); + } - EnforceLayouts(multi_input); + concat CreateConcatPrimitive(const concat::primitive_desc& concat_pd, + Tensor* output, platform::CPUPlace place) { + CreateSourcePrimitiveAts(); + auto dst_mem = CreateDstMemory(concat_pd, output, place); + return concat(concat_pd, inputs, dst_mem); + } + + private: + memory::desc CreateDstMemDescriptor(Tensor* output) { + auto dst_dims = paddle::framework::vectorize2int(output->dims()); + return memory::desc(dst_dims, platform::MKLDNNGetDataType(), + memory::format::any); + } + + mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd, + Tensor* output, platform::CPUPlace place) { + return memory(concat_pd.dst_primitive_desc(), + output->mutable_data(place)); + } - std::vector srcs_pd; - std::vector srcs; + void CreateSourcesDescriptors(const std::vector multi_input, + const mkldnn::engine& mkldnn_engine) { for (size_t i = 0; i < multi_input.size(); i++) { auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine); srcs_pd.push_back(mem_prim_desc); - srcs.push_back(memory(mem_prim_desc, to_void_cast(multi_input[i]->data()))); + srcs.push_back(memory(mem_prim_desc, + to_void_cast(multi_input[i]->data()))); } - auto dst_dims = paddle::framework::vectorize2int(output->dims()); - auto dst_desc = memory::desc(dst_dims, mkldnn::memory::f32, memory::format::any); - auto concat_pd = concat::primitive_desc(dst_desc, static_cast(concat_axis), srcs_pd); - auto dst_mem = memory(concat_pd.dst_primitive_desc(), output->mutable_data(place)); + } - std::vector inputs; //= {srcs}; + void CreateSourcePrimitiveAts() { inputs.reserve(srcs.size()); for (size_t i = 0; i < srcs.size(); i++) { inputs.push_back(srcs[i]); } - auto concat_prim = concat(concat_pd, inputs, dst_mem); - - std::vector pipeline; - pipeline.push_back(concat_prim); - stream(stream::kind::eager).submit(pipeline).wait(); // TODO(mgallus): When this is not workin' split into decl and def - - /* - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - - auto input_format = input->format(); - memory::format output_format{memory::format::format_undef}; - - const std::string key = gethash(src_tz, pooling_type, ksize, strides, - paddings, ctx.op().Output("Out")); - const std::string key_pool_p = key + "@pool_p"; - const std::string key_pool_pd = key + "@pool_pd"; - const std::string key_pool_src_mem_p = key + "@pool_src_mem_p"; - const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p"; - const std::string key_pool_workspace_memory = - key + "@pool_workspace_memory"; - - auto pool_p = - std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); - if (pool_p == nullptr) { - const std::vector& padding_left_top(paddings); - std::vector padding_right_bottom(paddings); - bool ceil_mode = ctx.Attr("ceil_mode"); - if (ceil_mode) { - CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides, - padding_right_bottom); - } - auto src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), input_format); - - auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, - mkldnn::memory::format::any); - - std::shared_ptr pool_pd = - CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top, - padding_right_bottom, ksize, pooling_type, - mkldnn_engine, ceil_mode, is_test); - - // save pool_pd into global device context to be referred in backward path - if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd); - - auto src_memory = std::make_shared(pool_pd->src_primitive_desc(), - to_void_cast(input_data)); - auto dst_memory = - std::make_shared(pool_pd->dst_primitive_desc(), output_data); - - dev_ctx.SetBlob(key_pool_src_mem_p, src_memory); - dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory); - - if (is_test) { - pool_p = std::make_shared(*pool_pd, *src_memory, - *dst_memory); - } else { - std::shared_ptr workspace_memory = - CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine); - - // save pool_workspace_memory to be referred in backward path - dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); - - pool_p = std::make_shared( - *pool_pd, *src_memory, *dst_memory, *workspace_memory); - } - - dev_ctx.SetBlob(key_pool_p, pool_p); - - output_format = - (memory::format)dst_memory->get_primitive_desc().desc().data.format; - } else { - // Primitives already exist - auto pool_src_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); - PADDLE_ENFORCE(pool_src_memory_p != nullptr, - "Fail to find pooling src mem_p in device context"); - auto pool_dst_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); - PADDLE_ENFORCE(pool_dst_memory_p != nullptr, - "Fail to find pooling dst mem_p in device context"); - pool_src_memory_p->set_data_handle(to_void_cast(input_data)); - pool_dst_memory_p->set_data_handle(output_data); - - output_format = (memory::format)pool_dst_memory_p->get_primitive_desc() - .desc() - .data.format; - } + } + + private: + std::vector srcs_pd; + std::vector srcs; + std::vector inputs; +}; + +template +class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + auto place = GetCpuPlace(ctx); + const auto& mkldnn_engine = GetMKLDNNEngine(ctx); + + auto multi_input = ctx.MultiInput("X"); + EnforceLayouts(multi_input); + Tensor* output = ctx.Output("Out"); + int64_t concat_axis = static_cast(ctx.Attr("axis")); + + ConcatPrimitiveFactory prim_creator; + auto concat_pd = prim_creator.CreateConcatPrimDescriptor(multi_input, + output, static_cast(concat_axis), mkldnn_engine); + auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place); + stream(stream::kind::eager).submit({concat}).wait(); - // push primitive to stream and wait until it's executed - std::vector pipeline{*(pool_p.get())}; - stream(stream::kind::eager).submit(pipeline).wait(); - */ - output->mutable_data(place); output->set_layout(DataLayout::kMKLDNN); - output->set_format((memory::format)dst_mem.get_primitive_desc().desc() - .data.format); + output->set_format(GetDstMemFormat(concat_pd)); } }; } // namespace operators From f2a880421ebdfb6e0c9b2b3809d8ba8449b09ea2 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Tue, 4 Dec 2018 10:10:02 +0100 Subject: [PATCH 115/719] Fix style @ concat integration and tests test=develop --- paddle/fluid/operators/concat_mkldnn_op.cc | 33 +++++++++---------- paddle/fluid/operators/concat_op.cc | 33 ++++++++++--------- .../tests/unittests/test_concat_mkldnn_op.py | 2 ++ 3 files changed, 35 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/operators/concat_mkldnn_op.cc b/paddle/fluid/operators/concat_mkldnn_op.cc index 37b2788d63..b8456aac9d 100644 --- a/paddle/fluid/operators/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/concat_mkldnn_op.cc @@ -30,15 +30,15 @@ using platform::to_void_cast; static void EnforceLayouts(const std::vector inputs) { for (auto* input : inputs) { const bool is_layout_correct = input->layout() == DataLayout::kMKLDNN; - const bool is_format_defined = input->format() != - memory::format::format_undef; + const bool is_format_defined = + input->format() != memory::format::format_undef; PADDLE_ENFORCE(is_layout_correct && is_format_defined, "Wrong layout/format set for Input tensor"); } } -static memory::primitive_desc CreateMemPrimDesc( - const Tensor& input, const mkldnn::engine& engine) { +static memory::primitive_desc CreateMemPrimDesc(const Tensor& input, + const mkldnn::engine& engine) { constexpr auto data_type = mkldnn::memory::f32; const auto dims = paddle::framework::vectorize2int(input.dims()); const auto format = input.format(); @@ -48,8 +48,8 @@ static memory::primitive_desc CreateMemPrimDesc( } static mkldnn::memory::format GetDstMemFormat( - const concat::primitive_desc& concat_pd) { - return (memory::format)concat_pd.dst_primitive_desc().desc().data.format; + const concat::primitive_desc& concat_pd) { + return (memory::format)concat_pd.dst_primitive_desc().desc().data.format; } static platform::CPUPlace GetCpuPlace( @@ -61,10 +61,9 @@ static platform::CPUPlace GetCpuPlace( } static const mkldnn::engine& GetMKLDNNEngine( - const paddle::framework::ExecutionContext& ctx) { - auto& dev_ctx = - ctx.template device_context(); - return dev_ctx.GetEngine(); + const paddle::framework::ExecutionContext& ctx) { + auto& dev_ctx = ctx.template device_context(); + return dev_ctx.GetEngine(); } template @@ -89,7 +88,7 @@ class ConcatPrimitiveFactory { memory::desc CreateDstMemDescriptor(Tensor* output) { auto dst_dims = paddle::framework::vectorize2int(output->dims()); return memory::desc(dst_dims, platform::MKLDNNGetDataType(), - memory::format::any); + memory::format::any); } mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd, @@ -101,10 +100,10 @@ class ConcatPrimitiveFactory { void CreateSourcesDescriptors(const std::vector multi_input, const mkldnn::engine& mkldnn_engine) { for (size_t i = 0; i < multi_input.size(); i++) { - auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine); - srcs_pd.push_back(mem_prim_desc); - srcs.push_back(memory(mem_prim_desc, - to_void_cast(multi_input[i]->data()))); + auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine); + srcs_pd.push_back(mem_prim_desc); + srcs.push_back( + memory(mem_prim_desc, to_void_cast(multi_input[i]->data()))); } } @@ -134,8 +133,8 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { int64_t concat_axis = static_cast(ctx.Attr("axis")); ConcatPrimitiveFactory prim_creator; - auto concat_pd = prim_creator.CreateConcatPrimDescriptor(multi_input, - output, static_cast(concat_axis), mkldnn_engine); + auto concat_pd = prim_creator.CreateConcatPrimDescriptor( + multi_input, output, static_cast(concat_axis), mkldnn_engine); auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place); stream(stream::kind::eager).submit({concat}).wait(); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 7e58f9cde1..7466107cf2 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/concat_op.h" +#include #include #include -#include namespace paddle { namespace operators { @@ -63,18 +63,19 @@ class ConcatOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input_data_type = framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]); - - #ifdef PADDLE_WITH_MKLDNN - if (platform::CanMKLDNNBeUsed(ctx)) { - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } - #endif - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]); + +#ifdef PADDLE_WITH_MKLDNN + if (platform::CanMKLDNNBeUsed(ctx)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { @@ -82,9 +83,10 @@ class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "Input tensors of concat operator.").AsDuplicable(); AddOutput("Out", "Output tensor of concat operator."); - AddAttr("use_mkldnn", - "(bool, default false) Indicates if MKL-DNN kernel will be used") - .SetDefault(false); + AddAttr( + "use_mkldnn", + "(bool, default false) Indicates if MKL-DNN kernel will be used") + .SetDefault(false); AddAttr("axis", "The axis along which the input tensors will be concatenated.") .SetDefault(0); @@ -101,7 +103,6 @@ Examples: [5,6]] )DOC"); - } }; diff --git a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py index c590687a24..0ea44c0e4e 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py @@ -29,6 +29,7 @@ class TestMKLDNNConcatOp(TestConcatOp): def init_kernel_type(self): self.use_mkldnn = True + class TestMKLDNNConcatOp2(TestConcatOp2): def setUp(self): super(TestMKLDNNConcatOp2, self).setUp() @@ -40,6 +41,7 @@ class TestMKLDNNConcatOp2(TestConcatOp2): def init_kernel_type(self): self.use_mkldnn = True + class TestMKLDNNConcatOp3(TestConcatOp3): def setUp(self): super(TestMKLDNNConcatOp3, self).setUp() From 6fdbb365ce8e726fdc22665bd65c9c2e4ae43859 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Tue, 4 Dec 2018 13:07:51 +0100 Subject: [PATCH 116/719] Include MKL-DNN header to concat op only when flag is set test=develop --- paddle/fluid/operators/concat_op.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 7466107cf2..194f9cf503 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -13,11 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/concat_op.h" - -#include #include #include +#ifdef PADDLE_WITH_MKLDNN +#include +#endif + namespace paddle { namespace operators { using framework::Tensor; From 49130f9b8f41cda0ac50e5c57f4b033c260c7541 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Tue, 4 Dec 2018 20:15:52 +0800 Subject: [PATCH 117/719] refine downpour sgd API and adapt to pslib proto desc --- paddle/fluid/framework/CMakeLists.txt | 2 +- .../paddle/fluid/distribute_lookup_table.py | 18 ++++++++ python/paddle/fluid/distributed/downpour.py | 19 +++++--- python/paddle/fluid/distributed/node.py | 45 +++++++++---------- 4 files changed, 52 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 9f5631b87c..8556dcbc36 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -180,7 +180,7 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph build_strategy fast_threaded_ssa_graph_executor variable_helper) -cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) +cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib) cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) diff --git a/python/paddle/fluid/distribute_lookup_table.py b/python/paddle/fluid/distribute_lookup_table.py index 52d9ce75f8..a903257fa9 100644 --- a/python/paddle/fluid/distribute_lookup_table.py +++ b/python/paddle/fluid/distribute_lookup_table.py @@ -15,6 +15,24 @@ LOOKUP_TABLE_TYPE = "lookup_table" +def find_distributed_lookup_table_inputs(program, table_name): + local_vars = program.current_block().vars + inputs = [] + for op in program.global_block().ops: + if op.type == LOOKUP_TABLE_TYPE: + if table_name == op.input("W")[0]: + inputs.extend([local_vars[name] for name in op.input("Ids")]) + return inputs + +def find_distributed_lookup_table_outputs(program, table_name): + local_vars = program.current_block().vars + outputs = [] + for op in program.global_block().ops: + if op.type == LOOKUP_TABLE_TYPE: + if table_name == op.input("W")[0]: + outputs.extend([local_vars[name] for name in op.output("Out")]) + return outputs + def find_distributed_lookup_table(program): """ Find distribute lookup table in program. diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 551a471495..3fe4afdbff 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -3,6 +3,8 @@ from .node import DownpourWorker from ..backward import append_backward import ps_pb2 as pslib from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table +from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_inputs +from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_outputs from google.protobuf import text_format class DownpourSGD(object): @@ -12,21 +14,24 @@ class DownpourSGD(object): self.window_ = window def minimize(self, loss, startup_program=None, - parameter_list=None, no_grad_set=None, - prefetch_slots=None, prefetch_slots_emb=None): + parameter_list=None, no_grad_set=None): params_grads = sorted(append_backward(loss), key=lambda x:x[0].name) table_name = find_distributed_lookup_table(loss.block.program) + prefetch_slots = find_distributed_lookup_table_inputs( + loss.block.program, table_name) + prefetch_slots_emb = find_distributed_lookup_table_outputs( + loss.block.program, table_name) server = DownpourServer() worker = DownpourWorker(self.window_) - server.add_sparse_table(0, learning_rate, + server.add_sparse_table(0, self.learning_rate_, prefetch_slots, prefetch_slots_emb) - server.add_dense_table(1, learning_rate, params, grads) - worker.add_sparse_table(0, learning_rate, + server.add_dense_table(1, self.learning_rate_, params_grads[0], params_grads[1]) + worker.add_sparse_table(0, self.learning_rate_, prefetch_slots, prefetch_slots_emb) - worker.add_dense_table(1, learning_rate, params, grads) + worker.add_dense_table(1, self.learning_rate_, params_grads[0], params_grads[1]) ps_param = pslib.PSParameter() ps_param.server_param.CopyFrom(server.get_desc()) #ps_param.worker_param.CopyFrom(worker.get_desc()) worker_skipped_ops = ["lookup_table", "lookup_table_grad"] ps_param_str = text_format.MessageToString(ps_param) - return [ps_param_str, worker_skipped_ops] + return [ps_param_str, worker_skipped_ops, text_format.MessageToString(worker.get_desc())] diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index 3344bba137..7c9a76efb6 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -16,25 +16,26 @@ class DownpourServer(Server): self.server_ = pslib.ServerParameter() def add_sparse_table(self, table_id, learning_rate, - slot_key, slot_value_var, slot_grad_var): - #table = self.server_.downpour_table_param.add() + slot_key_vars, slot_value_var): table = self.server_.downpour_server_param.downpour_table_param.add() table.table_id = table_id - table.type = PS_SPARSE_TABLE + table.type = pslib.PS_SPARSE_TABLE table.accessor.accessor_class = "DownpourFeatureValueAccessor" table.accessor.dense_sgd_param.adam.learning_rate = learning_rate - table.accessor.fea_dim = slot_value_var[0].shape[1] + table.accessor.fea_dim = abs(reduce(lambda x, y: x * y, + slot_value_var[0].shape, 1)) def add_dense_table(self, table_id, learning_rate, param_var, grad_var): - #table = self.server_.downpour_table_param.add() table = self.server_.downpour_server_param.downpour_table_param.add() table.table_id = table_id - table.type = PS_DENSE_TABLE + table.type = pslib.PS_DENSE_TABLE table.accessor.accessor_class = "DownpourDenseValueAccessor" table.accessor.sparse_sgd_param.learning_rate = learning_rate - table.accessor.fea_dim = 1 - #table.accessor.fea_dim = reduce(lambda x, y: x.shape, 1 for x in param_var) + fea_dim = 0 + for param in param_var: + fea_dim += reduce(lambda x, y: x * y, param.shape, 1) + table.accessor.fea_dim = fea_dim def get_desc(self): return self.server_ @@ -43,28 +44,24 @@ class DownpourServer(Server): class DownpourWorker(Worker): def __init__(self, window): self.window = window - #self.worker_ = pslib.WorkerParameter().downpour_worker_param - #self.worker_ = pslib.WorkerParameter() self.worker_ = pslib.DownpourTrainerParameter() - #self.worker_.pull_dense_per_batch = window - #self.worker_.push_dense_per_batch = window - #self.worker_.downpour_worker_param.pull_dense_per_batch = window - #self.worker_.downpour_worker_param.push_dense_per_batch = window self.worker_.pull_dense_per_batch = window self.worker_.push_dense_per_batch = window - print(self.worker_) - def add_sparse_table(self, table_id, - slot_keys, slot_value_vars, slot_grad_vars): - #table = self.worker_.sparse_table.add() - table = self.worker_.downpour_worker_param.sparse_table.add() + def add_sparse_table(self, table_id, learning_rate, + slot_key_vars, slot_value_vars): + table = self.worker_.sparse_table.add() table.table_id = table_id - table.slot.extend(slot_keys) - self.worker_.extend([grad.name for grad in slot_grad_vars]) + table.slot_key.extend( + [var.name for var in slot_key_vars]) + table.slot_value.extend( + [var.name for var in slot_value_vars]) + table.slot_gradient.extend( + [var.name + "@GRAD" for var in slot_value_vars]) - def add_dense_table(self, table_id, param_vars, grad_vars): - #table = self.worker_.dense_table.add() - table = self.worker_.downpour_worker_param.dense_table.add() + def add_dense_table(self, table_id, learning_rate, + param_vars, grad_vars): + table = self.worker_.dense_table.add() table.table_id = table_id table.dense_variable_name.extend([p.name for p in param_vars]) table.dense_gradient_variable_name.extend([g.name for g in grad_vars]) From 87eb8b0e28a4b95d8427bd66cbff33f107ad069e Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Tue, 4 Dec 2018 14:44:01 +0100 Subject: [PATCH 118/719] Set cpu only for MKL-DNN concat UTs test=develop --- python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py index 0ea44c0e4e..0f2130f904 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py @@ -22,6 +22,7 @@ class TestMKLDNNConcatOp(TestConcatOp): def setUp(self): super(TestMKLDNNConcatOp, self).setUp() self.attrs["use_mkldnn"] = True + self._cpu_only = True def test_check_grad(self): pass @@ -34,6 +35,7 @@ class TestMKLDNNConcatOp2(TestConcatOp2): def setUp(self): super(TestMKLDNNConcatOp2, self).setUp() self.attrs["use_mkldnn"] = True + self._cpu_only = True def test_check_grad(self): pass @@ -46,6 +48,7 @@ class TestMKLDNNConcatOp3(TestConcatOp3): def setUp(self): super(TestMKLDNNConcatOp3, self).setUp() self.attrs["use_mkldnn"] = True + self._cpu_only = True def test_check_grad(self): pass From 419506f510d258fa858c75a05cdcaa780105deca Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Tue, 4 Dec 2018 22:23:01 +0800 Subject: [PATCH 119/719] refine for compile pslib.so --- cmake/external/pslib.cmake | 2 +- cmake/external/pslib_brpc.cmake | 2 +- paddle/fluid/CMakeLists.txt | 1 + paddle/fluid/framework/CMakeLists.txt | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index 586f66d6fd..812af5efa2 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -66,7 +66,7 @@ ExternalProject_Add( CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT} ) -ADD_LIBRARY(pslib STATIC IMPORTED GLOBAL) +ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) LIST(APPEND external_project_dependencies pslib) diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake index 7b4beeae65..92019eef26 100644 --- a/cmake/external/pslib_brpc.cmake +++ b/cmake/external/pslib_brpc.cmake @@ -41,7 +41,7 @@ SET(PSLIB_BRPC_INSTALL_DIR ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR}) SET(PSLIB_BRPC_ROOT ${PSLIB_BRPC_INSTALL_DIR}) SET(PSLIB_BRPC_INC_DIR ${PSLIB_BRPC_ROOT}/include) SET(PSLIB_BRPC_LIB_DIR ${PSLIB_BRPC_ROOT}/lib) -SET(PSLIB_BRPC_LIB ${PSLIB_BRPC_LIB_DIR}/libps.so) +SET(PSLIB_BRPC_LIB ${PSLIB_BRPC_LIB_DIR}/libbrpc.a) SET(PSLIB_BRPC_IOMP_LIB ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) #todo what is this SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib") diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 6b526f0103..d980b36d9b 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) +#add_subdirectory(distributed) add_subdirectory(operators) add_subdirectory(string) add_subdirectory(recordio) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 8556dcbc36..6fdc73e93a 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -180,7 +180,7 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph build_strategy fast_threaded_ssa_graph_executor variable_helper) -cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib) +cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib) cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) From d0c8b9b9b350f774a7b195bf6c807b90b5f895f9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 12:00:28 +0000 Subject: [PATCH 120/719] remove timeout unittest test=develop --- paddle/fluid/framework/tensor.h | 2 +- .../test_eager_deletion_seresnext.py | 27 ------------------- 2 files changed, 1 insertion(+), 28 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 3a4c52410e..71e8badd4b 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -153,7 +153,7 @@ class Tensor { void set_layout(const DataLayout layout) { layout_ = layout; } - void clear() { holder_.reset(); } + void clear() { holder_ = nullptr; } const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py deleted file mode 100644 index 2dcdbdb8f1..0000000000 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import unittest -os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" - -from test_parallel_executor_seresnext import TestResnet - - -class EagerDeletionTestSEResNext(TestResnet): - pass - - -if __name__ == '__main__': - unittest.main() From 06213b798116f7aadb2ab95f83931c10b67a5942 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Tue, 4 Dec 2018 23:06:42 +0800 Subject: [PATCH 121/719] add hadoop helper function for distributed training --- python/paddle/fluid/async_executor.py | 9 +- .../paddle/fluid/distribute_lookup_table.py | 6 +- python/paddle/fluid/distributed/downpour.py | 45 +++++-- python/paddle/fluid/distributed/helper.py | 24 ++++ python/paddle/fluid/distributed/node.py | 1 - python/paddle/fluid/distributed/ps_pb2.py | 118 ++++++++++-------- 6 files changed, 134 insertions(+), 69 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 2945e6e143..c5863eb9e0 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -150,8 +150,13 @@ class AsyncExecutor(object): data_feed.desc(), filelist, thread_num, fetch_var_names, debug) - def config_ps(self, dist_desc, host_sign_list, node_num, index): - self.executor.config_pslib(dist_desc, host_sign_list, node_num, index) + def config_distributed_nodes(self, dist_opt): + # get total rank + # get rank index + # get iplists + # get hadoop info + return + def start_server(self): self.executor.start_server() diff --git a/python/paddle/fluid/distribute_lookup_table.py b/python/paddle/fluid/distribute_lookup_table.py index a903257fa9..243d806c41 100644 --- a/python/paddle/fluid/distribute_lookup_table.py +++ b/python/paddle/fluid/distribute_lookup_table.py @@ -21,7 +21,8 @@ def find_distributed_lookup_table_inputs(program, table_name): for op in program.global_block().ops: if op.type == LOOKUP_TABLE_TYPE: if table_name == op.input("W")[0]: - inputs.extend([local_vars[name] for name in op.input("Ids")]) + inputs.extend( + [local_vars[name] for name in op.input("Ids")]) return inputs def find_distributed_lookup_table_outputs(program, table_name): @@ -30,7 +31,8 @@ def find_distributed_lookup_table_outputs(program, table_name): for op in program.global_block().ops: if op.type == LOOKUP_TABLE_TYPE: if table_name == op.input("W")[0]: - outputs.extend([local_vars[name] for name in op.output("Out")]) + outputs.extend( + [local_vars[name] for name in op.output("Out")]) return outputs def find_distributed_lookup_table(program): diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 3fe4afdbff..093792d5d6 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -8,30 +8,57 @@ from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_o from google.protobuf import text_format class DownpourSGD(object): + """ + Distributed optimizer of downpour stochastic gradient descent + Standard implementation of Google's Downpour SGD + in Large Scale Distributed Deep Networks + + Args: + learning_rate (float): the learning rate used to update parameters. \ + Can be a float value + Examples: + .. code-block:: python + + downpour_sgd = fluid.distributed.DownpourSGD(learning_rate=0.2) + downpour_sgd.minimize(cost) + """ def __init__(self, learning_rate=0.001, window=1): - # todo(guru4elephant): if optimizer is not None, will warning here + # todo(guru4elephant): add more optimizers here as argument + # todo(guru4elephant): make learning_rate as a variable self.learning_rate_ = learning_rate self.window_ = window - + self.type = "downpour" + def minimize(self, loss, startup_program=None, parameter_list=None, no_grad_set=None): - params_grads = sorted(append_backward(loss), key=lambda x:x[0].name) + params_grads = sorted(append_backward( + loss, parameter_list, no_grad_set), key=lambda x:x[0].name) table_name = find_distributed_lookup_table(loss.block.program) prefetch_slots = find_distributed_lookup_table_inputs( loss.block.program, table_name) prefetch_slots_emb = find_distributed_lookup_table_outputs( loss.block.program, table_name) server = DownpourServer() + # window is communication strategy worker = DownpourWorker(self.window_) - server.add_sparse_table(0, self.learning_rate_, + # Todo(guru4elephant): support multiple tables definitions + # currently support one big sparse table + sparse_table_index = 0 + # currently merge all dense parameters into one dense table + dense_table_index = 1 + server.add_sparse_table(sparse_table_index, self.learning_rate_, prefetch_slots, prefetch_slots_emb) - server.add_dense_table(1, self.learning_rate_, params_grads[0], params_grads[1]) - worker.add_sparse_table(0, self.learning_rate_, + server.add_dense_table(dense_table_index, self.learning_rate_, + params_grads[0], params_grads[1]) + worker.add_sparse_table(sparse_table_index, self.learning_rate_, prefetch_slots, prefetch_slots_emb) - worker.add_dense_table(1, self.learning_rate_, params_grads[0], params_grads[1]) + worker.add_dense_table(dense_table_index, self.learning_rate_, + params_grads[0], params_grads[1]) ps_param = pslib.PSParameter() ps_param.server_param.CopyFrom(server.get_desc()) - #ps_param.worker_param.CopyFrom(worker.get_desc()) + ps_param.worker_param.CopyFrom(worker.get_desc()) + # Todo(guru4elephant): figure out how to support more sparse parameters + # currently only support lookup_table worker_skipped_ops = ["lookup_table", "lookup_table_grad"] ps_param_str = text_format.MessageToString(ps_param) - return [ps_param_str, worker_skipped_ops, text_format.MessageToString(worker.get_desc())] + return [ps_param_str, worker_skipped_ops] diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 8e079b1e8d..12e2f7f197 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -1,5 +1,27 @@ from mpi4py import MPI +class FileSystem(object): + def __init__(self, fs_type="afs", + uri="afs://tianqi.afs.baidu.com:9902", + user=None, + passwd=None, + hadoop_bin="", + afs_conf=None): + assert user not None + assert passwd not None + assert hadoop_bin not None + fs_client = pslib.FsClientParameter() + if fs_type == "afs": + fs_client.fs_type = pslib.FsApiType.AFS + else: + fs_client.fs_type = pslib.FsApiType.HDFS + fs_client.uri = uri + fs_client.user = user + fs_client.passwd = passwd + fs_client.buffer_size = 0 + fs_client.afs_conf = afs_conf if not afs_conf else "" + + class MPIHelper(object): def __init__(self): self.comm = MPI.COMM_WORLD @@ -18,3 +40,5 @@ class MPIHelper(object): def get_hostname(self): import socket return socket.gethostname() + + diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index 7c9a76efb6..b96a15a32f 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -12,7 +12,6 @@ class Worker(object): class DownpourServer(Server): def __init__(self): - #self.server_ = pslib.ServerParameter().downpour_server_param self.server_ = pslib.ServerParameter() def add_sparse_table(self, table_id, learning_rate, diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py index 355841aba8..0ef34d6e18 100644 --- a/python/paddle/fluid/distributed/ps_pb2.py +++ b/python/paddle/fluid/distributed/ps_pb2.py @@ -20,7 +20,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( name='ps.proto', package='paddle', syntax='proto2', - serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\xe4\x01\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xbc\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x02 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x03 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1c\n\x14pull_dense_per_batch\x18\x04 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x05 \x01(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\x91\x01\n\x16ServerServiceParameter\x12\x14\n\x0cserver_class\x18\x01 \x01(\t\x12\x14\n\x0c\x63lient_class\x18\x02 \x01(\t\x12\x15\n\rservice_class\x18\x03 \x01(\t\x12\x19\n\x11start_server_port\x18\x04 \x01(\r\x12\x19\n\x11server_thread_num\x18\x05 \x01(\r\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') + serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xbc\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1c\n\x14pull_dense_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\x91\x01\n\x16ServerServiceParameter\x12\x14\n\x0cserver_class\x18\x01 \x01(\t\x12\x14\n\x0c\x63lient_class\x18\x02 \x01(\t\x12\x15\n\rservice_class\x18\x03 \x01(\t\x12\x19\n\x11start_server_port\x18\x04 \x01(\r\x12\x19\n\x11server_thread_num\x18\x05 \x01(\r\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') ) _sym_db.RegisterFileDescriptor(DESCRIPTOR) @@ -41,8 +41,8 @@ _TABLETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3140, - serialized_end=3192, + serialized_start=3198, + serialized_end=3250, ) _sym_db.RegisterEnumDescriptor(_TABLETYPE) @@ -108,8 +108,8 @@ _PSCMDID = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3195, - serialized_end=3512, + serialized_start=3253, + serialized_end=3570, ) _sym_db.RegisterEnumDescriptor(_PSCMDID) @@ -148,8 +148,8 @@ _FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3108, - serialized_end=3138, + serialized_start=3166, + serialized_end=3196, ) _sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) @@ -197,7 +197,14 @@ _PSPARAMETER = _descriptor.Descriptor( is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( - name='fs_client_param', full_name='paddle.PSParameter.fs_client_param', index=5, + name='trainer_param', full_name='paddle.PSParameter.trainer_param', index=5, + number=301, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fs_client_param', full_name='paddle.PSParameter.fs_client_param', index=6, number=501, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, @@ -216,7 +223,7 @@ _PSPARAMETER = _descriptor.Descriptor( oneofs=[ ], serialized_start=21, - serialized_end=249, + serialized_end=307, ) @@ -246,8 +253,8 @@ _WORKERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=251, - serialized_end=332, + serialized_start=309, + serialized_end=390, ) @@ -277,8 +284,8 @@ _SERVERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=334, - serialized_end=415, + serialized_start=392, + serialized_end=473, ) @@ -308,8 +315,8 @@ _DOWNPOURWORKERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=417, - serialized_end=496, + serialized_start=475, + serialized_end=554, ) @@ -322,28 +329,28 @@ _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( fields=[ _descriptor.FieldDescriptor( name='dense_table', full_name='paddle.DownpourTrainerParameter.dense_table', index=0, - number=2, type=11, cpp_type=10, label=3, + number=1, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='sparse_table', full_name='paddle.DownpourTrainerParameter.sparse_table', index=1, - number=3, type=11, cpp_type=10, label=3, + number=2, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='pull_dense_per_batch', full_name='paddle.DownpourTrainerParameter.pull_dense_per_batch', index=2, - number=4, type=5, cpp_type=1, label=1, + number=3, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='push_dense_per_batch', full_name='paddle.DownpourTrainerParameter.push_dense_per_batch', index=3, - number=5, type=5, cpp_type=1, label=1, + number=4, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, @@ -360,8 +367,8 @@ _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=499, - serialized_end=687, + serialized_start=557, + serialized_end=745, ) @@ -412,8 +419,8 @@ _DENSETABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=689, - serialized_end=812, + serialized_start=747, + serialized_end=870, ) @@ -471,8 +478,8 @@ _SPARSETABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=814, - serialized_end=936, + serialized_start=872, + serialized_end=994, ) @@ -509,8 +516,8 @@ _DOWNPOURSERVERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=939, - serialized_end=1073, + serialized_start=997, + serialized_end=1131, ) @@ -568,8 +575,8 @@ _SERVERSERVICEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1076, - serialized_end=1221, + serialized_start=1134, + serialized_end=1279, ) @@ -634,8 +641,8 @@ _TABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1224, - serialized_end=1415, + serialized_start=1282, + serialized_end=1473, ) @@ -714,8 +721,8 @@ _TABLEACCESSORPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1418, - serialized_end=1787, + serialized_start=1476, + serialized_end=1845, ) @@ -787,8 +794,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1790, - serialized_end=1996, + serialized_start=1848, + serialized_end=2054, ) @@ -832,8 +839,8 @@ _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1998, - serialized_end=2081, + serialized_start=2056, + serialized_end=2139, ) @@ -891,8 +898,8 @@ _PSREQUESTMESSAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2083, - serialized_end=2184, + serialized_start=2141, + serialized_end=2242, ) @@ -943,8 +950,8 @@ _SPARSESGDRULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2186, - serialized_end=2305, + serialized_start=2244, + serialized_end=2363, ) @@ -1002,8 +1009,8 @@ _DENSESGDRULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2308, - serialized_end=2533, + serialized_start=2366, + serialized_end=2591, ) @@ -1061,8 +1068,8 @@ _ADAMSGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2536, - serialized_end=2670, + serialized_start=2594, + serialized_end=2728, ) @@ -1099,8 +1106,8 @@ _NAIVESGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2672, - serialized_end=2738, + serialized_start=2730, + serialized_end=2796, ) @@ -1130,8 +1137,8 @@ _SUMMARYSGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2740, - serialized_end=2799, + serialized_start=2798, + serialized_end=2857, ) @@ -1161,8 +1168,8 @@ _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2801, - serialized_end=2847, + serialized_start=2859, + serialized_end=2905, ) @@ -1206,8 +1213,8 @@ _PSRESPONSEMESSAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2849, - serialized_end=2922, + serialized_start=2907, + serialized_end=2980, ) @@ -1280,12 +1287,13 @@ _FSCLIENTPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2925, - serialized_end=3138, + serialized_start=2983, + serialized_end=3196, ) _PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER _PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER +_PSPARAMETER.fields_by_name['trainer_param'].message_type = _DOWNPOURTRAINERPARAMETER _PSPARAMETER.fields_by_name['fs_client_param'].message_type = _FSCLIENTPARAMETER _WORKERPARAMETER.fields_by_name['downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER _SERVERPARAMETER.fields_by_name['downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER From 04539d4c5de668f46edade83e3b2c8df0e7ac257 Mon Sep 17 00:00:00 2001 From: chengduo Date: Wed, 5 Dec 2018 11:05:25 +0800 Subject: [PATCH 122/719] Fix clip.py (#14718) * expose square test=develop * fix activation test=develop * Add square API test=develop * add necessary op * code refine * fix API.spec test=develop * fix unit test test=develop * add unit test sparse_grad_clip test=develop * fix API.spec test=develop * remove mac test for test_gradient_clip test=develop * remove selectedrows_mul_tensor test=develop --- paddle/fluid/API.spec | 2 + paddle/fluid/operators/activation_op.cc | 4 +- paddle/fluid/operators/activation_op.h | 113 ++++++++++-- .../elementwise/elementwise_mul_op.h | 32 +++- .../operators/elementwise/elementwise_op.h | 31 ++-- .../get_tensor_from_selected_rows_op.cc | 117 +++++++++++++ .../fluid/operators/merge_selected_rows_op.cc | 72 ++++++++ .../operators/merge_selected_rows_op.cu.cc | 23 +++ .../fluid/operators/merge_selected_rows_op.h | 36 ++++ python/paddle/fluid/clip.py | 8 +- python/paddle/fluid/layers/nn.py | 48 ++++++ .../paddle/fluid/tests/test_gradient_clip.py | 84 --------- .../fluid/tests/unittests/CMakeLists.txt | 3 +- .../test_get_tensor_from_selected_rows_op.py | 65 +++++++ .../tests/unittests/test_gradient_clip.py | 162 ++++++++++++++++++ .../unittests/test_merge_selectedrows_op.py | 73 ++++++++ 16 files changed, 751 insertions(+), 122 deletions(-) create mode 100644 paddle/fluid/operators/get_tensor_from_selected_rows_op.cc create mode 100644 paddle/fluid/operators/merge_selected_rows_op.cc create mode 100644 paddle/fluid/operators/merge_selected_rows_op.cu.cc create mode 100644 paddle/fluid/operators/merge_selected_rows_op.h delete mode 100644 python/paddle/fluid/tests/test_gradient_clip.py create mode 100644 python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_gradient_clip.py create mode 100644 python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 9f9dbf0410..2722ea078e 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -194,6 +194,8 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 832245371e..9c5b8604f4 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -76,8 +76,8 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, } #endif return framework::OpKernelType( - framework::ToDataType(ctx.Input(name)->type()), - ctx.GetPlace(), layout, library); + framework::GetDataTypeOfVar(ctx.InputVar(name)), ctx.GetPlace(), layout, + library); } class ActivationOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index a0f8c5c14c..87d549678a 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -41,6 +41,12 @@ static std::unordered_set InplaceOpSet = { "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid", }; +/* The following operator can be used to process SelectedRows, because the + * output of those operator for zero is zero too. + */ +static std::unordered_set CanBeUsedBySelectedRows = { + "abs", "abs_grad", "square", "square_grad", "sqrt", "sqrt_grad"}; + static bool IsInplace(std::string op) { return InplaceOpSet.count(op); } template @@ -50,16 +56,38 @@ class ActivationKernel using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - auto& X = detail::Ref(context.Input("X"), - "Cannot get input tensor X, variable name = %s", - context.op().Input("X")); - - auto& Out = detail::Ref(context.Output("Out"), - "Cannot get output tensor Out, variable name = %s", - context.op().Output("Out")); - Out.mutable_data(context.GetPlace()); + auto x_var = context.InputVar("X"); + auto out_var = context.OutputVar("Out"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input Variable X, variable name = %s", + context.op().Input("X")); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get output Variable Out, variable name = %s", + context.op().Output("Out")); + + framework::Tensor X, *Out; + + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + X = detail::Ref( + paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var), + "Cannot get input Tensor X, variable name = %s", + context.op().Input("X")); + Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + out_var); + } else { + X = detail::Ref(context.Input("X"), + "Cannot get input Tensor X, variable name = %s", + context.op().Input("X")); + Out = context.Output("Out"); + } + + PADDLE_ENFORCE(Out != nullptr, + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + + Out->mutable_data(context.GetPlace()); auto x = framework::EigenVector::Flatten(X); - auto out = framework::EigenVector::Flatten(Out); + auto out = framework::EigenVector::Flatten(*Out); auto* place = context.template device_context().eigen_device(); Functor functor; @@ -78,14 +106,54 @@ class ActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = - context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); + auto out_var = context.InputVar("Out"); + auto out_grad_var = context.InputVar(framework::GradVarName("Out")); + auto x_grad_var = context.OutputVar(framework::GradVarName("X")); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get input Variable Out, variable name = %s", + context.op().Input("Out")); + PADDLE_ENFORCE(out_grad_var != nullptr, + "Cannot get input Variable %s, variable name = %s", + framework::GradVarName("Out"), + context.op().Input(framework::GradVarName("Out"))); + PADDLE_ENFORCE(x_grad_var != nullptr, + "Cannot get output Variable %s, variable name = %s", + framework::GradVarName("X"), + context.op().Output(framework::GradVarName("X"))); + + framework::Tensor Out, dOut, *dX; + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + Out = detail::Ref( + paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var), + "Cannot get input Tensor Out, variable name = %s", + context.op().Input("Out")); + dOut = + detail::Ref(paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar( + *out_grad_var), + "Cannot get input Tensor %s, variable name = %s", + framework::GradVarName("Out"), + context.op().Input(framework::GradVarName("Out"))); + dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + x_grad_var); + } else { + Out = detail::Ref(context.Input("Out"), + "Cannot get input Tensor Out, variable name = %s", + context.op().Input("Out")); + dOut = detail::Ref( + context.Input(framework::GradVarName("Out")), + "Cannot get input Tensor %s, variable name = %s", + framework::GradVarName("Out"), + context.op().Input(framework::GradVarName("Out"))); + dX = context.Output(framework::GradVarName("X")); + } + PADDLE_ENFORCE(dX != nullptr, + "Cannot get output tensor %s, variable name = %s", + framework::GradVarName("X"), + context.op().Output(framework::GradVarName("X"))); dX->mutable_data(context.GetPlace()); - auto dout = framework::EigenVector::Flatten(*dOut); - auto out = framework::EigenVector::Flatten(*Out); + auto dout = framework::EigenVector::Flatten(dOut); + auto out = framework::EigenVector::Flatten(Out); auto dx = framework::EigenVector::Flatten(*dX); auto* place = context.template device_context().eigen_device(); @@ -96,8 +164,19 @@ class ActivationGradKernel } bool inplace = functor.Inplace(); if (!inplace) { - auto* X = context.Input("X"); - auto x = framework::EigenVector::Flatten(*X); + auto x_var = context.InputVar("X"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + framework::Tensor X; + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + X = detail::Ref( + paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var)); + } else { + X = detail::Ref(context.Input("X")); + } + + auto x = framework::EigenVector::Flatten(X); functor(*place, x, out, dout, dx); } else { VLOG(10) << " Inplace activation "; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index dc25bc5710..a8b8a67a11 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -60,15 +60,37 @@ template class ElementwiseMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto x_var = ctx.InputVar("X"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input Variable X, variable name = %s", + ctx.op().Input("X")); auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); + + framework::Tensor x, *z; + if (x_var->IsType()) { + PADDLE_ENFORCE(y->dims().size() == 1 && y->dims()[0] == 1, + "For elementwise_op, if X is Sparse, Y must be scalar."); + auto& x_sele = x_var->Get(); + auto out_sele = ctx.Output("Out"); + x = x_sele.value(); + out_sele->set_rows(x_sele.rows()); + out_sele->set_height(x_sele.height()); + out_sele->mutable_value()->Resize(x_sele.value().dims()); + out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type()); + z = ctx.Output("Out")->mutable_value(); + } else if (x_var->IsType()) { + x = x_var->Get(); + z = ctx.Output("Out"); + } else { + PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", + x_var->Type().name()); + } z->mutable_data(ctx.GetPlace()); - if (x->numel() == y->numel()) { - elementwise_mul(ctx, x, y, z); + if (x.numel() == y->numel()) { + elementwise_mul(ctx, &x, y, z); } else { - default_elementwise_mul(ctx, x, y, z); + default_elementwise_mul(ctx, &x, y, z); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 85a7817be9..87bf7c6b15 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -40,21 +40,28 @@ class ElementwiseOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of elementwise op should not be null."); - PADDLE_ENFORCE( - ctx->GetInputsVarType("X").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front()); PADDLE_ENFORCE( ctx->GetInputsVarType("Y").front() == framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Y").front(), ctx->GetInputsVarType("Y").front()); - - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input."); + "The input var's type should be LoDTensor, but the received is %s [%s]", + ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front()); + + if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::LOD_TENSOR) { + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input."); + } else if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::SELECTED_ROWS) { + PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && + (ctx->GetInputDim("Y")[0] == 1), + "For elementwise_op, if X is Sparse, " + "Y must be scalar."); + } else { + PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", + ctx->GetInputsVarType("X").front()); + } ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc new file mode 100644 index 0000000000..a4ae19d9c1 --- /dev/null +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace operators { + +class GetTensorFromSelectedRowsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "GetTensorFromSelectedRowsOp must has input X."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "GetTensorFromSelectedRowsOp must has output Out."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("X").front() == + framework::proto::VarType::SELECTED_ROWS, + "The input X's type should be SelectedRows, but the received is %s", + ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front()); + PADDLE_ENFORCE( + ctx->GetOutputsVarType("Out").front() == + framework::proto::VarType::LOD_TENSOR, + "The output Out's type should be LoDTensor, but the received is %s", + ctx->Outputs("Out").front(), ctx->GetOutputsVarType("Out").front()); + + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::GetDataTypeOfVar(ctx.InputVar("X")), ctx.device_context()); + } +}; + +class GetTensorFromSelectedRowsKernel { + public: + void operator()(const framework::ExecutionContext &ctx) const { + auto *x = ctx.Input("X"); + auto *out = ctx.Output("Out"); + + out->Resize(x->value().dims()); + out->mutable_data(ctx.GetPlace(), x->value().type()); + framework::TensorCopy(x->value(), ctx.GetPlace(), ctx.device_context(), + out); + } +}; + +class GetTensorFromSelectedRowsOpProtoMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input type is SelectedRows."); + AddOutput("Out", "The output type is LoDTensor."); + AddComment( + R"DOC( +GetTensorFromSelectedRows Operator + +GetTensorFromSelectedRows is used to get the tensor from SelectedRows. + +)DOC"); + } +}; + +class GetTensorFromSelectedRowsOpVarTypeInference + : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const final { + auto out_var_name = op_desc.Output("Out").front(); + auto in_var_name = op_desc.Input("X").front(); + + auto out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto in_var = block->FindRecursiveOrCreateVar(in_var_name); + out_var.SetType(framework::proto::VarType::LOD_TENSOR); + out_var.SetDataType(in_var.GetDataType()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(get_tensor_from_selected_rows, + ops::GetTensorFromSelectedRowsOp, + ops::GetTensorFromSelectedRowsOpProtoMaker, + ops::GetTensorFromSelectedRowsOpVarTypeInference); + +REGISTER_OP_CPU_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float, + ops::GetTensorFromSelectedRowsKernel, double, + ops::GetTensorFromSelectedRowsKernel, int, + ops::GetTensorFromSelectedRowsKernel, int64_t, + ops::GetTensorFromSelectedRowsKernel); + +#ifdef PADDLE_WITH_CUDA +REGISTER_OP_CUDA_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float, + ops::GetTensorFromSelectedRowsKernel, double, + ops::GetTensorFromSelectedRowsKernel, int, + ops::GetTensorFromSelectedRowsKernel, int64_t, + ops::GetTensorFromSelectedRowsKernel); +#endif diff --git a/paddle/fluid/operators/merge_selected_rows_op.cc b/paddle/fluid/operators/merge_selected_rows_op.cc new file mode 100644 index 0000000000..3c15c83955 --- /dev/null +++ b/paddle/fluid/operators/merge_selected_rows_op.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/merge_selected_rows_op.h" + +namespace paddle { +namespace operators { + +class MergeSelectedRowsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of MergeSelectedRowsOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MergeSelectedRowsOp should not be null."); + ctx->ShareDim("X", /*->*/ "Out"); + } +}; + +class MergeSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input type is SelectedRows, and the selected rows may be " + "duplicated."); + AddOutput("Out", + "The output type is SelectedRows, and the selected rows are not " + "duplicated."); + AddComment( + R"DOC( +MergeSelectedRows Operator. + +MergeSelectedRows is used to merge the duplicated rows of the input. +)DOC"); + } +}; + +class MergeSelectedRowsOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Out"}}; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OPERATOR(merge_selected_rows, ops::MergeSelectedRowsOp, + ops::MergeSelectedRowsOpMaker, + ops::MergeSelectedRowsOpInferVarType); + +REGISTER_OP_CPU_KERNEL( + merge_selected_rows, + ops::MergeSelectedRowsKernel, + ops::MergeSelectedRowsKernel); diff --git a/paddle/fluid/operators/merge_selected_rows_op.cu.cc b/paddle/fluid/operators/merge_selected_rows_op.cu.cc new file mode 100644 index 0000000000..90d5fb3eae --- /dev/null +++ b/paddle/fluid/operators/merge_selected_rows_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/merge_selected_rows_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + merge_selected_rows, + ops::MergeSelectedRowsKernel, + ops::MergeSelectedRowsKernel); diff --git a/paddle/fluid/operators/merge_selected_rows_op.h b/paddle/fluid/operators/merge_selected_rows_op.h new file mode 100644 index 0000000000..4c977e94b1 --- /dev/null +++ b/paddle/fluid/operators/merge_selected_rows_op.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +template +class MergeSelectedRowsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + + math::scatter::MergeAdd merge_func; + merge_func(context.template device_context(), *x, out); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 5b8ff0514e..0f7dd531b3 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -271,7 +271,12 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): "All parameters' 'clip_norm' of a same group should be the same" ) - square = grad * grad + merge_grad = grad + if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.merge_selected_rows(grad) + merge_grad = layers.get_tensor_from_selected_rows(merge_grad) + + square = layers.square(merge_grad) local_norm_var = layers.reduce_sum(input=square) context[self.group_name].append(local_norm_var) @@ -292,6 +297,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): new_grad = layers.elementwise_mul( x=grad, y=self.context[group_scale_name]) + return param, new_grad diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 28b8ae895a..4833212d31 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -169,6 +169,8 @@ __all__ = [ 'log_loss', 'add_position_encoding', 'bilinear_tensor_product', + 'merge_selected_rows', + 'get_tensor_from_selected_rows', 'lstm', ] @@ -8382,6 +8384,29 @@ def mean(x, name=None): return out +@templatedoc() +def merge_selected_rows(x, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper("merge_selected_rows", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="merge_selected_rows", + inputs={"X": x}, + attrs={}, + outputs={"Out": out}) + return out + + @templatedoc() def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): """ @@ -9034,3 +9059,26 @@ def bilinear_tensor_product(x, # add activation return helper.append_activation(out) + + +@templatedoc() +def get_tensor_from_selected_rows(x, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper('get_tensor_from_selected_rows', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='get_tensor_from_selected_rows', + inputs={'X': x}, + outputs={'Out': out}, + attrs={}) + return out diff --git a/python/paddle/fluid/tests/test_gradient_clip.py b/python/paddle/fluid/tests/test_gradient_clip.py deleted file mode 100644 index 266687fcd0..0000000000 --- a/python/paddle/fluid/tests/test_gradient_clip.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy as np -import paddle -import paddle.fluid as fluid - -BATCH_SIZE = 128 -CLIP = 1 - -prog = fluid.framework.Program() -with fluid.program_guard(main_program=prog): - image = fluid.layers.data(name='x', shape=[784], dtype='float32') - - hidden1 = fluid.layers.fc(input=image, size=128, act='relu') - hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') - predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') - - label = fluid.layers.data(name='y', shape=[1], dtype='int64') - - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) - -prog_clip = prog.clone() - -avg_cost_clip = prog_clip.block(0).var(avg_cost.name) - -p_g = fluid.backward.append_backward(loss=avg_cost) -p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip) - -with fluid.program_guard(main_program=prog_clip): - fluid.clip.set_gradient_clip( - fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP)) - p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip) - -grad_list = [elem[1] for elem in p_g] -grad_clip_list = [elem[1] for elem in p_g_clip] - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=8192), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) -feeder = fluid.DataFeeder(feed_list=[image, label], place=place) -exe.run(fluid.default_startup_program()) - -count = 0 -for data in train_reader(): - count += 1 - if count > 5: - break - out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list) - out_clip = exe.run(prog_clip, - feed=feeder.feed(data), - fetch_list=grad_clip_list) - global_norm = 0 - for v in out[1:]: - global_norm += np.sum(np.power(v, 2)) - global_norm = np.sqrt(global_norm) - - global_norm_clip = 0 - for v in out_clip[1:]: - global_norm_clip += np.sum(np.power(v, 2)) - global_norm_clip = np.sqrt(global_norm_clip) - - if not np.isclose( - a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3): - exit(1) -exit(0) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 26035f303e..65f80d368c 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -43,13 +43,14 @@ if(APPLE) list(REMOVE_ITEM TEST_OPS test_desc_clone) list(REMOVE_ITEM TEST_OPS test_program_code) endif(NOT WITH_DISTRIBUTE) - message(WARNING "These tests has been disabled in OSX before being fixed: \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext") + message(WARNING "These tests has been disabled in OSX before being fixed: \n test_gradient_clip \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext") # this op is not support on mac list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) # TODO: add the unitest back when it fixed list(REMOVE_ITEM TEST_OPS test_detection_map_op) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass) + list(REMOVE_ITEM TEST_OPS test_gradient_clip) endif() if(NOT WITH_MKLML) # this op is not support on openblas diff --git a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py new file mode 100644 index 0000000000..021b950b3b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py @@ -0,0 +1,65 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid.core as core +import numpy as np +from paddle.fluid.op import Operator + + +class TestGetTensorFromSelectedRows(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def check_with_place(self, place): + scope = core.Scope() + x_rows = [0, 5, 5, 4, 20] + height = 20 + row_numel = 2 + + np_array = np.ones((len(x_rows), row_numel)).astype("float32") + np_array[1, :] = 2.0 + np_array[2, :] = 3.0 + np_array[3, :] = 4.0 + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(x_rows) + x.set_height(height) + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + # initialize input variable Out + out = scope.var("Out").get_tensor() + + op = Operator("get_tensor_from_selected_rows", X="X", Out="Out") + + op.run(scope, place) + + out_array = np.array(out) + self.assertEqual((5, 2), out_array.shape) + assert (out_array == np_array).all() + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py new file mode 100644 index 0000000000..e4b3168ba6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -0,0 +1,162 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid + +BATCH_SIZE = 128 +CLIP = 1 + + +def bow_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=True, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestGradientClip(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + self.BATCH_SIZE = 2 + self.train_data = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), + batch_size=self.BATCH_SIZE) + + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def check_operators(self, place): + prog = fluid.framework.Program() + startup_program = fluid.framework.Program() + with fluid.program_guard( + main_program=prog, startup_program=startup_program): + image = fluid.layers.data(name='x', shape=[784], dtype='float32') + label = fluid.layers.data(name='y', shape=[1], dtype='int64') + + hidden1 = fluid.layers.fc(input=image, size=128, act='relu') + hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') + predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') + + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + + prog_clip = prog.clone() + + avg_cost_clip = prog_clip.block(0).var(avg_cost.name) + + p_g = fluid.backward.append_backward(loss=avg_cost) + p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip) + + with fluid.program_guard(main_program=prog_clip): + fluid.clip.set_gradient_clip( + fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP)) + p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip) + + grad_list = [elem[1] for elem in p_g] + grad_clip_list = [elem[1] for elem in p_g_clip] + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=8192), + batch_size=BATCH_SIZE) + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[image, label], place=place) + exe.run(startup_program) + + count = 0 + for data in train_reader(): + count += 1 + if count > 5: + break + out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list) + out_clip = exe.run(prog_clip, + feed=feeder.feed(data), + fetch_list=grad_clip_list) + global_norm = 0 + for v in out[1:]: + global_norm += np.sum(np.power(v, 2)) + global_norm = np.sqrt(global_norm) + + global_norm_clip = 0 + for v in out_clip[1:]: + global_norm_clip += np.sum(np.power(v, 2)) + global_norm_clip = np.sqrt(global_norm_clip) + + assert np.isclose( + a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3) + + def check_sparse_gradient_clip(self, place): + prog = fluid.framework.Program() + startup_program = fluid.framework.Program() + with fluid.program_guard( + main_program=prog, startup_program=startup_program): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost = bow_net(data, label, len(self.word_dict)) + + fluid.clip.set_gradient_clip( + clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) + sgd_optimizer.minimize(cost) + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + exe.run(startup_program) + + data = next(self.train_data()) + val = exe.run(prog, feed=feeder.feed(data), fetch_list=[cost])[0] + self.assertEqual((1, ), val.shape) + print(val) + self.assertFalse(np.isnan(val)) + + def test_operators(self): + self.check_operators(core.CPUPlace()) + + def test_sparse_gradient_clip(self): + for place in self.get_places(): + self.check_sparse_gradient_clip(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py new file mode 100644 index 0000000000..ce64da0478 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py @@ -0,0 +1,73 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid.core as core +import numpy as np +from paddle.fluid.op import Operator + + +class TestMergeSelectedRows(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def check_with_place(self, place): + scope = core.Scope() + x_rows = [0, 5, 5, 4, 20] + out_rows = [0, 4, 5, 20] + height = 20 + row_numel = 2 + + np_array = np.ones((len(x_rows), row_numel)).astype("float32") + np_array[1, :] = 2.0 + np_array[2, :] = 3.0 + np_array[3, :] = 4.0 + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(x_rows) + x.set_height(height) + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + # initialize input variable Out + out = scope.var("Out").get_selected_rows() + + op = Operator("merge_selected_rows", X="X", Out="Out") + + op.run(scope, place) + + self.assertEqual(out.rows(), out_rows) + self.assertEqual(out.height(), height) + + out_array = np.array(out.get_tensor()) + self.assertEqual((4, 2), out_array.shape) + + assert (out_array[0, :] == 1.0).all() + assert (out_array[1, :] == 4.0).all() + assert (out_array[2, :] == 5.0).all() + assert (out_array[3, :] == 1.0).all() + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + +if __name__ == "__main__": + unittest.main() From 8daf67f90f7001eaf50bd707a90673b9c5af38c1 Mon Sep 17 00:00:00 2001 From: liuhongyu Date: Wed, 5 Dec 2018 11:46:36 +0800 Subject: [PATCH 123/719] fix bugs; test=develop --- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 3 +-- paddle/fluid/platform/dynload/cudnn.h | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 2c9800b886..76e8413001 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -180,8 +180,7 @@ struct CudnnRNNCache { #if CUDNN_VERSION >= 6000 CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( - handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, - CUDNN_LINEAR_INPUT, + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); #else diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index d18030dd76..550fe2edee 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -125,7 +125,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnRNNBackwardWeights); \ __macro(cudnnRNNForwardInference); \ __macro(cudnnDestroyDropoutDescriptor); \ - __macro(cudnnDestroyRNNDescriptor); + __macro(cudnnDestroyRNNDescriptor); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) @@ -166,8 +166,7 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) // APIs in R6 #if CUDNN_VERSION >= 6000 -#define CUDNN_DNN_ROUTINE_EACH_R6(__macro) \ - __macro(cudnnSetRNNDescriptor_v6); +#define CUDNN_DNN_ROUTINE_EACH_R6(__macro) __macro(cudnnSetRNNDescriptor_v6); CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif From e2011f13536a5fe885c63c0a470b1913b2a22b93 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Wed, 5 Dec 2018 12:23:52 +0800 Subject: [PATCH 124/719] test dist ut fixes test=develop (#14706) * test dist ut fixes test=develop * fix cmake * for test --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 7 +++---- python/paddle/fluid/tests/unittests/test_dist_base.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 65f80d368c..61cfdb80af 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -96,13 +96,12 @@ if(WITH_DISTRIBUTE) if(NOT APPLE) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) + py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) + set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) # FIXME(typhoonzero): add these tests back - # py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) - # set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) # py_test_modules(test_dist_transformer MODULES test_dist_transformer) # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) - # TODO(typhoonzero): make dist test parallel when fix port management issue - set_tests_properties(test_dist_mnist test_dist_word2vec test_dist_ctr test_dist_simnet_bow test_dist_save_load test_dist_text_classification test_dist_mnist_batch_merge PROPERTIES RUN_SERIAL TRUE) + set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE) endif(NOT APPLE) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) endif() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 97e7ee6229..160969c63f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -291,8 +291,8 @@ class TestDistBase(unittest.TestCase): if check_error_log: err_log.close() - sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out)) sys.stderr.write('local_stderr: %s\n' % local_err) + sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out)) return pickle.loads(local_out) From 41c28d54c65ba96694bde28b6862c8028b28f612 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 5 Dec 2018 13:04:10 +0800 Subject: [PATCH 125/719] allow customize kernel selection test=develop --- cmake/operators.cmake | 2 + paddle/fluid/framework/CMakeLists.txt | 5 +- paddle/fluid/framework/op_kernel_type.cc | 54 ++++++++++ paddle/fluid/framework/op_kernel_type.h | 59 +++++----- paddle/fluid/framework/op_registry.h | 130 +++++++++++++++-------- paddle/fluid/framework/operator_test.cc | 46 +++++++- paddle/fluid/operators/conv_mkldnn_op.cc | 14 ++- paddle/fluid/operators/conv_op.cc | 5 +- paddle/fluid/operators/conv_op.h | 2 + 9 files changed, 232 insertions(+), 85 deletions(-) create mode 100644 paddle/fluid/framework/op_kernel_type.cc diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 89726bf985..2ced43f9e6 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -166,6 +166,8 @@ function(op_library TARGET) # Append first implemented MKLDNN activation operator if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") + elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") else() file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") endif() diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index c701a2ad63..e4c471d86b 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -118,8 +118,9 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context) +cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler transfer_scope_cache) + shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -191,7 +192,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) -cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) +cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc new file mode 100644 index 0000000000..6d4801e4a0 --- /dev/null +++ b/paddle/fluid/framework/op_kernel_type.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_kernel_type.h" + +namespace paddle { +namespace framework { + +size_t OpKernelType::Hash::operator()(const OpKernelType& key) const { + int cur_loc = 0; + + int place = key.place_.which(); + cur_loc += OpKernelType::kPlaceBits; + + int data_type = static_cast(key.data_type_) << cur_loc; + cur_loc += OpKernelType::kPrimaryDTypeBits; + + int data_layout = static_cast(key.data_layout_) << cur_loc; + cur_loc += OpKernelType::kLayoutBits; + + int library_type = static_cast(key.library_type_) << cur_loc; + cur_loc += OpKernelType::kLibBits; + + int customized_value = key.customized_type_value_; + PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits)); + customized_value = customized_value << cur_loc; + cur_loc += OpKernelType::kCustomizeBits; + PADDLE_ENFORCE(cur_loc < 64); + + std::hash hasher; + return hasher(place + data_type + data_layout + library_type + + customized_value); +} + +bool OpKernelType::operator==(const OpKernelType& o) const { + return platform::places_are_same_class(place_, o.place_) && + data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && + library_type_ == o.library_type_ && + customized_type_value_ == o.customized_type_value_; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index ac03302189..9edc1a3e15 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -24,54 +24,55 @@ limitations under the License. */ namespace paddle { namespace framework { -struct OpKernelType { - struct Hash { - size_t operator()(const OpKernelType& key) const { - int place = key.place_.which(); - int data_type = static_cast(key.data_type_) << LEFT_SHIFT; - int data_layout = static_cast(key.data_layout_) << (LEFT_SHIFT * 2); - int library_type = static_cast(key.library_type_) - << (LEFT_SHIFT * 3); - - std::hash hasher; - return hasher(place + data_type + data_layout + library_type); - } - }; +class OpKernelType { + public: + constexpr static int kDefaultCustomizedTypeValue = 0; - // place, data_type, library_type kinds less than 2^8 - constexpr static int LEFT_SHIFT = 8; - - proto::VarType::Type data_type_; - DataLayout data_layout_; - platform::Place place_; - LibraryType library_type_; + // In total should be smaller than 64. + constexpr static int kPlaceBits = 4; + constexpr static int kPrimaryDTypeBits = 8; + constexpr static int kLayoutBits = 4; + constexpr static int kLibBits = 4; + constexpr static int kCustomizeBits = 4; OpKernelType(proto::VarType::Type data_type, platform::Place place, DataLayout data_layout = DataLayout::kAnyLayout, - LibraryType library_type = LibraryType::kPlain) + LibraryType library_type = LibraryType::kPlain, + int customized_type_value = kDefaultCustomizedTypeValue) : data_type_(data_type), data_layout_(data_layout), place_(place), - library_type_(library_type) {} + library_type_(library_type), + customized_type_value_(customized_type_value) {} OpKernelType(proto::VarType::Type data_type, const platform::DeviceContext& dev_ctx, DataLayout data_layout = DataLayout::kAnyLayout, - LibraryType library_type = LibraryType::kPlain) + LibraryType library_type = LibraryType::kPlain, + int customized_type_value = kDefaultCustomizedTypeValue) : data_type_(data_type), data_layout_(data_layout), place_(dev_ctx.GetPlace()), - library_type_(library_type) {} + library_type_(library_type), + customized_type_value_(customized_type_value) {} + + virtual ~OpKernelType() {} + + struct Hash { + size_t operator()(const OpKernelType& key) const; + }; size_t hash_key() const { return Hash()(*this); } - bool operator==(const OpKernelType& o) const { - return platform::places_are_same_class(place_, o.place_) && - data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && - library_type_ == o.library_type_; - } + bool operator==(const OpKernelType& o) const; bool operator!=(const OpKernelType& o) const { return !(*this == o); } + + proto::VarType::Type data_type_; + DataLayout data_layout_; + platform::Place place_; + LibraryType library_type_; + int customized_type_value_; }; inline std::ostream& operator<<(std::ostream& os, diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 0e6e74293c..36673e48c2 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -35,6 +35,7 @@ limitations under the License. */ namespace paddle { namespace framework { + class Registrar { public: // In our design, various kinds of classes, e.g., operators and kernels, @@ -78,7 +79,7 @@ struct OpKernelRegistrarFunctor; template inline void RegisterKernelClass(const char* op_type, const char* library_type, - Func func) { + int customized_type_value, Func func) { std::string library(library_type); std::string data_layout = "ANYLAYOUT"; if (library == "MKLDNN") { @@ -86,7 +87,7 @@ inline void RegisterKernelClass(const char* op_type, const char* library_type, } OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), StringToDataLayout(data_layout), - StringToLibraryType(library_type)); + StringToLibraryType(library_type), customized_type_value); OperatorWithKernel::AllOpKernels()[op_type][key] = func; } @@ -95,22 +96,26 @@ struct OpKernelRegistrarFunctor { using KERNEL_TYPE = typename std::tuple_element>::type; - void operator()(const char* op_type, const char* library_type) const { + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const { using T = typename KERNEL_TYPE::ELEMENT_TYPE; RegisterKernelClass( - op_type, library_type, [](const framework::ExecutionContext& ctx) { + op_type, library_type, customized_type_value, + + [](const framework::ExecutionContext& ctx) { KERNEL_TYPE().Compute(ctx); }); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctor func; - func(op_type, library_type); + func(op_type, library_type, customized_type_value); } }; template struct OpKernelRegistrarFunctor { - void operator()(const char* op_type, const char* library_type) const {} + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const {} }; // User can register many kernel in one place. The data type could be @@ -118,9 +123,10 @@ struct OpKernelRegistrarFunctor { template class OpKernelRegistrar : public Registrar { public: - explicit OpKernelRegistrar(const char* op_type, const char* library_type) { + explicit OpKernelRegistrar(const char* op_type, const char* library_type, + int customized_type_value) { OpKernelRegistrarFunctor func; - func(op_type, library_type); + func(op_type, library_type, customized_type_value); } }; @@ -130,17 +136,19 @@ struct OpKernelRegistrarFunctorEx; template class OpKernelRegistrarEx : public Registrar { public: - explicit OpKernelRegistrarEx(const char* op_type, const char* library_type) { + explicit OpKernelRegistrarEx(const char* op_type, const char* library_type, + int customized_type_value) { OpKernelRegistrarFunctorEx func; - func(op_type, library_type); + func(op_type, library_type, customized_type_value); } }; template struct OpKernelRegistrarFunctorEx { - void operator()(const char* op_type, const char* library_type) const {} + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const {} }; template @@ -153,18 +161,21 @@ struct OpKernelRegistrarFunctorEx>::type; - void operator()(const char* op_type, const char* library_type) const { - RegisterKernelClass(op_type, library_type, Functor()); + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const { + RegisterKernelClass(op_type, library_type, + customized_type_value, Functor()); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctorEx= size, I + 2, DataTypeAndKernelType...> func; - func(op_type, library_type); + func(op_type, library_type, customized_type_value); } }; +// clang-format off /** * check if MACRO is used in GLOBAL NAMESPACE. */ @@ -199,42 +210,64 @@ struct OpKernelRegistrarFunctorEx \ - __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ - #library_type); \ - int TouchOpKernelRegistrar_##op_type##_##library_type() { \ - __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ - return 0; \ +#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type, \ + place_class, customized_name, \ + customized_type_value, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \ + "REGISTER_OP_KERNEL must be called in " \ + "global namespace"); \ + static ::paddle::framework::OpKernelRegistrar \ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\ + #op_type, #library_type, customized_type_value); \ + int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \ + .Touch(); \ + return 0; \ } +#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \ + REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( \ + op_type, library_type, place_class, DEFAULT_TYPE, \ + ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ + __VA_ARGS__) + #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) #define REGISTER_OP_CPU_KERNEL(op_type, ...) \ REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) -#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, ...) \ - STATIC_ASSERT_GLOBAL_NAMESPACE( \ - __reg_op_kernel_##op_type##_##library_type##__, \ - "REGISTER_OP_KERNEL_EX must be called in global namespace"); \ - static ::paddle::framework::OpKernelRegistrarEx \ - __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ - #library_type); \ - int TouchOpKernelRegistrar_##op_type##_##library_type() { \ - __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ - return 0; \ +#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \ + customized_name, \ + customized_type_value, \ + ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \ + "REGISTER_OP_KERNEL_EX must be called in " \ + "global namespace"); \ + static ::paddle::framework::OpKernelRegistrarEx \ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\ + #op_type, #library_type, customized_type_value); \ + int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \ + .Touch(); \ + return 0; \ } #define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...) \ - REGISTER_OP_KERNEL_EX(op_type, CUDA, ::paddle::platform::CUDAPlace, \ - __VA_ARGS__) + REGISTER_OP_KERNEL_EX( \ + op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE, \ + ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ + __VA_ARGS__) -#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \ - REGISTER_OP_KERNEL_EX(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) +#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \ + REGISTER_OP_KERNEL_EX( \ + op_type, CPU, ::paddle::platform::CPUPlace, DEFAULT_TYPE, \ + ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ + __VA_ARGS__) /** * Macro to mark what Operator and Kernel @@ -248,13 +281,19 @@ struct OpKernelRegistrarFunctorEx("scale", "scale of cosine op"); + AddAttr("kernel_sub_type", "kernels with different implementations.") + .SetDefault(0); AddComment("This is test op"); } }; @@ -103,11 +105,14 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { AddAttr("scale", "scale of cosine op") .SetDefault(1.0) .GreaterThan(0.0); + AddAttr("kernel_sub_type", "kernels with different implementations.") + .SetDefault(0); AddComment("This is test op"); } }; static int cpu_kernel_run_num = 0; +static int cpu_kernel2_run_num = 0; class OpWithKernelTest : public OperatorWithKernel { public: @@ -117,7 +122,10 @@ class OpWithKernelTest : public OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override {} OpKernelType GetExpectedKernelType( const ExecutionContext& ctx) const override { - return OpKernelType(proto::VarType::FP32, ctx.GetPlace()); + int sub_type = ctx.Attr("kernel_sub_type"); + return OpKernelType(proto::VarType::FP32, ctx.GetPlace(), + framework::DataLayout::kAnyLayout, + framework::LibraryType::kPlain, sub_type); } }; @@ -132,6 +140,17 @@ class CPUKernelTest : public OpKernel { } }; +template +class CPUKernel2Test : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + cpu_kernel2_run_num++; + ASSERT_EQ(ctx.op().Input("x"), "IN1"); + ASSERT_EQ(ctx.op().Output("y"), "OUT1"); + } +}; + class OpKernelTestMultiInputsProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: @@ -142,6 +161,8 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker AddAttr("scale", "scale of cosine op") .SetDefault(1.0) .GreaterThan(0.0); + AddAttr("kernel_sub_type", "kernels with different implementations.") + .SetDefault(0); AddComment("This is test op"); } }; @@ -189,8 +210,17 @@ class CPUKernalMultiInputsTest : public OpKernel { REGISTER_OP_WITHOUT_GRADIENT( op_with_kernel, paddle::framework::OpWithKernelTest, paddle::framework::OpKernelTestProtoAndCheckerMaker); -REGISTER_OP_CPU_KERNEL(op_with_kernel, - paddle::framework::CPUKernelTest); + +// REGISTER_OP_CPU_KERNEL(op_with_kernel, +// paddle::framework::CPUKernelTest); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + op_with_kernel, CPU, paddle::platform::CPUPlace, DEFAULT_TYPE, 0, + paddle::framework::CPUKernelTest); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + op_with_kernel, CPU, paddle::platform::CPUPlace, SPECIAL, 1, + paddle::framework::CPUKernel2Test); // test with single input TEST(OpKernel, all) { @@ -212,6 +242,16 @@ TEST(OpKernel, all) { ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); op->Run(scope, cpu_place); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); + ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 0); + + attr = op_desc.mutable_attrs()->Add(); + attr->set_name("kernel_sub_type"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(1); + auto op2 = paddle::framework::OpRegistry::CreateOp(op_desc); + op2->Run(scope, cpu_place); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); + ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 1); } REGISTER_OP_WITHOUT_GRADIENT( diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 05e268bf6a..ce45dd5841 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -491,8 +491,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace, - ops::ConvMKLDNNOpKernel); - -REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::ConvMKLDNNGradOpKernel); +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 342525be49..9a5dc74034 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -74,6 +74,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; framework::LibraryType library{framework::LibraryType::kPlain}; // TODO(pzelazko-intel): enable MKLDNN layout when it's ready std::string data_format = ctx.Attr("data_format"); @@ -89,6 +91,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( platform::CanMKLDNNBeUsed(ctx)) { library = framework::LibraryType::kMKLDNN; layout = framework::DataLayout::kMKLDNN; + customized_type_value = kConvMKLDNNFP32; } #endif @@ -105,7 +108,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( } return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, - library); + library, customized_type_value); } void Conv2DOpMaker::Make() { diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index e69814001e..249f308c13 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -27,6 +27,8 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +constexpr int kConvMKLDNNFP32 = 1; +constexpr int kConvMKLDNNINT8 = 2; // Base convolution operator definations for other conv // like operators to reuse the implementation. From 8b2898e20182c166537755dbf80aaad8abc2392f Mon Sep 17 00:00:00 2001 From: liuhongyu Date: Wed, 5 Dec 2018 14:19:03 +0800 Subject: [PATCH 126/719] fix bug of formate; test=develop --- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 76e8413001..dd64cc327f 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -180,13 +180,13 @@ struct CudnnRNNCache { #if CUDNN_VERSION >= 6000 CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( - handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); #else CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, - CUDNN_LINEAR_INPUT, + rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_DATA_FLOAT)); #endif From 900765224c812577a382897ac0da0951e3e3df0d Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 5 Dec 2018 14:45:11 +0800 Subject: [PATCH 127/719] fix deallocate bug test=develop --- paddle/fluid/platform/device_context.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index d0a108f905..5444fe3175 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -127,9 +127,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { return retv; } - void deallocate(void* buffer) const override { - allocations_.erase(allocations_.find(buffer)); - } + void deallocate(void* buffer) const override { allocations_.erase(buffer); } void* scratchpad() const override { if (scratch_ == NULL) { From b4db31bad07b72c1e9256df380ccf5915d8d91ec Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 5 Dec 2018 15:03:09 +0800 Subject: [PATCH 128/719] fix test test=develop --- python/paddle/fluid/framework.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index f7c86d19f7..e31b09fc7c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -991,7 +991,6 @@ class Block(object): self.desc = program.desc.block(idx) self.vars = collections.OrderedDict() # var_name --> var self.ops = list() # operator list - self._op_descs = list() self.program = program self.removed_vars = collections.OrderedDict() @@ -1244,15 +1243,11 @@ class Block(object): Returns: Operator: the append Operator. """ + op_desc = self.desc.append_op() + op = Operator(block=self, desc=op_desc, *args, **kwargs) if _in_imperative_mode(): - op_desc = core.OpDesc() - op = Operator(block=self, desc=op_desc, *args, **kwargs) _imperative_tracer().trace(op, op.inputs, op.outputs, self.desc) - else: - op_desc = self.desc.append_op() - op = Operator(block=self, desc=op_desc, *args, **kwargs) self.ops.append(op) - self._op_descs.append(op_desc) return op def _insert_op(self, index, *args, **kwargs): From 82d68281c04f4fd1078cb7e8be72bd536b9366be Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 5 Dec 2018 15:13:56 +0800 Subject: [PATCH 129/719] follow comments test=develop --- paddle/fluid/framework/operator_test.cc | 15 ++++++++------- paddle/fluid/operators/conv_op.cc | 5 ++++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index 8ba66eb4da..ab14732e4d 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -97,6 +97,8 @@ TEST(OperatorBase, all) { namespace paddle { namespace framework { +static int special_type_value = 1; + class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: void Make() { @@ -211,15 +213,12 @@ REGISTER_OP_WITHOUT_GRADIENT( op_with_kernel, paddle::framework::OpWithKernelTest, paddle::framework::OpKernelTestProtoAndCheckerMaker); -// REGISTER_OP_CPU_KERNEL(op_with_kernel, -// paddle::framework::CPUKernelTest); - -REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( - op_with_kernel, CPU, paddle::platform::CPUPlace, DEFAULT_TYPE, 0, - paddle::framework::CPUKernelTest); +REGISTER_OP_CPU_KERNEL(op_with_kernel, + paddle::framework::CPUKernelTest); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( - op_with_kernel, CPU, paddle::platform::CPUPlace, SPECIAL, 1, + op_with_kernel, CPU, paddle::platform::CPUPlace, MY_SPECIAL_NAME, + paddle::framework::special_type_value, paddle::framework::CPUKernel2Test); // test with single input @@ -241,6 +240,7 @@ TEST(OpKernel, all) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); op->Run(scope, cpu_place); + // kerne_sub_type = 0, hence cpu_kernel is called, cpu_kernel2 is not called. ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 0); @@ -250,6 +250,7 @@ TEST(OpKernel, all) { attr->set_i(1); auto op2 = paddle::framework::OpRegistry::CreateOp(op_desc); op2->Run(scope, cpu_place); + // kerne_sub_type = 1, hence cpu_kernel2 is called, cpu_kernel is not called. ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 1); } diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 9a5dc74034..7455b9492f 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -345,6 +345,8 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; framework::LibraryType library_{framework::LibraryType::kPlain}; // TODO(pzelazko-intel): enable MKLDNN layout when it's ready std::string data_format = ctx.Attr("data_format"); @@ -360,12 +362,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; layout_ = framework::DataLayout::kMKLDNN; + customized_type_value = kConvMKLDNNFP32; } #endif return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout_, library_); + layout_, library_, customized_type_value); } } // namespace operators From 77236e33fc0d8b72890b700abbe47c838989baaf Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 26 Nov 2018 05:41:51 +0000 Subject: [PATCH 130/719] init jitkernel --- paddle/fluid/operators/CMakeLists.txt | 1 + .../fluid/operators/jitkernels/CMakeLists.txt | 17 +++ paddle/fluid/operators/jitkernels/README.md | 1 + .../jitkernels/jitcode/CMakeLists.txt | 3 + .../operators/jitkernels/jitcode/jitcode.cc | 15 ++ .../operators/jitkernels/jitcode/jitcode.h | 54 +++++++ .../operators/jitkernels/jitcode_base.cc | 40 +++++ .../fluid/operators/jitkernels/jitcode_base.h | 73 +++++++++ .../fluid/operators/jitkernels/kernel_base.h | 47 ++++++ .../fluid/operators/jitkernels/kernel_key.h | 49 ++++++ paddle/fluid/operators/jitkernels/kernels.cc | 33 ++++ paddle/fluid/operators/jitkernels/kernels.h | 142 ++++++++++++++++++ .../operators/jitkernels/more/CMakeLists.txt | 7 + .../jitkernels/more/mkl/CMakeLists.txt | 3 + .../operators/jitkernels/more/mkl/mkl.cc | 44 ++++++ .../fluid/operators/jitkernels/more/mkl/mkl.h | 55 +++++++ paddle/fluid/operators/jitkernels/more/more.h | 15 ++ .../operators/jitkernels/refer/CMakeLists.txt | 3 + .../fluid/operators/jitkernels/refer/refer.cc | 20 +++ .../fluid/operators/jitkernels/refer/refer.h | 33 ++++ paddle/fluid/operators/jitkernels/registry.h | 134 +++++++++++++++++ paddle/fluid/operators/jitkernels/test.cc | 36 +++++ paddle/fluid/operators/math/CMakeLists.txt | 16 +- paddle/fluid/platform/cpu_info.h | 2 +- 24 files changed, 834 insertions(+), 9 deletions(-) create mode 100644 paddle/fluid/operators/jitkernels/CMakeLists.txt create mode 100644 paddle/fluid/operators/jitkernels/README.md create mode 100644 paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt create mode 100644 paddle/fluid/operators/jitkernels/jitcode/jitcode.cc create mode 100644 paddle/fluid/operators/jitkernels/jitcode/jitcode.h create mode 100644 paddle/fluid/operators/jitkernels/jitcode_base.cc create mode 100644 paddle/fluid/operators/jitkernels/jitcode_base.h create mode 100644 paddle/fluid/operators/jitkernels/kernel_base.h create mode 100644 paddle/fluid/operators/jitkernels/kernel_key.h create mode 100644 paddle/fluid/operators/jitkernels/kernels.cc create mode 100644 paddle/fluid/operators/jitkernels/kernels.h create mode 100644 paddle/fluid/operators/jitkernels/more/CMakeLists.txt create mode 100644 paddle/fluid/operators/jitkernels/more/mkl/CMakeLists.txt create mode 100644 paddle/fluid/operators/jitkernels/more/mkl/mkl.cc create mode 100644 paddle/fluid/operators/jitkernels/more/mkl/mkl.h create mode 100644 paddle/fluid/operators/jitkernels/more/more.h create mode 100644 paddle/fluid/operators/jitkernels/refer/CMakeLists.txt create mode 100644 paddle/fluid/operators/jitkernels/refer/refer.cc create mode 100644 paddle/fluid/operators/jitkernels/refer/refer.h create mode 100644 paddle/fluid/operators/jitkernels/registry.h create mode 100644 paddle/fluid/operators/jitkernels/test.cc diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 8c8dc7026e..3458df1606 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -16,6 +16,7 @@ add_subdirectory(metrics) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) +add_subdirectory(jitkernels) if(WITH_DISTRIBUTE) add_subdirectory(distributed) diff --git a/paddle/fluid/operators/jitkernels/CMakeLists.txt b/paddle/fluid/operators/jitkernels/CMakeLists.txt new file mode 100644 index 0000000000..f073210542 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/CMakeLists.txt @@ -0,0 +1,17 @@ + +set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) + +cc_library(jit_kernel_base SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS}) + +add_subdirectory(more) +add_subdirectory(refer) + +if(WITH_XBYAK) + add_subdirectory(jitcode) +endif() + +# Debug +message(STATUS "--------${JIT_KERNEL_DEPS}") + +cc_library(jit_kernel SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS}) +cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/jitkernels/README.md b/paddle/fluid/operators/jitkernels/README.md new file mode 100644 index 0000000000..a0990367ef --- /dev/null +++ b/paddle/fluid/operators/jitkernels/README.md @@ -0,0 +1 @@ +TBD diff --git a/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt b/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt new file mode 100644 index 0000000000..1a5e457309 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt @@ -0,0 +1,3 @@ + +cc_library(jit_kernel_jitcode SRCS jitcode.cc DEPS jit_kernel_base xbyak) +set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE) diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc b/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc new file mode 100644 index 0000000000..0dd2d049d2 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc @@ -0,0 +1,15 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h" diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h new file mode 100644 index 0000000000..c100444766 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/operators/jitkernels/kernels.h" + +#define XBYAK_USE_MMAP_ALLOCATOR +#include "xbyak/xbyak.h" +#include "xbyak/xbyak_util.h" + +namespace paddle { +namespace operators { +namespace jitkernels { +namespace jitcode { + +// Application Binary Interface +constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI), + abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX), + abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX); + +template +class JitCode : public JitBase, public Xbyak::CodeGenerator { + public: + JitCode(Attr attr, size_t code_size, void* code_ptr = nullptr) + : Xbyak::CodeGenerator(code_size, code_ptr) { + this->genCode(); + } + + virtual const char* name() const = 0; + virtual void genCode() = 0; + + const unsigned char* getCodeInternal() override { + const Xbyak::uint8* code = CodeGenerator::getCode(); + return code; + } +}; + +} // namespace jitcode +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.cc b/paddle/fluid/operators/jitkernels/jitcode_base.cc new file mode 100644 index 0000000000..417c4d4b9e --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode_base.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jitkernels/jitcode_base.h" + +DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); + +namespace paddle { +namespace operators { +namespace jitkernels { + +// refer do not need useme, it would be the last one. +void JitBase::dumpCode(const unsigned char* code) const { + if (code) { + static int counter = 0; + std::ostringstream filename; + filename << "paddle_jitcode_" << name() << "." << counter << ".bin"; + counter++; + std::ofstream fout(filename.str(), std::ios::out); + if (fout.is_open()) { + fout.write(reinterpret_cast(code), getSize()); + fout.close(); + } + } +} + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.h b/paddle/fluid/operators/jitkernels/jitcode_base.h new file mode 100644 index 0000000000..0cd6d3c741 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode_base.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/platform/macros.h" + +DECLARE_bool(dump_jitcode); + +namespace paddle { +namespace operators { +namespace jitkernels { + +// TODO(TJ): make these functions as virtual of a class + +// Every JitCode should estimate the code size itself +template +size_t CodeSize(Attr attr) { + return 4096; +} + +// Every JitCode should have a condition when to use this JitCode +template +bool UseJitCode(Attr attr) { + return false; +} + +// Every JitCode should have a method to get the key from attribution +template +size_t GetKey(Attr attr); + +template <> +size_t GetKey(int d) { + return d; +} + +class JitBase { + public: + JitBase() = default; + virtual ~JitBase() = default; + virtual const char* name() const = 0; + virtual const unsigned char* getCodeInternal() = 0; + + template + const FUNC getCode() { + const unsigned char* code = this->getCodeInternal(); + if (FLAGS_dump_jitcode) { + this->dumpCode(code); + } + return reinterpret_cast(code); + } + DISABLE_COPY_AND_ASSIGN(JitBase); + + protected: + void dumpCode(const unsigned char* code); +}; + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_base.h b/paddle/fluid/operators/jitkernels/kernel_base.h new file mode 100644 index 0000000000..bd95a921c5 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/kernel_base.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { +namespace jitkernels { + +typedef enum { vmul = 0, vadd = 1, vsub, vexp } KernelType; + +// Just for adding to kernel pool without template +class Kernel { + public: + Kernel() = default; + DISABLE_COPY_AND_ASSIGN(Kernel); +}; + +template // TODO(TJ): use tuple +class KernelImpl : public Kernel { + public: + using ELEMENT_TYPE = T; // TODO(TJ): remove me? + KernelImpl() = default; + virtual ~KernelImpl() = default; + + virtual Func GetFunc() { return func; } + virtual bool UseMe(Attr attr) const = 0; + + protected: + Func func{nullptr}; +}; + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_key.h b/paddle/fluid/operators/jitkernels/kernel_key.h new file mode 100644 index 0000000000..e06c2b58da --- /dev/null +++ b/paddle/fluid/operators/jitkernels/kernel_key.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace jitkernels { + +struct KernelKey { + struct Hash { + size_t operator()(const KernelKey& key) const { + int place = key.place_.which(); // less than 2^8 + int type = static_cast(key.type_) << 8; // less than 2^(32-8) + std::hash hasher; + return hasher(place + type); + } + }; + + KernelType type_; + platform::Place place_; + + KernelKey(KernelType type, platform::Place place) + : type_(type), place_(place) {} + size_t hash_key() const { return Hash()(*this); } + + bool operator==(const KernelKey& o) const { + return platform::places_are_same_class(place_, o.place_) && + type_ == o.type_; + } + bool operator!=(const KernelKey& o) const { return !(*this == o); } +}; + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernels.cc b/paddle/fluid/operators/jitkernels/kernels.cc new file mode 100644 index 0000000000..76f49514ee --- /dev/null +++ b/paddle/fluid/operators/jitkernels/kernels.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jitkernels/kernels.h" +#include // for shared_ptr +#include +#include + +namespace paddle { +namespace operators { +namespace jitkernels { + +// refer do not need useme, it would be the last one. + +KernelPool& KernelPool::Instance() { + static KernelPool g_kernel_pool; + return g_kernel_pool; +} + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernels.h b/paddle/fluid/operators/jitkernels/kernels.h new file mode 100644 index 0000000000..2792b897d3 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/kernels.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include // for shared_ptr +#include +#include +#include + +#include "paddle/fluid/operators/jitkernels/jitcode_base.h" +#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/operators/jitkernels/kernel_key.h" + +#ifdef PADDLE_WITH_XBYAK +#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h" +#endif + +namespace paddle { +namespace operators { +namespace jitkernels { + +template +class JitCodePool { + public: + static JitCodePool& Instance() { + static thread_local JitCodePool g_jit_codes; + return g_jit_codes; + } + + std::shared_ptr Get(size_t key) const { + if (codes_.find(key) == codes_.end()) { + return nullptr; + } + return codes_.at(key); + } + + void Insert(size_t key, const std::shared_ptr& value) { + codes_.insert({key, value}); + } + + private: + JitCodePool() = default; + std::unordered_map> codes_; + + DISABLE_COPY_AND_ASSIGN(JitCodePool); +}; + +// std::tuple +template +struct KernelAttr { + typedef T data_type; + typedef Func return_type; + typedef Attr attr_type; +}; + +class KernelPool { + public: + static KernelPool& Instance(); + + typedef std::unique_ptr KernelPtr; + typedef std::unordered_map, KernelKey::Hash> + KernelMap; + KernelMap& AllKernels() { return pool_; } + + void Insert(const KernelKey& key, KernelPtr value) { + if (pool_.find(key) == pool_.end()) { + pool_.emplace(key, std::vector()); + } + pool_.at(key).emplace_back(std::move(value)); + } + KernelPool() = default; + + private: + KernelMap pool_; + + DISABLE_COPY_AND_ASSIGN(KernelPool); +}; + +// TODO(TJ): create_jitcode; + +// TODO(TJ): make tuple? named KernelAttr +template +Func Get(Attr attr) { + size_t key = GetKey(attr); + auto jitcode = JitCodePool().Instance().Get(key); + if (jitcode) { + return jitcode->template getCode(); + } + +#ifdef PADDLE_WITH_XBYAK +// // jitcode::JitCode is under protection of PADDLE_WITH_XBYAK +// if (std::is_same::value) { +// if (UseJitCode(attr)) { +// std::shared_ptr p(std::make_shared>( +// attr, CodeSize(attr))); +// JitCodePool().Instance().Insert(key, p); +// return p->getCode(); +// } +// } +#endif + + // (KernelKey(type, place), vector) + auto& pool = KernelPool().Instance().AllKernels(); + KernelKey kkey(KT, PlaceType()); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto impls = iter->second; + for (auto impl : impls) { + auto i = std::dynamic_pointer_cast>(impl.get()); + if (i && i->UseMe(attr)) { + return i->GetFunc(); + } + } + } + + // The last implementation should be reference function on CPU + // Every kernel should have refer code. + + // because of test refer should have it's own pool + // PADDLE_ENFORCE_GT(list.size(), 1) << "Should have refer implemtation"; + // const auto& refer = KernelRefer().AllKernels(); + // return refer.Get(); + + return nullptr; +} + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/more/CMakeLists.txt b/paddle/fluid/operators/jitkernels/more/CMakeLists.txt new file mode 100644 index 0000000000..84f1811ced --- /dev/null +++ b/paddle/fluid/operators/jitkernels/more/CMakeLists.txt @@ -0,0 +1,7 @@ + + +if(WITH_MKLML) + add_subdirectory(mkl) +endif() + +set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/jitkernels/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jitkernels/more/mkl/CMakeLists.txt new file mode 100644 index 0000000000..94d2487866 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/more/mkl/CMakeLists.txt @@ -0,0 +1,3 @@ + +cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml) +set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE) diff --git a/paddle/fluid/operators/jitkernels/more/mkl/mkl.cc b/paddle/fluid/operators/jitkernels/more/mkl/mkl.cc new file mode 100644 index 0000000000..88a7d66194 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/more/mkl/mkl.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jitkernels/more/mkl/mkl.h" +#include "paddle/fluid/operators/jitkernels/registry.h" +#include "paddle/fluid/platform/dynload/mklml.h" + +namespace paddle { +namespace operators { +namespace jitkernels { +namespace more { +namespace mkl { + +template <> +void VMul(const float* x, const float* y, float* z, int n) { + platform::dynload::vsMul(n, x, y, z); +} + +template <> +void VMul(const double* x, const double* y, double* z, int n) { + platform::dynload::vdMul(n, x, y, z); +} + +} // namespace mkl +} // namespace more +} // namespace jitkernels +} // namespace operators +} // namespace paddle + +namespace mkl = paddle::operators::jitkernels::more::mkl; + +REGISTER_JITKERNEL_MORE(vmul, mkl, mkl::VMulKernel, + mkl::VMulKernel); diff --git a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h b/paddle/fluid/operators/jitkernels/more/mkl/mkl.h new file mode 100644 index 0000000000..7cb4334e50 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/more/mkl/mkl.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jitkernels { +namespace more { +namespace mkl { + +template +void VMul(const T* x, const T* y, T* z, int n); + +// template +// struct VMulTypes{ +// typedef T date_type; +// typedef void (*func)(const T*, const T*, T*, int) func_type; +// typedef int attr_type; +// }; + +template +class VMulKernel + : public KernelImpl { + public: + VMulKernel() { this->func = VMul; } + bool UseMe(int d) const override { + if (std::is_same::value) { + return platform::jit::MayIUse(platform::jit::avx512f) && d > 512; + } else { + return true; + } + } +}; + +} // namespace mkl +} // namespace more +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/more/more.h b/paddle/fluid/operators/jitkernels/more/more.h new file mode 100644 index 0000000000..ab99fdc05f --- /dev/null +++ b/paddle/fluid/operators/jitkernels/more/more.h @@ -0,0 +1,15 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once diff --git a/paddle/fluid/operators/jitkernels/refer/CMakeLists.txt b/paddle/fluid/operators/jitkernels/refer/CMakeLists.txt new file mode 100644 index 0000000000..8c116e42dc --- /dev/null +++ b/paddle/fluid/operators/jitkernels/refer/CMakeLists.txt @@ -0,0 +1,3 @@ + +cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base) +set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE) diff --git a/paddle/fluid/operators/jitkernels/refer/refer.cc b/paddle/fluid/operators/jitkernels/refer/refer.cc new file mode 100644 index 0000000000..1f6d384fc2 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/refer/refer.cc @@ -0,0 +1,20 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jitkernels/refer/refer.h" +#include "paddle/fluid/operators/jitkernels/registry.h" + +namespace refer = paddle::operators::jitkernels::refer; + +// REGISTER_JITKERNEL_REFER(vmul, refer::VMul, refer::VMul); diff --git a/paddle/fluid/operators/jitkernels/refer/refer.h b/paddle/fluid/operators/jitkernels/refer/refer.h new file mode 100644 index 0000000000..be55c30b1e --- /dev/null +++ b/paddle/fluid/operators/jitkernels/refer/refer.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace jitkernels { +namespace refer { + +template +void VMul(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +} // namespace refer +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/registry.h b/paddle/fluid/operators/jitkernels/registry.h new file mode 100644 index 0000000000..1d2d47a804 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/registry.h @@ -0,0 +1,134 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/operators/jitkernels/kernels.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace jitkernels { + +// make_unique is supported from c++14 +template +inline std::unique_ptr make_unique(Args&&... args) { + static_assert(!std::is_array::value, "T must not be array"); + return std::unique_ptr(new T(std::forward(args)...)); +} + +template +struct JitKernelRegistrarFunctor; + +template +struct JitKernelRegistrarFunctor { + void operator()(KernelType kt) const {} +}; + +template +struct JitKernelRegistrarFunctor { + using KERNEL_IMPL_TYPE = + typename std::tuple_element>::type; + + void operator()(KernelType kt) const { + KernelKey kkey(kt, PlaceType()); + KernelPool().Instance().Insert( + kkey, std::move(make_unique())); + constexpr auto size = std::tuple_size>::value; + JitKernelRegistrarFunctor + func; + func(kt); + } +}; + +template +class JitKernelRegistrar { + public: + explicit JitKernelRegistrar(KernelType kt) { + JitKernelRegistrarFunctor func; + func(kt); + } +}; + +#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +// kernel_type: should be in paddle::operators::jitkernels::KernelType +// place_type: should be one of CPUPlace and GPUPlace in paddle::platform +#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \ + "REGISTER_KERNEL_MORE must be called in global namespace"); \ + static ::paddle::operators::jitkernels::JitKernelRegistrar< \ + ::paddle::platform::place_type, __VA_ARGS__> \ + __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##__( \ + ::paddle::operators::jitkernels::KernelType::kernel_type) +// TODO(TJ): Add Touch and use me + +#define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \ + REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) + +#define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \ + REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) + +/* +REGISTER_JITKERNEL_JITCODE(vmul, JitKernelCode); + +// refer must be only one and at least one +REGISTER_JITKERNEL_REFER(vmul, VMul); // Refer need support dtype + +// you can register more implementations and the condition when use it +REGISTER_JITKERNEL_MORE(vmul, mkl::VMUL, UseMe, mkl::VMUL, + UseMe) + +#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +// Register a new pass that can be applied on the IR. +#define REGISTER_PASS(pass_type, pass_class) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __reg_pass__##pass_type, \ + "REGISTER_PASS must be called in global namespace"); \ + static ::paddle::framework::ir::PassRegistrar \ + __pass_registrar_##pass_type##__(#pass_type); \ + int TouchPassRegistrar_##pass_type() { \ + __pass_registrar_##pass_type##__.Touch(); \ + return 0; \ + } \ + static ::paddle::framework::ir::PassRegistrar& \ + __pass_tmp_registrar_##pass_type##__ UNUSED = \ + __pass_registrar_##pass_type##__ + +#define USE_PASS(pass_type) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __use_pass_itself_##pass_type, \ + "USE_PASS must be called in global namespace"); \ + extern int TouchPassRegistrar_##pass_type(); \ + static int use_pass_itself_##pass_type##_ UNUSED = \ + TouchPassRegistrar_##pass_type() +*/ + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/test.cc b/paddle/fluid/operators/jitkernels/test.cc new file mode 100644 index 0000000000..86c6669173 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/test.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include // for memcpy +#include +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/operators/math/jit_kernel.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" +#include "paddle/fluid/platform/port.h" + +constexpr int repeat = 20000; + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +TEST(JitKernel, vmul) {} + +TEST(JitKernel, pool) {} diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 63363086ad..db91628d94 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -73,11 +73,11 @@ endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) -set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc) -set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) -if(WITH_XBYAK) - list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) - list(APPEND JIT_KERNEL_DEPS xbyak) -endif() -cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) -cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) +# set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc) +# set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) +# if(WITH_XBYAK) +# list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) +# list(APPEND JIT_KERNEL_DEPS xbyak) +# endif() +# cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) +# cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index fd31ef77b4..663a9fbf4e 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -39,7 +39,7 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); -namespace jit { +namespace jit { // remove this namespace typedef enum { isa_any, sse42, From 45bfa70cb8a8123cfa5c32ec7323d616f9192e3d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 3 Dec 2018 12:15:16 +0000 Subject: [PATCH 131/719] complete vmul jit kernel --- .../fluid/operators/jitkernels/CMakeLists.txt | 12 +- paddle/fluid/operators/jitkernels/README.md | 3 + .../operators/jitkernels/jitcode/jitcode.cc | 23 ++++ .../operators/jitkernels/jitcode/jitcode.h | 7 +- .../fluid/operators/jitkernels/jitcode_base.h | 9 +- .../fluid/operators/jitkernels/kernel_base.h | 13 +- paddle/fluid/operators/jitkernels/kernels.cc | 7 +- paddle/fluid/operators/jitkernels/kernels.h | 110 ++++++++------- .../fluid/operators/jitkernels/refer/refer.cc | 3 +- .../fluid/operators/jitkernels/refer/refer.h | 8 ++ paddle/fluid/operators/jitkernels/registry.h | 126 ++++++++++-------- paddle/fluid/operators/jitkernels/test.cc | 78 ++++++++++- 12 files changed, 273 insertions(+), 126 deletions(-) diff --git a/paddle/fluid/operators/jitkernels/CMakeLists.txt b/paddle/fluid/operators/jitkernels/CMakeLists.txt index f073210542..6392d82e16 100644 --- a/paddle/fluid/operators/jitkernels/CMakeLists.txt +++ b/paddle/fluid/operators/jitkernels/CMakeLists.txt @@ -1,17 +1,19 @@ +# set(use_jit_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/jit/kernels.h) +# file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n") +# file(APPEND ${pass_file} "\#pragma once\n") +# file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") + + set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) cc_library(jit_kernel_base SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS}) -add_subdirectory(more) add_subdirectory(refer) - +add_subdirectory(more) if(WITH_XBYAK) add_subdirectory(jitcode) endif() -# Debug -message(STATUS "--------${JIT_KERNEL_DEPS}") - cc_library(jit_kernel SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS}) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/jitkernels/README.md b/paddle/fluid/operators/jitkernels/README.md index a0990367ef..3401e9be53 100644 --- a/paddle/fluid/operators/jitkernels/README.md +++ b/paddle/fluid/operators/jitkernels/README.md @@ -1 +1,4 @@ TBD + +# Use me +Add USE_JIT_KERNEL(yourname) to CMakefile. diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc b/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc index 0dd2d049d2..8078ace7a8 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc +++ b/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc @@ -13,3 +13,26 @@ * limitations under the License. */ #include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h" + +namespace paddle { +namespace operators { +namespace jitkernels { + +template <> +size_t GetKey(int d) { + return d; +} + +// template <> +// std::shared_ptr CreateJitCode(int attr) +// { +// if (UseJitCode(attr)) { +// return std::make_shared>(attr, +// CodeSize(attr))); +// } +// return nullptr; +// } + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h index c100444766..7e0b6442ed 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h +++ b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h @@ -15,6 +15,7 @@ #pragma once #include +#include "paddle/fluid/operators/jitkernels/jitcode_base.h" #include "paddle/fluid/operators/jitkernels/kernels.h" #define XBYAK_USE_MMAP_ALLOCATOR @@ -31,10 +32,10 @@ constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI), abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX), abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX); -template -class JitCode : public JitBase, public Xbyak::CodeGenerator { +template +class VMulJitCode : public JitBase, public Xbyak::CodeGenerator { public: - JitCode(Attr attr, size_t code_size, void* code_ptr = nullptr) + VMulJitCode(Attr attr, size_t code_size, void* code_ptr = nullptr) : Xbyak::CodeGenerator(code_size, code_ptr) { this->genCode(); } diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.h b/paddle/fluid/operators/jitkernels/jitcode_base.h index 0cd6d3c741..a164746561 100644 --- a/paddle/fluid/operators/jitkernels/jitcode_base.h +++ b/paddle/fluid/operators/jitkernels/jitcode_base.h @@ -15,6 +15,7 @@ #pragma once #include +#include // for shared_ptr #include "paddle/fluid/operators/jitkernels/kernel_base.h" #include "paddle/fluid/platform/macros.h" @@ -42,11 +43,6 @@ bool UseJitCode(Attr attr) { template size_t GetKey(Attr attr); -template <> -size_t GetKey(int d) { - return d; -} - class JitBase { public: JitBase() = default; @@ -68,6 +64,9 @@ class JitBase { void dumpCode(const unsigned char* code); }; +template +std::shared_ptr CreateJitCode(Attr attr); + } // namespace jitkernels } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_base.h b/paddle/fluid/operators/jitkernels/kernel_base.h index bd95a921c5..eeaa0617cb 100644 --- a/paddle/fluid/operators/jitkernels/kernel_base.h +++ b/paddle/fluid/operators/jitkernels/kernel_base.h @@ -25,6 +25,7 @@ typedef enum { vmul = 0, vadd = 1, vsub, vexp } KernelType; class Kernel { public: Kernel() = default; + virtual ~Kernel() = default; DISABLE_COPY_AND_ASSIGN(Kernel); }; @@ -32,16 +33,20 @@ template // TODO(TJ): use tuple class KernelImpl : public Kernel { public: using ELEMENT_TYPE = T; // TODO(TJ): remove me? - KernelImpl() = default; - virtual ~KernelImpl() = default; - - virtual Func GetFunc() { return func; } + virtual Func GetFunc() const { return func; } virtual bool UseMe(Attr attr) const = 0; protected: Func func{nullptr}; }; +template // TODO(TJ): use tuple +class ReferKernel : public KernelImpl { + public: + // Refer code can always be used + bool UseMe(Attr attr) const override { return true; } +}; + } // namespace jitkernels } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernels.cc b/paddle/fluid/operators/jitkernels/kernels.cc index 76f49514ee..35095220e3 100644 --- a/paddle/fluid/operators/jitkernels/kernels.cc +++ b/paddle/fluid/operators/jitkernels/kernels.cc @@ -21,13 +21,16 @@ namespace paddle { namespace operators { namespace jitkernels { -// refer do not need useme, it would be the last one. - KernelPool& KernelPool::Instance() { static KernelPool g_kernel_pool; return g_kernel_pool; } +ReferKernelPool& ReferKernelPool::Instance() { + static ReferKernelPool g_refer_kernel_pool; + return g_refer_kernel_pool; +} + } // namespace jitkernels } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernels.h b/paddle/fluid/operators/jitkernels/kernels.h index 2792b897d3..866f72cce0 100644 --- a/paddle/fluid/operators/jitkernels/kernels.h +++ b/paddle/fluid/operators/jitkernels/kernels.h @@ -18,22 +18,21 @@ #include #include #include - #include "paddle/fluid/operators/jitkernels/jitcode_base.h" #include "paddle/fluid/operators/jitkernels/kernel_base.h" #include "paddle/fluid/operators/jitkernels/kernel_key.h" - -#ifdef PADDLE_WITH_XBYAK -#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h" -#endif +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { namespace jitkernels { +// TODO(TJ): rename file to kernel_pool + template class JitCodePool { public: + JitCodePool() = default; static JitCodePool& Instance() { static thread_local JitCodePool g_jit_codes; return g_jit_codes; @@ -51,13 +50,11 @@ class JitCodePool { } private: - JitCodePool() = default; std::unordered_map> codes_; - DISABLE_COPY_AND_ASSIGN(JitCodePool); }; -// std::tuple +// TODO(TJ): std::tuple template struct KernelAttr { typedef T data_type; @@ -65,76 +62,99 @@ struct KernelAttr { typedef Attr attr_type; }; +typedef std::unique_ptr KernelPtr; +typedef std::unordered_map, KernelKey::Hash> + KernelMap; + class KernelPool { public: static KernelPool& Instance(); - - typedef std::unique_ptr KernelPtr; - typedef std::unordered_map, KernelKey::Hash> - KernelMap; + KernelPool() = default; KernelMap& AllKernels() { return pool_; } - void Insert(const KernelKey& key, KernelPtr value) { if (pool_.find(key) == pool_.end()) { pool_.emplace(key, std::vector()); } pool_.at(key).emplace_back(std::move(value)); } - KernelPool() = default; private: KernelMap pool_; - DISABLE_COPY_AND_ASSIGN(KernelPool); }; -// TODO(TJ): create_jitcode; +// Every kernel should have refer code and it should be used in unit tests, +// so refer kernels should have it's independent kernel pool +class ReferKernelPool { + public: + static ReferKernelPool& Instance(); + ReferKernelPool() = default; + KernelMap& AllKernels() { return pool_; } + void Insert(const KernelKey& key, KernelPtr value) { + if (pool_.find(key) == pool_.end()) { + pool_.emplace(key, std::vector()); + } + pool_.at(key).emplace_back(std::move(value)); + } + + private: + KernelMap pool_; + DISABLE_COPY_AND_ASSIGN(ReferKernelPool); +}; + +// Refer code do not related with attr, and always on CPUPlace +template +inline Func GetRefer() { + auto& ref_pool = ReferKernelPool().Instance().AllKernels(); + KernelKey kkey(KT, platform::CPUPlace()); + auto ref_iter = ref_pool.find(kkey); + PADDLE_ENFORCE(ref_iter != ref_pool.end(), + "Every Kernel should have reference function."); + auto& ref_impls = ref_iter->second; + for (auto& impl : ref_impls) { + auto i = dynamic_cast*>(impl.get()); + if (i) { + return i->GetFunc(); + } + } + return nullptr; +} // TODO(TJ): make tuple? named KernelAttr template Func Get(Attr attr) { - size_t key = GetKey(attr); - auto jitcode = JitCodePool().Instance().Get(key); - if (jitcode) { - return jitcode->template getCode(); + // size_t key = GetKey(attr); + // auto jitcode = JitCodePool().Instance().Get(key); + // if (jitcode) { + // return jitcode->template getCode(); + // } + + if (std::is_same::value && + std::is_same::value) { // TODO(TJ): float move to create + // auto p = CreateJitCode(attr); + // if (p) { + // JitCodePool().Instance().Insert(key, p); + // return p->template getCode(); + // } } -#ifdef PADDLE_WITH_XBYAK -// // jitcode::JitCode is under protection of PADDLE_WITH_XBYAK -// if (std::is_same::value) { -// if (UseJitCode(attr)) { -// std::shared_ptr p(std::make_shared>( -// attr, CodeSize(attr))); -// JitCodePool().Instance().Insert(key, p); -// return p->getCode(); -// } -// } -#endif - - // (KernelKey(type, place), vector) + // pool: (KernelKey(type, place), vector) auto& pool = KernelPool().Instance().AllKernels(); KernelKey kkey(KT, PlaceType()); auto iter = pool.find(kkey); if (iter != pool.end()) { - auto impls = iter->second; - for (auto impl : impls) { - auto i = std::dynamic_pointer_cast>(impl.get()); + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast*>(impl.get()); if (i && i->UseMe(attr)) { return i->GetFunc(); } } } - // The last implementation should be reference function on CPU - // Every kernel should have refer code. - - // because of test refer should have it's own pool - // PADDLE_ENFORCE_GT(list.size(), 1) << "Should have refer implemtation"; - // const auto& refer = KernelRefer().AllKernels(); - // return refer.Get(); - - return nullptr; + // The last implementation should be reference function on CPUPlace. + return GetRefer(); } } // namespace jitkernels diff --git a/paddle/fluid/operators/jitkernels/refer/refer.cc b/paddle/fluid/operators/jitkernels/refer/refer.cc index 1f6d384fc2..dbccac896c 100644 --- a/paddle/fluid/operators/jitkernels/refer/refer.cc +++ b/paddle/fluid/operators/jitkernels/refer/refer.cc @@ -17,4 +17,5 @@ namespace refer = paddle::operators::jitkernels::refer; -// REGISTER_JITKERNEL_REFER(vmul, refer::VMul, refer::VMul); +REGISTER_JITKERNEL_REFER(vmul, refer::VMulKernel, + refer::VMulKernel); diff --git a/paddle/fluid/operators/jitkernels/refer/refer.h b/paddle/fluid/operators/jitkernels/refer/refer.h index be55c30b1e..163c6d73dc 100644 --- a/paddle/fluid/operators/jitkernels/refer/refer.h +++ b/paddle/fluid/operators/jitkernels/refer/refer.h @@ -13,6 +13,7 @@ * limitations under the License. */ #pragma once +#include "paddle/fluid/operators/jitkernels/kernel_base.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -27,6 +28,13 @@ void VMul(const T* x, const T* y, T* z, int n) { } } +template +class VMulKernel + : public ReferKernel { + public: + VMulKernel() { this->func = VMul; } +}; + } // namespace refer } // namespace jitkernels } // namespace operators diff --git a/paddle/fluid/operators/jitkernels/registry.h b/paddle/fluid/operators/jitkernels/registry.h index 1d2d47a804..62a0de3641 100644 --- a/paddle/fluid/operators/jitkernels/registry.h +++ b/paddle/fluid/operators/jitkernels/registry.h @@ -20,6 +20,7 @@ #include "paddle/fluid/operators/jitkernels/kernel_base.h" #include "paddle/fluid/operators/jitkernels/kernels.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/variant.h" // for UNUSED namespace paddle { namespace operators { @@ -32,37 +33,40 @@ inline std::unique_ptr make_unique(Args&&... args) { return std::unique_ptr(new T(std::forward(args)...)); } -template +template struct JitKernelRegistrarFunctor; -template -struct JitKernelRegistrarFunctor { +template +struct JitKernelRegistrarFunctor { void operator()(KernelType kt) const {} }; -template -struct JitKernelRegistrarFunctor { +template +struct JitKernelRegistrarFunctor { using KERNEL_IMPL_TYPE = typename std::tuple_element>::type; void operator()(KernelType kt) const { KernelKey kkey(kt, PlaceType()); - KernelPool().Instance().Insert( - kkey, std::move(make_unique())); + Pool().Instance().Insert(kkey, + std::move(make_unique())); constexpr auto size = std::tuple_size>::value; - JitKernelRegistrarFunctor + JitKernelRegistrarFunctor func; func(kt); } }; -template +template class JitKernelRegistrar { public: explicit JitKernelRegistrar(KernelType kt) { - JitKernelRegistrarFunctor func; + JitKernelRegistrarFunctor func; func(kt); } + void Touch() {} }; #define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ @@ -71,17 +75,40 @@ class JitKernelRegistrar { __test_global_namespace_##uniq_name##__>::value, \ msg) +// Refer always on CPUPlace +#define REGISTER_JITKERNEL_REFER(kernel_type, ...) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_##kernel_type##_refer_CPUPlace, \ + "REGISTER_KERNEL_REFER must be called in global namespace"); \ + static ::paddle::operators::jitkernels::JitKernelRegistrar< \ + ::paddle::operators::jitkernels::ReferKernelPool, \ + ::paddle::platform::CPUPlace, __VA_ARGS__> \ + __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \ + ::paddle::operators::jitkernels::KernelType::kernel_type); \ + int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ + __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \ + return 0; \ + } + // kernel_type: should be in paddle::operators::jitkernels::KernelType // place_type: should be one of CPUPlace and GPUPlace in paddle::platform -#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \ - "REGISTER_KERNEL_MORE must be called in global namespace"); \ - static ::paddle::operators::jitkernels::JitKernelRegistrar< \ - ::paddle::platform::place_type, __VA_ARGS__> \ - __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##__( \ - ::paddle::operators::jitkernels::KernelType::kernel_type) -// TODO(TJ): Add Touch and use me +#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \ + "REGISTER_KERNEL_MORE must be called in global namespace"); \ + extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \ + UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static ::paddle::operators::jitkernels::JitKernelRegistrar< \ + ::paddle::operators::jitkernels::KernelPool, \ + ::paddle::platform::place_type, __VA_ARGS__> \ + __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_( \ + ::paddle::operators::jitkernels::KernelType::kernel_type); \ + int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ + __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_ \ + .Touch(); \ + return 0; \ + } #define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \ REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) @@ -89,45 +116,28 @@ class JitKernelRegistrar { #define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \ REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) -/* -REGISTER_JITKERNEL_JITCODE(vmul, JitKernelCode); - -// refer must be only one and at least one -REGISTER_JITKERNEL_REFER(vmul, VMul); // Refer need support dtype - -// you can register more implementations and the condition when use it -REGISTER_JITKERNEL_MORE(vmul, mkl::VMUL, UseMe, mkl::VMUL, - UseMe) - -#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg) \ - struct __test_global_namespace_##uniq_name##__ {}; \ - static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ - __test_global_namespace_##uniq_name##__>::value, \ - msg) - -// Register a new pass that can be applied on the IR. -#define REGISTER_PASS(pass_type, pass_class) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __reg_pass__##pass_type, \ - "REGISTER_PASS must be called in global namespace"); \ - static ::paddle::framework::ir::PassRegistrar \ - __pass_registrar_##pass_type##__(#pass_type); \ - int TouchPassRegistrar_##pass_type() { \ - __pass_registrar_##pass_type##__.Touch(); \ - return 0; \ - } \ - static ::paddle::framework::ir::PassRegistrar& \ - __pass_tmp_registrar_##pass_type##__ UNUSED = \ - __pass_registrar_##pass_type##__ - -#define USE_PASS(pass_type) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __use_pass_itself_##pass_type, \ - "USE_PASS must be called in global namespace"); \ - extern int TouchPassRegistrar_##pass_type(); \ - static int use_pass_itself_##pass_type##_ UNUSED = \ - TouchPassRegistrar_##pass_type() -*/ +// REGISTER_JITKERNEL_JITCODE(vmul, JitKernelCode); + +#define USE_JITKERNEL_REFER(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_##kernel_type##_refer_CPUPlace_, \ + "USE_JITKERNEL_REFER must be called in global namespace"); \ + extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int use_jitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \ + TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() + +#define USE_KERNEL_MORE(kernel_type, impl_type, place_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_##kernel_type##_##impl_type##_##place_type##_, \ + "USE_JITKERNEL_MORE must be called in global namespace"); \ + extern int \ + TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \ + static int use_jitkernel_##kernel_type##_##impl_type##_##place_type##_ \ + UNUSED = \ + TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() + +#define USE_JITKERNEL_MORE(kernel_type, impl_type) \ + USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace) } // namespace jitkernels } // namespace operators diff --git a/paddle/fluid/operators/jitkernels/test.cc b/paddle/fluid/operators/jitkernels/test.cc index 86c6669173..d11c7afe9a 100644 --- a/paddle/fluid/operators/jitkernels/test.cc +++ b/paddle/fluid/operators/jitkernels/test.cc @@ -19,8 +19,11 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/operators/math/jit_kernel.h" -#include "paddle/fluid/operators/math/jit_kernel_refer.h" +#include "paddle/fluid/operators/jitkernels/kernels.h" +// TODO(TJ): remove me +#include "paddle/fluid/operators/jitkernels/registry.h" + +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/port.h" constexpr int repeat = 20000; @@ -31,6 +34,75 @@ inline double GetCurrentUS() { return 1e+6 * time.tv_sec + time.tv_usec; } -TEST(JitKernel, vmul) {} +template +void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), + const T upper = static_cast(20.f)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + +template +void ExpectEQ(const T* target, const T* refer, int n) { + if (std::is_floating_point::value) { + for (int i = 0; i < n; ++i) { + EXPECT_NEAR(target[i], refer[i], 1e-3); + } + } else { + for (int i = 0; i < n; ++i) { + EXPECT_EQ(target[i], refer[i]); + } + } +} + +// TODO(TJ): remove me +USE_JITKERNEL_MORE(vmul, mkl); +USE_JITKERNEL_REFER(vmul); + +TEST(JitKernel, vmul) { + using T = float; + using PlaceType = paddle::platform::CPUPlace; + + namespace jit = paddle::operators::jitkernels; + // TODO(TJ): test more vector size + for (int d = 1; d < 30; ++d) { + auto ref = jit::GetRefer(); + auto tgt = jit::Get(d); + EXPECT_TRUE(ref != nullptr); + EXPECT_TRUE(tgt != nullptr); + + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + RandomVec(d, y.data()); + const float* x_data = x.data(); + const float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + + tgt(x_data, y_data, ztgt_data, d); + ref(x_data, y_data, zref_data, d); + ExpectEQ(ztgt_data, zref_data, d); + + // test inplace x + std::copy(x.begin(), x.end(), zref.begin()); + std::copy(x.begin(), x.end(), ztgt.begin()); + tgt(ztgt_data, y_data, ztgt_data, d); + ref(zref_data, y_data, zref_data, d); + ExpectEQ(ztgt_data, zref_data, d); + + // test inplace y + std::copy(y.begin(), y.end(), zref.begin()); + std::copy(y.begin(), y.end(), ztgt.begin()); + tgt(x_data, ztgt_data, ztgt_data, d); + ref(x_data, zref_data, zref_data, d); + ExpectEQ(ztgt_data, zref_data, d); + } +} TEST(JitKernel, pool) {} From d3ca359e445884ffca4b147607607517aad4791b Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 5 Dec 2018 19:30:37 +0800 Subject: [PATCH 132/719] config init & adapt to interface --- paddle/fluid/framework/async_executor.cc | 55 +++++++++++++++++-- paddle/fluid/framework/async_executor.h | 3 +- .../fluid/framework/executor_thread_worker.cc | 44 ++++++++------- .../fluid/framework/executor_thread_worker.h | 15 +++-- 4 files changed, 85 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 94ed8c2fca..292b05c588 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -67,21 +67,63 @@ void PrepareReaders(std::vector>& readers, // NOLINT void AsyncExecutor::ConfigPslib(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index) { _pslib_ptr = std::shared_ptr(new paddle::distributed::PSlib()); - _pslib_ptr->init_and_config(dist_desc, host_sign_list, node_num, index);//TODO + _pslib_ptr->init_and_config(dist_desc, host_sign_list, node_num, index);//TODO done } void AsyncExecutor::StartServer() { + InitParamConfig(); _pslib_ptr->run_server(); } +void AsyncExecutor::InitParamConfig() { + _param_config.fea_dim = _pslib_ptr->get_param()->trainer_param().sparse_table(0).feature_dim(); //TODO + _param_config.slot_dim = _param_config.fea_dim - 2; //TODO + _param_config.tmp_push_dense_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().pull_dense_per_batch()); + _param_config.tmp_push_sparse_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); + //sparse + for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) { + auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); + std::vector tmp_sparse_variable_name; + for (int i = 0u; i < table.slot_value_size(); ++i) { + tmp_sparse_variable_name.push_back(table.slot_value(i)); + _param_config.slot_alias_to_table[table.slot_value(i)] = table.table_id(); + } + std::vector tmp_sparse_gradient_variable_name; + for (auto i = 0u; i < table.slot_gradient_size(); ++i) { + tmp_sparse_gradient_variable_name.push_back( + table.slot_gradient(i)); + } + _param_config.slot_input_vec[table.table_id()] = std::move(tmp_sparse_variable_name); + _param_config.gradient_var[table.table_id()] = std::move(tmp_sparse_gradient_variable_name); + _param_config.sparse_table_id.push_back(table.table_id()); + } + //dense + for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); ++t) { + auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t); + std::vector tmp_dense_variable_name; + for (int i = 0u; i < table.dense_variable_name_size(); ++i) { + tmp_dense_variable_name.push_back(table.dense_variable_name(i)); + } + std::vector tmp_dense_gradient_variable_name; + for (auto i = 0u; i < table.dense_gradient_variable_name_size(); ++i) { + tmp_dense_gradient_variable_name.push_back( + table.dense_gradient_variable_name(i)); + } + _param_config.dense_variable_name[table.table_id()] = std::move(tmp_dense_variable_name); + _param_config.dense_gradient_variable_name[table.table_id()] = std::move(tmp_dense_gradient_variable_name); + _param_config.dense_table_id.push_back(table.table_id()); + _param_config.dense_table_size.push_back(table.fea_dim()); //TODO + } +} + void AsyncExecutor::InitModel() { //TODO only rank = 0 do this - std::vector all_dense_table_id; //TODO - all_dense_table_id.push_back(0); - for (auto table_id: all_dense_table_id) { + //std::vector all_dense_table_id; //TODO + //all_dense_table_id.push_back(0); //done + for (auto table_id: _param_config.dense_table_id) { std::vector regions; - std::vector variables; //TODO - for (auto& t : variables) { + //std::vector variables; //TODO + for (auto& t : _param_config.dense_variable_name[table_id]) { Variable* var = root_scope_->FindVar(t); CHECK(var != nullptr) << "var[" << t << "] not found"; LoDTensor* tensor = var->GetMutable(); @@ -131,6 +173,7 @@ void AsyncExecutor::PrepareDenseThread() { param.training_thread_num = actual_thread_num; param.root_scope = root_scope_; //param.dense_params = &GlobalConfig::instance().dense_variable_name; //TODO + param.dense_params = &_param_config.dense_variable_name; _pull_dense_thread = std::shared_ptr(new DensePullThread(param)); diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 67f4e5deee..21e4a66fce 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -68,7 +68,7 @@ class AsyncExecutor { void StartServer(); void InitModel(); void SaveModel(const std::string& path); - + void InitParamConfig(); private: void CreateThreads(ExecutorThreadWorker* worker, const ProgramDesc& main_program, @@ -86,6 +86,7 @@ class AsyncExecutor { AsyncWorkerParamConfig _param_config; private: int actual_thread_num; + }; diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 19d8818be7..f7c05e400d 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -382,33 +382,38 @@ void AsyncExecutorThreadWorker::BindingSlotVariableMemory() { } */ } -void AsyncExecutorThreadWorker::SetParamConfig(AsyncWorkerParamConfig* pc) { - _param_config = pc; + +void AsyncExecutorThreadWorker::SetParamConfig(AsyncWorkerParamConfig* param_config) { + _param_config = param_config; } void AsyncExecutorThreadWorker::PrepareParams() { - int table_id = 0; //TODO - PullSparse(table_id); - for (auto& t : _pull_sparse_status) { - t.wait(); - auto status = t.get(); - if (status != 0) { - LOG(ERROR) << "pull sparse failed, status[" << status << "]"; - exit(-1); + //int table_id = 0; //TODO + for (auto table_id: _param_config->sparse_table_id) { + PullSparse(table_id); + for (auto& t : _pull_sparse_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(ERROR) << "pull sparse failed, status[" << status << "]"; + exit(-1); + } } } _pull_sparse_status.resize(0); - FillSparse(table_id); + for (auto table_id: _param_config->sparse_table_id) { + FillSparse(table_id); + } } void AsyncExecutorThreadWorker::UpdateParams() { - //for (auto i = 0u; i < GlobalConfig::instance().dense_table_id.size(); ++i) {//TODO - for (int i = 0; i < 1; ++i) { + for (auto i: _param_config->sparse_table_id) {//TODO + //for (int i = 0; i < 1; ++i) { PushSparse(i); } //for (auto i = 0u; i < GlobalConfig::instance().dense_table_id.size(); ++i) {//TODO - for (int i = 1; i < 2; ++i) { + for (auto i: _param_config->dense_table_id) { PushDense(i); } int32_t tmp_push_dense_wait_times = _param_config->tmp_push_dense_wait_times; //TODO @@ -437,14 +442,13 @@ void AsyncExecutorThreadWorker::UpdateParams() { } //for (auto dense_table_id : GlobalConfig::instance().dense_table_id) {//TODO - int dense_table_id = 1; + for (auto dense_table_id: _param_config->dense_table_id) { _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id); + } //} } void AsyncExecutorThreadWorker::PushDense(int table_id) { - //auto table_id = GlobalConfig::instance().dense_table_id[table_id_index]; TODO - std::vector regions; //auto& variables = GlobalConfig::instance().dense_gradient_variable_name[table_id]; std::vector variables; @@ -529,7 +533,7 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) { int64_t* ids = tensor->data(); int len = tensor->numel(); - Variable* var_emb = thread_scope_->FindVar(_param_config->slot_input_vec[slot_idx - 1]); + Variable* var_emb = thread_scope_->FindVar(_param_config->slot_input_vec[table_id][slot_idx - 1]); LoDTensor* tensor_emb = var_emb->GetMutable(); float* ptr = tensor_emb->data(); @@ -575,10 +579,10 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { // slot_idx = 0 is label TODO for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - if (_slot_alias_to_table[feed_vec[slot_idx]] != table_id) { + if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { continue; } - Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[slot_idx - 1]); + Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[table_id][slot_idx - 1]); LoDTensor* g_tensor = g_var->GetMutable(); //int count = g_tensor->numel(); float* g = g_tensor->data(); diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 63f383cd47..4e3255a590 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -40,8 +40,14 @@ struct AsyncWorkerParamConfig { int32_t tmp_push_dense_wait_times; int32_t tmp_push_sparse_wait_times; - std::vector slot_input_vec; //6048slot 6050slot //name - std::vector gradient_var; //6048slot_embed + std::map> dense_variable_name; + std::map> dense_gradient_variable_name; + std::vector dense_table_id; + std::vector dense_table_size; // fea_dim for each dense table + std::vector sparse_table_id; + std::map> slot_input_vec; //6048slot 6050slot //name + std::map> gradient_var; //6048slot_embed + std::unordered_map slot_alias_to_table; //TODO done }; struct DensePullThreadParam { @@ -148,7 +154,7 @@ class ExecutorThreadWorker { virtual void SetPSlibPtr(std::shared_ptr pslib_ptr); virtual void SetPullDenseThread(std::shared_ptr dpt) {}; virtual void BindingSlotVariableMemory() {}; - virtual void SetParamConfig(AsyncWorkerParamConfig* pc) {}; + virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {}; private: void CreateThreadScope(const framework::ProgramDesc& program); void CreateThreadOperators(const framework::ProgramDesc& program); @@ -184,7 +190,7 @@ public: void SetPSlibPtr(std::shared_ptr pslib_ptr); void SetPullDenseThread(std::shared_ptr dpt); void BindingSlotVariableMemory(); - void SetParamConfig(AsyncWorkerParamConfig* pc); + void SetParamConfig(AsyncWorkerParamConfig* param_config); void TrainFiles(); void TrainOneNetwork(); void PrepareParams(); @@ -209,7 +215,6 @@ private: std::map>> _feature_value; std::map>> _feature_push_value; - std::unordered_map _slot_alias_to_table; //TODO std::shared_ptr _pslib_ptr; From 4a93db928834f5dc2e9090c1151774acaae44d1b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 5 Dec 2018 12:07:03 +0000 Subject: [PATCH 133/719] remove jit namespace test=develop --- paddle/fluid/operators/attention_lstm_op.cc | 16 +- .../fused/fused_embedding_fc_lstm_op.cc | 6 +- .../fused/fusion_seqexpand_concat_fc_op.cc | 6 +- paddle/fluid/operators/math/cpu_vec.h | 148 +++++++++--------- paddle/fluid/operators/math/cpu_vec_test.cc | 54 ++++--- paddle/fluid/operators/math/jit_code.cc | 2 +- paddle/fluid/operators/math/jit_code.h | 2 +- paddle/fluid/operators/math/jit_gen.cc | 2 +- paddle/fluid/operators/math/jit_kernel.cc | 2 - .../fluid/operators/math/jit_kernel_blas.cc | 3 +- .../operators/math/jit_kernel_crf_decode.cc | 24 ++- paddle/fluid/operators/math/jit_kernel_exp.cc | 1 - .../operators/math/jit_kernel_layer_norm.cc | 22 ++- .../fluid/operators/math/jit_kernel_macro.h | 37 +++-- .../fluid/operators/math/jit_kernel_test.cc | 2 +- paddle/fluid/platform/cpu_info.cc | 2 - paddle/fluid/platform/cpu_info.h | 3 - paddle/fluid/platform/init.cc | 14 +- 18 files changed, 167 insertions(+), 179 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 9b943440a8..75fc59125f 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -231,10 +231,10 @@ use lstm_x_t as input and compute as standard LSTM. template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { if (bias) { - math::vec_add_bias(n, *bias, x, y); - math::vec_relu(n, y, y); + math::vec_add_bias(n, *bias, x, y); + math::vec_relu(n, y, y); } else { - math::vec_relu(n, x, y); + math::vec_relu(n, x, y); } } @@ -245,8 +245,8 @@ inline void vec_softmax(const int n, const T* x, T* y) { for (int i = 1; i < n; ++i) { scalar = scalar < x[i] ? x[i] : scalar; } - math::vec_add_bias(n, -scalar, x, y); // sub - math::vec_exp(n, y, y); // exp + math::vec_add_bias(n, -scalar, x, y); // sub + math::vec_exp(n, y, y); // exp // sum scalar = T(0); for (int i = 0; i < n; ++i) { @@ -302,13 +302,13 @@ class AttentionLSTMKernel : public framework::OpKernel { auto& act_gate_str = ctx.Attr("gate_activation"); auto& act_cell_str = ctx.Attr("cell_activation"); auto& act_cand_str = ctx.Attr("candidate_activation"); - if (platform::jit::MayIUse(platform::jit::avx)) { - math::VecActivations act_functor; + if (platform::MayIUse(platform::avx)) { + math::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); } else { - math::VecActivations act_functor; + math::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 6d463538d2..1eb6523a2d 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -217,13 +217,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { auto& act_gate_str = ctx.Attr("gate_activation"); \ auto& act_cell_str = ctx.Attr("cell_activation"); \ auto& act_cand_str = ctx.Attr("candidate_activation"); \ - if (platform::jit::MayIUse(platform::jit::avx)) { \ - math::VecActivations act_functor; \ + if (platform::MayIUse(platform::avx)) { \ + math::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ } else { \ - math::VecActivations act_functor; \ + math::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 288b56fc24..17ed9771d0 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -151,11 +151,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { std::function fc_act; auto& fc_act_str = ctx.Attr("fc_activation"); - if (platform::jit::MayIUse(platform::jit::avx)) { - math::VecActivations act_functor; + if (platform::MayIUse(platform::avx)) { + math::VecActivations act_functor; fc_act = act_functor(fc_act_str); } else { - math::VecActivations act_functor; + math::VecActivations act_functor; fc_act = act_functor(fc_act_str); } diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 7d81aee596..e1e4d168db 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -77,7 +77,7 @@ inline void vec_scal(const int n, const double a, double* x) { #endif // MKL scal only support inplace, choose this if src and dst are not equal -template +template inline void vec_scal(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = a * x[i]; @@ -85,12 +85,12 @@ inline void vec_scal(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); return; } const int rest = n % block; @@ -114,24 +114,24 @@ inline void vec_scal(const int n, const float a, y[i] = a * x[i]; } #else - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); #endif } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { - vec_scal(n, a, x, y); +inline void vec_scal(const int n, const float a, + const float* x, float* y) { + vec_scal(n, a, x, y); } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); } -template +template inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = a - x[i]; @@ -139,12 +139,12 @@ inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_bias_sub(const int n, const float a, - const float* x, float* y) { +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); return; } const int rest = n % block; @@ -168,27 +168,25 @@ inline void vec_bias_sub(const int n, const float a, y[i] = a - x[i]; } #else - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); #endif } template <> -inline void vec_bias_sub(const int n, const float a, - const float* x, float* y) { - vec_bias_sub(n, a, x, y); +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { + vec_bias_sub(n, a, x, y); } template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); } // out = x*y + (1-x)*z -template +template inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { for (int i = 0; i < n; ++i) { out[i] = x[i] * y[i] + (static_cast(1) - x[i]) * z[i]; @@ -196,13 +194,13 @@ inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { } template <> -inline void vec_cross(const int n, const float* x, - const float* y, const float* z, - float* out) { +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); return; } const int rest = n % block; @@ -228,25 +226,26 @@ inline void vec_cross(const int n, const float* x, out[i] = x[i] * y[i] + (1.f - x[i]) * z[i]; } #else - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); #endif } template <> -inline void vec_cross(const int n, const float* x, - const float* y, - const float* z, float* out) { - vec_cross(n, x, y, z, out); +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { + vec_cross(n, x, y, z, out); } template <> -inline void vec_cross( - const int n, const float* x, const float* y, const float* z, float* out) { +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { // TODO(TJ): enable me - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); } -template +template inline void vec_add_bias(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] + a; @@ -254,12 +253,12 @@ inline void vec_add_bias(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_add_bias(const int n, const float a, - const float* x, float* y) { +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); return; } const int rest = n % block; @@ -283,32 +282,30 @@ inline void vec_add_bias(const int n, const float a, y[i] = x[i] + a; } #else - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); #endif } template <> -inline void vec_add_bias(const int n, const float a, - const float* x, float* y) { - vec_add_bias(n, a, x, y); +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { + vec_add_bias(n, a, x, y); } template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); } -template +template inline void vec_identity(const int n, const T* x, T* y) { // do nothing return; } -template +template inline void vec_sigmoid(const int n, const T* x, T* y) { const T min = SIGMOID_THRESHOLD_MIN; const T max = SIGMOID_THRESHOLD_MAX; @@ -323,12 +320,12 @@ inline void vec_sigmoid(const int n, const T* x, T* y) { } template <> -inline void vec_sigmoid(const int n, const float* x, - float* y) { +inline void vec_sigmoid(const int n, const float* x, + float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); return; } const int rest = n % block; @@ -377,25 +374,24 @@ inline void vec_sigmoid(const int n, const float* x, y[i] = 1.f / (1.f + y[i]); } #else - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); #endif } template <> -inline void vec_sigmoid(const int n, const float* x, - float* y) { - vec_sigmoid(n, x, y); +inline void vec_sigmoid(const int n, const float* x, + float* y) { + vec_sigmoid(n, x, y); } template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { +inline void vec_sigmoid(const int n, const float* x, + float* y) { // TODO(TJ): enable me - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); } -template +template inline void vec_tanh(const int n, const T* x, T* y) { vec_scal(n, static_cast(2), x, y); vec_sigmoid(n, y, y); @@ -404,7 +400,7 @@ inline void vec_tanh(const int n, const T* x, T* y) { } // TODO(TJ): make relu clip -template +template inline void vec_relu(const int n, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] > 0 ? x[i] : 0; @@ -412,12 +408,12 @@ inline void vec_relu(const int n, const T* x, T* y) { } template <> -inline void vec_relu(const int n, const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block * 4) { - vec_relu(n, x, y); + vec_relu(n, x, y); return; } @@ -441,26 +437,26 @@ inline void vec_relu(const int n, const float* x, #undef MOVE_ONE_STEP #else - vec_relu(n, x, y); + vec_relu(n, x, y); #endif } template <> -inline void vec_relu(const int n, const float* x, - float* y) { - vec_relu(n, x, y); +inline void vec_relu(const int n, const float* x, + float* y) { + vec_relu(n, x, y); } template <> -inline void vec_relu(const int n, const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { // TODO(TJ): enable me - vec_relu(n, x, y); + vec_relu(n, x, y); } // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary -template +template class VecActivations { public: std::function operator()( diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index c37fa291a2..28eb9cadc9 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -104,38 +104,42 @@ void TestAndBench(const int n, std::function tgt, } TEST(CpuVecTest, sigmoid) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, + TestAndBench(sz, vec_sigmoid, + ref_sigmoid); + TestAndBench(sz, vec_sigmoid, + ref_sigmoid); + TestAndBench(sz, vec_sigmoid, ref_sigmoid); } TestAndBench(30, vec_sigmoid, ref_sigmoid); } TEST(CpuVecTest, tanh) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, + ref_tanh); } TestAndBench(30, vec_tanh, ref_tanh); } TEST(CpuVecTest, relu) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, + ref_relu); } TestAndBench(30, vec_relu, ref_relu); } @@ -162,38 +166,40 @@ void TestInplace(const int n, std::function tgt, } TEST(CpuVecTest, inplace_sigmoid) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, + TestInplace(sz, vec_sigmoid, + ref_sigmoid); + TestInplace(sz, vec_sigmoid, + ref_sigmoid); + TestInplace(sz, vec_sigmoid, ref_sigmoid); } TestInplace(30, vec_sigmoid, ref_sigmoid); } TEST(CpuVecTest, inplace_tanh) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); } TestInplace(30, vec_tanh, ref_tanh); } TEST(CpuVecTest, inplace_relu) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); } TestInplace(30, vec_relu, ref_relu); } diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 52cbdf685d..78d0c3e880 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -22,7 +22,7 @@ namespace math { namespace jitkernel { namespace gen { -using namespace platform::jit; // NOLINT +using namespace platform; // NOLINT bool VXXJitCode::init(int d, int scalar_index) { // It's not necessary to use avx512 since it would slow down the frequency diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index a921462129..e2b4761435 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -179,7 +179,7 @@ class VActJitCode : public JitCode { template void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) { - using namespace platform::jit; // NOLINT + using namespace platform; // NOLINT // check all idx can not equal JMM jmm_src = JMM(src_idx); JMM jmm_fx = JMM(fx_idx); diff --git a/paddle/fluid/operators/math/jit_gen.cc b/paddle/fluid/operators/math/jit_gen.cc index 6af39518ed..5c6672928e 100644 --- a/paddle/fluid/operators/math/jit_gen.cc +++ b/paddle/fluid/operators/math/jit_gen.cc @@ -36,7 +36,7 @@ void JitCode::preCode() { for (int i = 0; i < num_g_abi_regs; ++i) { push(Xbyak::Reg64(g_abi_regs[i])); } - if (platform::jit::MayIUse(platform::jit::avx512f)) { + if (platform::MayIUse(platform::avx512f)) { mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); } } diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc index 68b708b345..118696ba47 100644 --- a/paddle/fluid/operators/math/jit_kernel.cc +++ b/paddle/fluid/operators/math/jit_kernel.cc @@ -21,8 +21,6 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - KernelPool& KernelPool::Instance() { static thread_local KernelPool g_jit_kernels; return g_jit_kernels; diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index a0f93fd8e7..8cf588efba 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -30,7 +30,6 @@ namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; #ifdef PADDLE_WITH_MKLML template @@ -125,7 +124,7 @@ bool VMulKernelImpl::useJIT(int d) { #ifdef PADDLE_WITH_MKLML template <> bool VMulKernelImpl::useMKL(int d) { - return jit::MayIUse(jit::avx512f) && d > 512; + return platform::MayIUse(platform::avx512f) && d > 512; } template <> diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc index 4d26b81948..eeb305a88b 100644 --- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc +++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc @@ -25,10 +25,8 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - /* CRF Decode JitKernel */ -template +template class CRFDecodeKernelImpl : public CRFDecodeKernel { public: explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel() { @@ -101,7 +99,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { #define INTRIAVX_FLOAT(block) \ template <> \ - CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ + CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ int tag_num) \ : CRFDecodeKernel() { \ this->num_ = tag_num; \ @@ -109,7 +107,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \ } \ template <> \ - void CRFDecodeKernelImpl::Compute( \ + void CRFDecodeKernelImpl::Compute( \ const int seq_len, const float* x, const float* w, float* alpha, \ int* track) const { \ INIT_ALPHA(YMM_FLOAT_BLOCK) \ @@ -204,7 +202,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { #define INTRIAVX512_FLOAT(block) \ template <> \ - CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ + CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ int tag_num) \ : CRFDecodeKernel() { \ this->num_ = tag_num; \ @@ -212,7 +210,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { this->rest_ = this->num_ % ZMM_FLOAT_BLOCK; \ } \ template <> \ - void CRFDecodeKernelImpl::Compute( \ + void CRFDecodeKernelImpl::Compute( \ const int seq_len, const float* x, const float* w, float* alpha, \ int* track) const { \ INIT_ALPHA(ZMM_FLOAT_BLOCK) \ @@ -270,14 +268,14 @@ INTRIAVX_FLOAT(kEQ16); INTRIAVX_FLOAT(kGT16); #endif #ifdef __AVX2__ -INTRIAVX2_FLOAT(jit::avx2, kEQ8); -INTRIAVX2_FLOAT(jit::avx2, kGT8LT16); -INTRIAVX2_FLOAT(jit::avx2, kEQ16); -INTRIAVX2_FLOAT(jit::avx2, kGT16); +INTRIAVX2_FLOAT(platform::avx2, kEQ8); +INTRIAVX2_FLOAT(platform::avx2, kGT8LT16); +INTRIAVX2_FLOAT(platform::avx2, kEQ16); +INTRIAVX2_FLOAT(platform::avx2, kGT16); #endif #ifdef __AVX512F__ -INTRIAVX2_FLOAT(jit::avx512f, kEQ8); -INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16); +INTRIAVX2_FLOAT(platform::avx512f, kEQ8); +INTRIAVX2_FLOAT(platform::avx512f, kGT8LT16); INTRIAVX512_FLOAT(kEQ16); INTRIAVX512_FLOAT(kGT16); #endif diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index 686f3dd983..7945cfb253 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -29,7 +29,6 @@ namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; #ifdef PADDLE_WITH_MKLML // try to use MKL to speedup diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc index 49904e6e8c..fead13ebad 100644 --- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc +++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc @@ -22,10 +22,8 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - /* Layer Norm JitKernel */ -template +template class LayerNormKernelImpl : public LayerNormKernel { public: explicit LayerNormKernelImpl(int right) : LayerNormKernel() { @@ -90,7 +88,7 @@ class LayerNormKernelImpl : public LayerNormKernel { this->end_ = this->num_ - this->rest_; \ } \ template <> \ - void LayerNormKernelImpl::Compute( \ + void LayerNormKernelImpl::Compute( \ float* x, float* out, float* mean, float* var, const float* scale, \ const float* bias, int height, const float epsilon) const { \ __m256 sum; \ @@ -219,16 +217,16 @@ class LayerNormKernelImpl : public LayerNormKernel { } #ifdef __AVX__ -INTRIAVX_FLOAT(jit::avx, kEQ8); -INTRIAVX_FLOAT(jit::avx, kGT8LT16); -INTRIAVX_FLOAT(jit::avx, kEQ16); -INTRIAVX_FLOAT(jit::avx, kGT16); +INTRIAVX_FLOAT(platform::avx, kEQ8); +INTRIAVX_FLOAT(platform::avx, kGT8LT16); +INTRIAVX_FLOAT(platform::avx, kEQ16); +INTRIAVX_FLOAT(platform::avx, kGT16); #endif #ifdef __AVX2__ -INTRIAVX_FLOAT(jit::avx2, kEQ8); -INTRIAVX_FLOAT(jit::avx2, kGT8LT16); -INTRIAVX_FLOAT(jit::avx2, kEQ16); -INTRIAVX_FLOAT(jit::avx2, kGT16); +INTRIAVX_FLOAT(platform::avx2, kEQ8); +INTRIAVX_FLOAT(platform::avx2, kGT8LT16); +INTRIAVX_FLOAT(platform::avx2, kEQ16); +INTRIAVX_FLOAT(platform::avx2, kGT16); #endif #undef INTRIAVX_FLOAT diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h index 5a3efd979f..4dba3b5681 100644 --- a/paddle/fluid/operators/math/jit_kernel_macro.h +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -92,7 +92,6 @@ namespace jitkernel { JITKERNEL_DECLARE, JITKERNEL_FIND_KEY, \ JITKERNEL_IMPL) -namespace jit = platform::jit; // TODO(TJ): below defines are deprecated, would be remove recently #define SEARCH_BLOCK(macro_, ker, dtype, isa) \ if (d < YMM_FLOAT_BLOCK) { \ @@ -107,15 +106,15 @@ namespace jit = platform::jit; macro_(ker, dtype, isa, kGT16); \ } -#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ - if (jit::MayIUse(jit::avx512f)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \ - } else if (jit::MayIUse(jit::avx2)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx2); \ - } else if (jit::MayIUse(jit::avx)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx); \ - } else { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \ +#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ + if (platform::MayIUse(platform::avx512f)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx512f); \ + } else if (platform::MayIUse(platform::avx2)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx2); \ + } else if (platform::MayIUse(platform::avx)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx); \ + } else { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::isa_any); \ } #define JITKERNEL_KEY(ker_key, dtype_key) \ @@ -156,10 +155,10 @@ namespace jit = platform::jit; marco_declare, macro_key, macro_impl) #define FOR_EACH_ISA(macro_, block) \ - macro_(jit::avx512f, block); \ - macro_(jit::avx2, block); \ - macro_(jit::avx, block); \ - macro_(jit::isa_any, block) + macro_(platform::avx512f, block); \ + macro_(platform::avx2, block); \ + macro_(platform::avx, block); \ + macro_(platform::isa_any, block) #define FOR_EACH_BLOCK(macro_, isa) \ macro_(isa, kLT8); \ @@ -168,11 +167,11 @@ namespace jit = platform::jit; macro_(isa, kEQ16); \ macro_(isa, kGT16) -#define FOR_EACH_ISA_BLOCK(macro_) \ - FOR_EACH_BLOCK(macro_, jit::avx512f); \ - FOR_EACH_BLOCK(macro_, jit::avx2); \ - FOR_EACH_BLOCK(macro_, jit::avx); \ - FOR_EACH_BLOCK(macro_, jit::isa_any) +#define FOR_EACH_ISA_BLOCK(macro_) \ + FOR_EACH_BLOCK(macro_, platform::avx512f); \ + FOR_EACH_BLOCK(macro_, platform::avx2); \ + FOR_EACH_BLOCK(macro_, platform::avx); \ + FOR_EACH_BLOCK(macro_, platform::isa_any) } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index ed86a47e15..19f7bd8909 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -705,7 +705,7 @@ TEST(JitKernel, pool) { jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false); // empty call it to avoid unknown flag 'use_pinned_memory' on Mac - paddle::platform::jit::MayIUse(paddle::platform::jit::avx); + paddle::platform::MayIUse(paddle::platform::avx); const auto& plstm1 = jit::KernelPool::Instance() .template Get, const jit::lstm_attr_t&>(attr); diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index d466f28d1e..f9a32bfa4c 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -123,7 +123,6 @@ size_t CUDAPinnedMaxChunkSize() { return CUDAPinnedMaxAllocSize() / 256; } -namespace jit { #ifdef PADDLE_WITH_XBYAK static Xbyak::util::Cpu cpu; bool MayIUse(const cpu_isa_t cpu_isa) { @@ -165,6 +164,5 @@ bool MayIUse(const cpu_isa_t cpu_isa) { } #endif -} // namespace jit } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index fd31ef77b4..55dba545ff 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -39,7 +39,6 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); -namespace jit { typedef enum { isa_any, sse42, @@ -55,7 +54,5 @@ typedef enum { // May I use some instruction bool MayIUse(const cpu_isa_t cpu_isa); -} // namespace jit - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 51b46450e4..0d10d82d74 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__) - if (platform::jit::MayIUse(platform::jit::avx)) { + if (platform::MayIUse(platform::avx)) { #ifndef __AVX__ LOG(WARNING) << "AVX is available, Please re-compile on local machine"; #endif @@ -131,10 +131,10 @@ void InitDevices(bool init_p2p, const std::vector devices) { " version or compile from source code." #ifdef __AVX512F__ - if (!platform::jit::MayIUse(platform::jit::avx512f)) { - if (platform::jit::MayIUse(platform::jit::avx2)) { + if (!platform::MayIUse(platform::avx512f)) { + if (platform::MayIUse(platform::avx2)) { AVX_GUIDE(AVX512, AVX2); - } else if (platform::jit::MayIUse(platform::jit::avx)) { + } else if (platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX512, AVX); } else { AVX_GUIDE(AVX512, NonAVX); @@ -143,8 +143,8 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #ifdef __AVX2__ - if (!platform::jit::MayIUse(platform::jit::avx2)) { - if (platform::jit::MayIUse(platform::jit::avx)) { + if (!platform::MayIUse(platform::avx2)) { + if (platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX2, AVX); } else { AVX_GUIDE(AVX2, NonAVX); @@ -153,7 +153,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #ifdef __AVX__ - if (!platform::jit::MayIUse(platform::jit::avx)) { + if (!platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX, NonAVX); } #endif From 191948c933a15c41315228ffbdc70eb59fdb8f55 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 5 Dec 2018 07:15:08 +0000 Subject: [PATCH 134/719] enable jitcode --- .../fluid/operators/jitkernels/CMakeLists.txt | 2 +- .../jitkernels/jitcode/CMakeLists.txt | 4 +- .../operators/jitkernels/jitcode/blas.cc | 118 ++++++++++++++++++ .../fluid/operators/jitkernels/jitcode/blas.h | 88 +++++++++++++ .../operators/jitkernels/jitcode/jitcode.h | 95 ++++++++++++-- .../operators/jitkernels/jitcode_base.cc | 5 +- .../fluid/operators/jitkernels/jitcode_base.h | 19 +-- paddle/fluid/operators/jitkernels/kernels.h | 59 ++++----- paddle/fluid/platform/cpu_info.h | 2 +- 9 files changed, 342 insertions(+), 50 deletions(-) create mode 100644 paddle/fluid/operators/jitkernels/jitcode/blas.cc create mode 100644 paddle/fluid/operators/jitkernels/jitcode/blas.h diff --git a/paddle/fluid/operators/jitkernels/CMakeLists.txt b/paddle/fluid/operators/jitkernels/CMakeLists.txt index 6392d82e16..e82e6c3026 100644 --- a/paddle/fluid/operators/jitkernels/CMakeLists.txt +++ b/paddle/fluid/operators/jitkernels/CMakeLists.txt @@ -7,7 +7,7 @@ set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) -cc_library(jit_kernel_base SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS}) +cc_library(jit_kernel_base SRCS kernels.cc jitcode_base.cc DEPS ${JIT_KERNEL_DEPS}) add_subdirectory(refer) add_subdirectory(more) diff --git a/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt b/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt index 1a5e457309..c678ea33b8 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt +++ b/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt @@ -1,3 +1,5 @@ -cc_library(jit_kernel_jitcode SRCS jitcode.cc DEPS jit_kernel_base xbyak) +file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") + +cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE) diff --git a/paddle/fluid/operators/jitkernels/jitcode/blas.cc b/paddle/fluid/operators/jitkernels/jitcode/blas.cc new file mode 100644 index 0000000000..2691bee0fd --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode/blas.cc @@ -0,0 +1,118 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ +#include "paddle/fluid/operators/jitkernels/jitcode/blas.h" +#include "paddle/fluid/operators/jitkernels/registry.h" + +namespace paddle { +namespace operators { +namespace jitkernels { +namespace jitcode { + +void VXXJitCode::genCode() { + // do not need push stack, and do not need save avx512reg if do not use avx512 + int offset = 0; + if (with_relu_) { + vxorps(ymm_zero, ymm_zero, ymm_zero); + } + if (scalar_index_ == 1) { + vbroadcastss(ymm_src1, ptr[param1]); + } else if (scalar_index_ == 2) { + vbroadcastss(ymm_src2, ptr[param2]); + } + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + if (scalar_index_ != 1) { + vmovups(ymm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovups(ymm_src2, ptr[param2 + offset]); + } + if (type_ == operand_type::mul) { + vmulps(ymm_dst, ymm_src1, ymm_src2); + } else if (type_ == operand_type::add) { + vaddps(ymm_dst, ymm_src1, ymm_src2); + } + if (with_relu_) { + vmaxps(ymm_dst, ymm_zero, ymm_dst); + } + vmovups(ptr[param3 + offset], ymm_dst); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + int rest = num_ % YMM_FLOAT_BLOCK; + while (rest > 0) { + int block = XMM_FLOAT_BLOCK; + if (rest >= 4) { + block = 4; + if (scalar_index_ != 1) { + vmovups(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovups(xmm_src2, ptr[param2 + offset]); + } + } else if (rest >= 2) { + block = 2; + if (scalar_index_ != 1) { + vmovq(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovq(xmm_src2, ptr[param2 + offset]); + } + } else { + block = 1; + if (scalar_index_ != 1) { + vmovss(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovss(xmm_src2, ptr[param2 + offset]); + } + } + switch (type_) { + case operand_type::mul: + vmulps(xmm_dst, xmm_src1, xmm_src2); + break; + case operand_type::add: + vaddps(xmm_dst, xmm_src1, xmm_src2); + break; + default: + break; + } + if (with_relu_) { + vmaxps(xmm_dst, xmm_zero, xmm_dst); + } + if (rest >= 4) { + vmovups(ptr[param3 + offset], xmm_dst); + } else if (rest >= 2) { + vmovq(ptr[param3 + offset], xmm_dst); + } else { + vmovss(ptr[param3 + offset], xmm_dst); + } + offset += sizeof(float) * block; + rest -= block; + } + ret(); +} + +} // namespace jitcode + +template <> +std::unique_ptr CreateJitCode(int attr) { + if (UseJitCode(attr)) { + return make_unique( + attr, CodeSize(attr)); + } + return nullptr; +} + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode/blas.h b/paddle/fluid/operators/jitkernels/jitcode/blas.h new file mode 100644 index 0000000000..a1aca97723 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode/blas.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h" + +namespace paddle { +namespace operators { +namespace jitkernels { +namespace jitcode { + +// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) +class VXXJitCode : public JitCode { + public: + const char* name() const override { + std::string base = "VXXJitCode"; + if (scalar_index_ == 1) { + base += "_Scalar"; + } else { + base += "_Vec"; + } + if (type_ == operand_type::mul) { + base += "_Mul"; + } else if (type_ == operand_type::add) { + base += "_Add"; + } + if (scalar_index_ == 2) { + base += "_Scalar"; + } else { + base += "_Vec"; + } + base += (with_relu_ ? "_Relu" : ""); + return base.c_str(); + } + explicit VXXJitCode(int d, operand_type type, int scalar_index, + bool with_relu, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), + num_(d), + type_(type), + scalar_index_(scalar_index), + with_relu_(with_relu) {} + // static bool init(int d, int scalar_index = 0); + void genCode() override; + + private: + int num_; + operand_type type_; + int scalar_index_; + bool with_relu_; + reg64_t param1{abi_param1}; + reg64_t param2{abi_param2}; + reg64_t param3{abi_param3}; + + xmm_t xmm_src1 = xmm_t(0); + xmm_t xmm_src2 = xmm_t(1); + xmm_t xmm_dst = xmm_t(2); + xmm_t xmm_zero = xmm_t(3); + + ymm_t ymm_src1 = ymm_t(0); + ymm_t ymm_src2 = ymm_t(1); + ymm_t ymm_dst = ymm_t(2); + ymm_t ymm_zero = ymm_t(3); +}; + +class VMulJitCode : public VXXJitCode { + public: + explicit VMulJitCode(int d, size_t code_size, void* code_ptr = nullptr) + : VXXJitCode(d, operand_type::mul, 0, false, code_size, code_ptr) {} +}; + +} // namespace jitcode +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h index 7e0b6442ed..a3582e5284 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h +++ b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h @@ -16,7 +16,7 @@ #include #include "paddle/fluid/operators/jitkernels/jitcode_base.h" -#include "paddle/fluid/operators/jitkernels/kernels.h" +#include "paddle/fluid/platform/cpu_info.h" #define XBYAK_USE_MMAP_ALLOCATOR #include "xbyak/xbyak.h" @@ -30,23 +30,102 @@ namespace jitcode { // Application Binary Interface constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI), abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX), - abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX); + abi_param4(Xbyak::Operand::RCX); -template -class VMulJitCode : public JitBase, public Xbyak::CodeGenerator { +constexpr Xbyak::Operand::Code g_abi_regs[] = { + Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12, + Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15}; + +constexpr int num_g_abi_regs = sizeof(g_abi_regs) / sizeof(g_abi_regs[0]); + +using reg64_t = const Xbyak::Reg64; +using reg32_t = const Xbyak::Reg32; +using xmm_t = const Xbyak::Xmm; +using ymm_t = const Xbyak::Ymm; +using zmm_t = const Xbyak::Zmm; +using Label = Xbyak::Label; + +typedef enum { + mul = 0, + add, + sub, + relu, + exp, + sigmoid, + tanh, + identity +} operand_type; + +#define XMM_FLOAT_BLOCK 4 +#define YMM_FLOAT_BLOCK 8 +#define ZMM_FLOAT_BLOCK 16 + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +#define DECLARE_JIT_CODE(codename) \ + const char* name() const override { return #codename; } + +class JitCode : public JitBase, public Xbyak::CodeGenerator { public: - VMulJitCode(Attr attr, size_t code_size, void* code_ptr = nullptr) + explicit JitCode(size_t code_size, void* code_ptr = nullptr) : Xbyak::CodeGenerator(code_size, code_ptr) { this->genCode(); } - virtual const char* name() const = 0; - virtual void genCode() = 0; - + size_t getSize() const override { return CodeGenerator::getSize(); } const unsigned char* getCodeInternal() override { const Xbyak::uint8* code = CodeGenerator::getCode(); return code; } + + virtual const char* name() const = 0; + virtual void genCode() = 0; + + protected: + Xbyak::Reg64 param1{abi_param1}; + const int EVEX_max_8b_offt = 0x200; + const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp; + + virtual void preCode() { + for (int i = 0; i < num_g_abi_regs; ++i) { + push(Xbyak::Reg64(g_abi_regs[i])); + } + if (platform::jit::MayIUse(platform::jit::avx512f)) { + mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); + } + } + virtual void postCode() { + for (int i = 0; i < num_g_abi_regs; ++i) { + pop(Xbyak::Reg64(g_abi_regs[num_g_abi_regs - 1 - i])); + } + ret(); + } + void L(const char* label) { Xbyak::CodeGenerator::L(label); } + void L(const Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } + // Enhanced vector extension + Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt, + bool bcast = false) { + int scale = 0; + // Learn from https://github.com/intel/mkl-dnn + if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) { + offt = offt - 2 * EVEX_max_8b_offt; + scale = 1; + } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) { + offt = offt - 4 * EVEX_max_8b_offt; + scale = 2; + } + auto re = Xbyak::RegExp() + base + offt; + if (scale) { + re = re + reg_EVEX_max_8b_offt * scale; + } + if (bcast) { + return zword_b[re]; + } else { + return zword[re]; + } + } }; } // namespace jitcode diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.cc b/paddle/fluid/operators/jitkernels/jitcode_base.cc index 417c4d4b9e..1da2af51f4 100644 --- a/paddle/fluid/operators/jitkernels/jitcode_base.cc +++ b/paddle/fluid/operators/jitkernels/jitcode_base.cc @@ -13,6 +13,9 @@ * limitations under the License. */ #include "paddle/fluid/operators/jitkernels/jitcode_base.h" +#include +#include +#include DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); @@ -29,7 +32,7 @@ void JitBase::dumpCode(const unsigned char* code) const { counter++; std::ofstream fout(filename.str(), std::ios::out); if (fout.is_open()) { - fout.write(reinterpret_cast(code), getSize()); + fout.write(reinterpret_cast(code), this->getSize()); fout.close(); } } diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.h b/paddle/fluid/operators/jitkernels/jitcode_base.h index a164746561..ffec62163a 100644 --- a/paddle/fluid/operators/jitkernels/jitcode_base.h +++ b/paddle/fluid/operators/jitkernels/jitcode_base.h @@ -28,7 +28,7 @@ namespace jitkernels { // TODO(TJ): make these functions as virtual of a class // Every JitCode should estimate the code size itself -template +template size_t CodeSize(Attr attr) { return 4096; } @@ -43,13 +43,11 @@ bool UseJitCode(Attr attr) { template size_t GetKey(Attr attr); -class JitBase { +class JitBase : public Kernel { public: - JitBase() = default; - virtual ~JitBase() = default; virtual const char* name() const = 0; virtual const unsigned char* getCodeInternal() = 0; - + virtual size_t getSize() const = 0; template const FUNC getCode() { const unsigned char* code = this->getCodeInternal(); @@ -58,14 +56,17 @@ class JitBase { } return reinterpret_cast(code); } - DISABLE_COPY_AND_ASSIGN(JitBase); protected: - void dumpCode(const unsigned char* code); + void dumpCode(const unsigned char* code) const; }; -template -std::shared_ptr CreateJitCode(Attr attr); +template +std::unique_ptr CreateJitCode(Attr attr); //{ +// if (UseJitCode) { +// return make_unique(attr, CodeSize()); +// } +// } } // namespace jitkernels } // namespace operators diff --git a/paddle/fluid/operators/jitkernels/kernels.h b/paddle/fluid/operators/jitkernels/kernels.h index 866f72cce0..f398093dfe 100644 --- a/paddle/fluid/operators/jitkernels/kernels.h +++ b/paddle/fluid/operators/jitkernels/kernels.h @@ -31,6 +31,9 @@ namespace jitkernels { template class JitCodePool { + typedef std::unique_ptr JitBasePtr; + typedef std::unordered_map JitBaseMap; + public: JitCodePool() = default; static JitCodePool& Instance() { @@ -38,29 +41,26 @@ class JitCodePool { return g_jit_codes; } - std::shared_ptr Get(size_t key) const { - if (codes_.find(key) == codes_.end()) { - return nullptr; - } - return codes_.at(key); - } + const JitBaseMap& AllKernels() { return codes_; } + + bool Has(size_t key) const { return codes_.find(key) != codes_.end(); } - void Insert(size_t key, const std::shared_ptr& value) { - codes_.insert({key, value}); + void Insert(size_t key, JitBasePtr value) { + codes_.emplace(key, std::move(value)); } private: - std::unordered_map> codes_; + JitBaseMap codes_; DISABLE_COPY_AND_ASSIGN(JitCodePool); }; // TODO(TJ): std::tuple -template -struct KernelAttr { - typedef T data_type; - typedef Func return_type; - typedef Attr attr_type; -}; +// template +// struct KernelAttr { +// typedef T data_type; +// typedef Func return_type; +// typedef Attr attr_type; +// }; typedef std::unique_ptr KernelPtr; typedef std::unordered_map, KernelKey::Hash> @@ -123,20 +123,21 @@ inline Func GetRefer() { // TODO(TJ): make tuple? named KernelAttr template -Func Get(Attr attr) { - // size_t key = GetKey(attr); - // auto jitcode = JitCodePool().Instance().Get(key); - // if (jitcode) { - // return jitcode->template getCode(); - // } - - if (std::is_same::value && - std::is_same::value) { // TODO(TJ): float move to create - // auto p = CreateJitCode(attr); - // if (p) { - // JitCodePool().Instance().Insert(key, p); - // return p->template getCode(); - // } +const Func Get(Attr attr) { + size_t key = GetKey(attr); + auto& codes = JitCodePool().Instance(); + if (codes.Has(key)) { + return codes.AllKernels().at(key)->template getCode(); + } + + if (std::is_same::value) { // TODO(TJ): float + // move to create + auto p = CreateJitCode(attr); + if (p) { + auto f = p->template getCode(); + codes.Insert(key, std::move(p)); + return f; + } } // pool: (KernelKey(type, place), vector) diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 663a9fbf4e..fd31ef77b4 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -39,7 +39,7 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); -namespace jit { // remove this namespace +namespace jit { typedef enum { isa_any, sse42, From b523787f9ffb0a97b1dda1c445f04e7490b7c4f5 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 5 Dec 2018 12:07:03 +0000 Subject: [PATCH 135/719] remove jit namespace test=develop --- paddle/fluid/operators/attention_lstm_op.cc | 16 +- .../fused/fused_embedding_fc_lstm_op.cc | 6 +- .../fused/fusion_seqexpand_concat_fc_op.cc | 6 +- paddle/fluid/operators/math/cpu_vec.h | 148 +++++++++--------- paddle/fluid/operators/math/cpu_vec_test.cc | 54 ++++--- paddle/fluid/operators/math/jit_code.cc | 2 +- paddle/fluid/operators/math/jit_code.h | 2 +- paddle/fluid/operators/math/jit_gen.cc | 2 +- paddle/fluid/operators/math/jit_kernel.cc | 2 - .../fluid/operators/math/jit_kernel_blas.cc | 3 +- .../operators/math/jit_kernel_crf_decode.cc | 24 ++- paddle/fluid/operators/math/jit_kernel_exp.cc | 1 - .../operators/math/jit_kernel_layer_norm.cc | 22 ++- .../fluid/operators/math/jit_kernel_macro.h | 37 +++-- .../fluid/operators/math/jit_kernel_test.cc | 2 +- paddle/fluid/platform/cpu_info.cc | 2 - paddle/fluid/platform/cpu_info.h | 3 - paddle/fluid/platform/init.cc | 14 +- 18 files changed, 167 insertions(+), 179 deletions(-) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 9b943440a8..75fc59125f 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -231,10 +231,10 @@ use lstm_x_t as input and compute as standard LSTM. template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { if (bias) { - math::vec_add_bias(n, *bias, x, y); - math::vec_relu(n, y, y); + math::vec_add_bias(n, *bias, x, y); + math::vec_relu(n, y, y); } else { - math::vec_relu(n, x, y); + math::vec_relu(n, x, y); } } @@ -245,8 +245,8 @@ inline void vec_softmax(const int n, const T* x, T* y) { for (int i = 1; i < n; ++i) { scalar = scalar < x[i] ? x[i] : scalar; } - math::vec_add_bias(n, -scalar, x, y); // sub - math::vec_exp(n, y, y); // exp + math::vec_add_bias(n, -scalar, x, y); // sub + math::vec_exp(n, y, y); // exp // sum scalar = T(0); for (int i = 0; i < n; ++i) { @@ -302,13 +302,13 @@ class AttentionLSTMKernel : public framework::OpKernel { auto& act_gate_str = ctx.Attr("gate_activation"); auto& act_cell_str = ctx.Attr("cell_activation"); auto& act_cand_str = ctx.Attr("candidate_activation"); - if (platform::jit::MayIUse(platform::jit::avx)) { - math::VecActivations act_functor; + if (platform::MayIUse(platform::avx)) { + math::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); } else { - math::VecActivations act_functor; + math::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 6d463538d2..1eb6523a2d 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -217,13 +217,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { auto& act_gate_str = ctx.Attr("gate_activation"); \ auto& act_cell_str = ctx.Attr("cell_activation"); \ auto& act_cand_str = ctx.Attr("candidate_activation"); \ - if (platform::jit::MayIUse(platform::jit::avx)) { \ - math::VecActivations act_functor; \ + if (platform::MayIUse(platform::avx)) { \ + math::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ } else { \ - math::VecActivations act_functor; \ + math::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 288b56fc24..17ed9771d0 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -151,11 +151,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { std::function fc_act; auto& fc_act_str = ctx.Attr("fc_activation"); - if (platform::jit::MayIUse(platform::jit::avx)) { - math::VecActivations act_functor; + if (platform::MayIUse(platform::avx)) { + math::VecActivations act_functor; fc_act = act_functor(fc_act_str); } else { - math::VecActivations act_functor; + math::VecActivations act_functor; fc_act = act_functor(fc_act_str); } diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 7d81aee596..e1e4d168db 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -77,7 +77,7 @@ inline void vec_scal(const int n, const double a, double* x) { #endif // MKL scal only support inplace, choose this if src and dst are not equal -template +template inline void vec_scal(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = a * x[i]; @@ -85,12 +85,12 @@ inline void vec_scal(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); return; } const int rest = n % block; @@ -114,24 +114,24 @@ inline void vec_scal(const int n, const float a, y[i] = a * x[i]; } #else - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); #endif } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { - vec_scal(n, a, x, y); +inline void vec_scal(const int n, const float a, + const float* x, float* y) { + vec_scal(n, a, x, y); } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); } -template +template inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = a - x[i]; @@ -139,12 +139,12 @@ inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_bias_sub(const int n, const float a, - const float* x, float* y) { +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); return; } const int rest = n % block; @@ -168,27 +168,25 @@ inline void vec_bias_sub(const int n, const float a, y[i] = a - x[i]; } #else - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); #endif } template <> -inline void vec_bias_sub(const int n, const float a, - const float* x, float* y) { - vec_bias_sub(n, a, x, y); +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { + vec_bias_sub(n, a, x, y); } template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); } // out = x*y + (1-x)*z -template +template inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { for (int i = 0; i < n; ++i) { out[i] = x[i] * y[i] + (static_cast(1) - x[i]) * z[i]; @@ -196,13 +194,13 @@ inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { } template <> -inline void vec_cross(const int n, const float* x, - const float* y, const float* z, - float* out) { +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); return; } const int rest = n % block; @@ -228,25 +226,26 @@ inline void vec_cross(const int n, const float* x, out[i] = x[i] * y[i] + (1.f - x[i]) * z[i]; } #else - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); #endif } template <> -inline void vec_cross(const int n, const float* x, - const float* y, - const float* z, float* out) { - vec_cross(n, x, y, z, out); +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { + vec_cross(n, x, y, z, out); } template <> -inline void vec_cross( - const int n, const float* x, const float* y, const float* z, float* out) { +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { // TODO(TJ): enable me - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); } -template +template inline void vec_add_bias(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] + a; @@ -254,12 +253,12 @@ inline void vec_add_bias(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_add_bias(const int n, const float a, - const float* x, float* y) { +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); return; } const int rest = n % block; @@ -283,32 +282,30 @@ inline void vec_add_bias(const int n, const float a, y[i] = x[i] + a; } #else - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); #endif } template <> -inline void vec_add_bias(const int n, const float a, - const float* x, float* y) { - vec_add_bias(n, a, x, y); +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { + vec_add_bias(n, a, x, y); } template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); } -template +template inline void vec_identity(const int n, const T* x, T* y) { // do nothing return; } -template +template inline void vec_sigmoid(const int n, const T* x, T* y) { const T min = SIGMOID_THRESHOLD_MIN; const T max = SIGMOID_THRESHOLD_MAX; @@ -323,12 +320,12 @@ inline void vec_sigmoid(const int n, const T* x, T* y) { } template <> -inline void vec_sigmoid(const int n, const float* x, - float* y) { +inline void vec_sigmoid(const int n, const float* x, + float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); return; } const int rest = n % block; @@ -377,25 +374,24 @@ inline void vec_sigmoid(const int n, const float* x, y[i] = 1.f / (1.f + y[i]); } #else - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); #endif } template <> -inline void vec_sigmoid(const int n, const float* x, - float* y) { - vec_sigmoid(n, x, y); +inline void vec_sigmoid(const int n, const float* x, + float* y) { + vec_sigmoid(n, x, y); } template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { +inline void vec_sigmoid(const int n, const float* x, + float* y) { // TODO(TJ): enable me - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); } -template +template inline void vec_tanh(const int n, const T* x, T* y) { vec_scal(n, static_cast(2), x, y); vec_sigmoid(n, y, y); @@ -404,7 +400,7 @@ inline void vec_tanh(const int n, const T* x, T* y) { } // TODO(TJ): make relu clip -template +template inline void vec_relu(const int n, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] > 0 ? x[i] : 0; @@ -412,12 +408,12 @@ inline void vec_relu(const int n, const T* x, T* y) { } template <> -inline void vec_relu(const int n, const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block * 4) { - vec_relu(n, x, y); + vec_relu(n, x, y); return; } @@ -441,26 +437,26 @@ inline void vec_relu(const int n, const float* x, #undef MOVE_ONE_STEP #else - vec_relu(n, x, y); + vec_relu(n, x, y); #endif } template <> -inline void vec_relu(const int n, const float* x, - float* y) { - vec_relu(n, x, y); +inline void vec_relu(const int n, const float* x, + float* y) { + vec_relu(n, x, y); } template <> -inline void vec_relu(const int n, const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { // TODO(TJ): enable me - vec_relu(n, x, y); + vec_relu(n, x, y); } // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary -template +template class VecActivations { public: std::function operator()( diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index c37fa291a2..28eb9cadc9 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -104,38 +104,42 @@ void TestAndBench(const int n, std::function tgt, } TEST(CpuVecTest, sigmoid) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, + TestAndBench(sz, vec_sigmoid, + ref_sigmoid); + TestAndBench(sz, vec_sigmoid, + ref_sigmoid); + TestAndBench(sz, vec_sigmoid, ref_sigmoid); } TestAndBench(30, vec_sigmoid, ref_sigmoid); } TEST(CpuVecTest, tanh) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, + ref_tanh); } TestAndBench(30, vec_tanh, ref_tanh); } TEST(CpuVecTest, relu) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, + ref_relu); } TestAndBench(30, vec_relu, ref_relu); } @@ -162,38 +166,40 @@ void TestInplace(const int n, std::function tgt, } TEST(CpuVecTest, inplace_sigmoid) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, + TestInplace(sz, vec_sigmoid, + ref_sigmoid); + TestInplace(sz, vec_sigmoid, + ref_sigmoid); + TestInplace(sz, vec_sigmoid, ref_sigmoid); } TestInplace(30, vec_sigmoid, ref_sigmoid); } TEST(CpuVecTest, inplace_tanh) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); } TestInplace(30, vec_tanh, ref_tanh); } TEST(CpuVecTest, inplace_relu) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); } TestInplace(30, vec_relu, ref_relu); } diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 52cbdf685d..78d0c3e880 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -22,7 +22,7 @@ namespace math { namespace jitkernel { namespace gen { -using namespace platform::jit; // NOLINT +using namespace platform; // NOLINT bool VXXJitCode::init(int d, int scalar_index) { // It's not necessary to use avx512 since it would slow down the frequency diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index a921462129..e2b4761435 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -179,7 +179,7 @@ class VActJitCode : public JitCode { template void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) { - using namespace platform::jit; // NOLINT + using namespace platform; // NOLINT // check all idx can not equal JMM jmm_src = JMM(src_idx); JMM jmm_fx = JMM(fx_idx); diff --git a/paddle/fluid/operators/math/jit_gen.cc b/paddle/fluid/operators/math/jit_gen.cc index 6af39518ed..5c6672928e 100644 --- a/paddle/fluid/operators/math/jit_gen.cc +++ b/paddle/fluid/operators/math/jit_gen.cc @@ -36,7 +36,7 @@ void JitCode::preCode() { for (int i = 0; i < num_g_abi_regs; ++i) { push(Xbyak::Reg64(g_abi_regs[i])); } - if (platform::jit::MayIUse(platform::jit::avx512f)) { + if (platform::MayIUse(platform::avx512f)) { mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); } } diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc index 68b708b345..118696ba47 100644 --- a/paddle/fluid/operators/math/jit_kernel.cc +++ b/paddle/fluid/operators/math/jit_kernel.cc @@ -21,8 +21,6 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - KernelPool& KernelPool::Instance() { static thread_local KernelPool g_jit_kernels; return g_jit_kernels; diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index a0f93fd8e7..8cf588efba 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -30,7 +30,6 @@ namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; #ifdef PADDLE_WITH_MKLML template @@ -125,7 +124,7 @@ bool VMulKernelImpl::useJIT(int d) { #ifdef PADDLE_WITH_MKLML template <> bool VMulKernelImpl::useMKL(int d) { - return jit::MayIUse(jit::avx512f) && d > 512; + return platform::MayIUse(platform::avx512f) && d > 512; } template <> diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc index 4d26b81948..eeb305a88b 100644 --- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc +++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc @@ -25,10 +25,8 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - /* CRF Decode JitKernel */ -template +template class CRFDecodeKernelImpl : public CRFDecodeKernel { public: explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel() { @@ -101,7 +99,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { #define INTRIAVX_FLOAT(block) \ template <> \ - CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ + CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ int tag_num) \ : CRFDecodeKernel() { \ this->num_ = tag_num; \ @@ -109,7 +107,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \ } \ template <> \ - void CRFDecodeKernelImpl::Compute( \ + void CRFDecodeKernelImpl::Compute( \ const int seq_len, const float* x, const float* w, float* alpha, \ int* track) const { \ INIT_ALPHA(YMM_FLOAT_BLOCK) \ @@ -204,7 +202,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { #define INTRIAVX512_FLOAT(block) \ template <> \ - CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ + CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ int tag_num) \ : CRFDecodeKernel() { \ this->num_ = tag_num; \ @@ -212,7 +210,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { this->rest_ = this->num_ % ZMM_FLOAT_BLOCK; \ } \ template <> \ - void CRFDecodeKernelImpl::Compute( \ + void CRFDecodeKernelImpl::Compute( \ const int seq_len, const float* x, const float* w, float* alpha, \ int* track) const { \ INIT_ALPHA(ZMM_FLOAT_BLOCK) \ @@ -270,14 +268,14 @@ INTRIAVX_FLOAT(kEQ16); INTRIAVX_FLOAT(kGT16); #endif #ifdef __AVX2__ -INTRIAVX2_FLOAT(jit::avx2, kEQ8); -INTRIAVX2_FLOAT(jit::avx2, kGT8LT16); -INTRIAVX2_FLOAT(jit::avx2, kEQ16); -INTRIAVX2_FLOAT(jit::avx2, kGT16); +INTRIAVX2_FLOAT(platform::avx2, kEQ8); +INTRIAVX2_FLOAT(platform::avx2, kGT8LT16); +INTRIAVX2_FLOAT(platform::avx2, kEQ16); +INTRIAVX2_FLOAT(platform::avx2, kGT16); #endif #ifdef __AVX512F__ -INTRIAVX2_FLOAT(jit::avx512f, kEQ8); -INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16); +INTRIAVX2_FLOAT(platform::avx512f, kEQ8); +INTRIAVX2_FLOAT(platform::avx512f, kGT8LT16); INTRIAVX512_FLOAT(kEQ16); INTRIAVX512_FLOAT(kGT16); #endif diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index 686f3dd983..7945cfb253 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -29,7 +29,6 @@ namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; #ifdef PADDLE_WITH_MKLML // try to use MKL to speedup diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc index 49904e6e8c..fead13ebad 100644 --- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc +++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc @@ -22,10 +22,8 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - /* Layer Norm JitKernel */ -template +template class LayerNormKernelImpl : public LayerNormKernel { public: explicit LayerNormKernelImpl(int right) : LayerNormKernel() { @@ -90,7 +88,7 @@ class LayerNormKernelImpl : public LayerNormKernel { this->end_ = this->num_ - this->rest_; \ } \ template <> \ - void LayerNormKernelImpl::Compute( \ + void LayerNormKernelImpl::Compute( \ float* x, float* out, float* mean, float* var, const float* scale, \ const float* bias, int height, const float epsilon) const { \ __m256 sum; \ @@ -219,16 +217,16 @@ class LayerNormKernelImpl : public LayerNormKernel { } #ifdef __AVX__ -INTRIAVX_FLOAT(jit::avx, kEQ8); -INTRIAVX_FLOAT(jit::avx, kGT8LT16); -INTRIAVX_FLOAT(jit::avx, kEQ16); -INTRIAVX_FLOAT(jit::avx, kGT16); +INTRIAVX_FLOAT(platform::avx, kEQ8); +INTRIAVX_FLOAT(platform::avx, kGT8LT16); +INTRIAVX_FLOAT(platform::avx, kEQ16); +INTRIAVX_FLOAT(platform::avx, kGT16); #endif #ifdef __AVX2__ -INTRIAVX_FLOAT(jit::avx2, kEQ8); -INTRIAVX_FLOAT(jit::avx2, kGT8LT16); -INTRIAVX_FLOAT(jit::avx2, kEQ16); -INTRIAVX_FLOAT(jit::avx2, kGT16); +INTRIAVX_FLOAT(platform::avx2, kEQ8); +INTRIAVX_FLOAT(platform::avx2, kGT8LT16); +INTRIAVX_FLOAT(platform::avx2, kEQ16); +INTRIAVX_FLOAT(platform::avx2, kGT16); #endif #undef INTRIAVX_FLOAT diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h index 5a3efd979f..4dba3b5681 100644 --- a/paddle/fluid/operators/math/jit_kernel_macro.h +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -92,7 +92,6 @@ namespace jitkernel { JITKERNEL_DECLARE, JITKERNEL_FIND_KEY, \ JITKERNEL_IMPL) -namespace jit = platform::jit; // TODO(TJ): below defines are deprecated, would be remove recently #define SEARCH_BLOCK(macro_, ker, dtype, isa) \ if (d < YMM_FLOAT_BLOCK) { \ @@ -107,15 +106,15 @@ namespace jit = platform::jit; macro_(ker, dtype, isa, kGT16); \ } -#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ - if (jit::MayIUse(jit::avx512f)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \ - } else if (jit::MayIUse(jit::avx2)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx2); \ - } else if (jit::MayIUse(jit::avx)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx); \ - } else { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \ +#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ + if (platform::MayIUse(platform::avx512f)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx512f); \ + } else if (platform::MayIUse(platform::avx2)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx2); \ + } else if (platform::MayIUse(platform::avx)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx); \ + } else { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::isa_any); \ } #define JITKERNEL_KEY(ker_key, dtype_key) \ @@ -156,10 +155,10 @@ namespace jit = platform::jit; marco_declare, macro_key, macro_impl) #define FOR_EACH_ISA(macro_, block) \ - macro_(jit::avx512f, block); \ - macro_(jit::avx2, block); \ - macro_(jit::avx, block); \ - macro_(jit::isa_any, block) + macro_(platform::avx512f, block); \ + macro_(platform::avx2, block); \ + macro_(platform::avx, block); \ + macro_(platform::isa_any, block) #define FOR_EACH_BLOCK(macro_, isa) \ macro_(isa, kLT8); \ @@ -168,11 +167,11 @@ namespace jit = platform::jit; macro_(isa, kEQ16); \ macro_(isa, kGT16) -#define FOR_EACH_ISA_BLOCK(macro_) \ - FOR_EACH_BLOCK(macro_, jit::avx512f); \ - FOR_EACH_BLOCK(macro_, jit::avx2); \ - FOR_EACH_BLOCK(macro_, jit::avx); \ - FOR_EACH_BLOCK(macro_, jit::isa_any) +#define FOR_EACH_ISA_BLOCK(macro_) \ + FOR_EACH_BLOCK(macro_, platform::avx512f); \ + FOR_EACH_BLOCK(macro_, platform::avx2); \ + FOR_EACH_BLOCK(macro_, platform::avx); \ + FOR_EACH_BLOCK(macro_, platform::isa_any) } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index ed86a47e15..19f7bd8909 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -705,7 +705,7 @@ TEST(JitKernel, pool) { jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false); // empty call it to avoid unknown flag 'use_pinned_memory' on Mac - paddle::platform::jit::MayIUse(paddle::platform::jit::avx); + paddle::platform::MayIUse(paddle::platform::avx); const auto& plstm1 = jit::KernelPool::Instance() .template Get, const jit::lstm_attr_t&>(attr); diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index d466f28d1e..f9a32bfa4c 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -123,7 +123,6 @@ size_t CUDAPinnedMaxChunkSize() { return CUDAPinnedMaxAllocSize() / 256; } -namespace jit { #ifdef PADDLE_WITH_XBYAK static Xbyak::util::Cpu cpu; bool MayIUse(const cpu_isa_t cpu_isa) { @@ -165,6 +164,5 @@ bool MayIUse(const cpu_isa_t cpu_isa) { } #endif -} // namespace jit } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index fd31ef77b4..55dba545ff 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -39,7 +39,6 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); -namespace jit { typedef enum { isa_any, sse42, @@ -55,7 +54,5 @@ typedef enum { // May I use some instruction bool MayIUse(const cpu_isa_t cpu_isa); -} // namespace jit - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 51b46450e4..0d10d82d74 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__) - if (platform::jit::MayIUse(platform::jit::avx)) { + if (platform::MayIUse(platform::avx)) { #ifndef __AVX__ LOG(WARNING) << "AVX is available, Please re-compile on local machine"; #endif @@ -131,10 +131,10 @@ void InitDevices(bool init_p2p, const std::vector devices) { " version or compile from source code." #ifdef __AVX512F__ - if (!platform::jit::MayIUse(platform::jit::avx512f)) { - if (platform::jit::MayIUse(platform::jit::avx2)) { + if (!platform::MayIUse(platform::avx512f)) { + if (platform::MayIUse(platform::avx2)) { AVX_GUIDE(AVX512, AVX2); - } else if (platform::jit::MayIUse(platform::jit::avx)) { + } else if (platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX512, AVX); } else { AVX_GUIDE(AVX512, NonAVX); @@ -143,8 +143,8 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #ifdef __AVX2__ - if (!platform::jit::MayIUse(platform::jit::avx2)) { - if (platform::jit::MayIUse(platform::jit::avx)) { + if (!platform::MayIUse(platform::avx2)) { + if (platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX2, AVX); } else { AVX_GUIDE(AVX2, NonAVX); @@ -153,7 +153,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #ifdef __AVX__ - if (!platform::jit::MayIUse(platform::jit::avx)) { + if (!platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX, NonAVX); } #endif From a1eb21e704b570eaf5f5bf571f85ffcc2bc613b0 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 5 Dec 2018 12:24:19 +0000 Subject: [PATCH 136/719] refine names --- paddle/fluid/operators/jitkernels/CMakeLists.txt | 6 ++++-- paddle/fluid/operators/jitkernels/jitcode/jitcode.h | 2 +- .../operators/jitkernels/{kernels.cc => kernel_pool.cc} | 2 +- .../fluid/operators/jitkernels/{kernels.h => kernel_pool.h} | 0 paddle/fluid/operators/jitkernels/more/mkl/mkl.h | 2 +- paddle/fluid/operators/jitkernels/registry.h | 2 +- paddle/fluid/operators/jitkernels/test.cc | 2 +- 7 files changed, 9 insertions(+), 7 deletions(-) rename paddle/fluid/operators/jitkernels/{kernels.cc => kernel_pool.cc} (94%) rename paddle/fluid/operators/jitkernels/{kernels.h => kernel_pool.h} (100%) diff --git a/paddle/fluid/operators/jitkernels/CMakeLists.txt b/paddle/fluid/operators/jitkernels/CMakeLists.txt index e82e6c3026..f6bb3e0712 100644 --- a/paddle/fluid/operators/jitkernels/CMakeLists.txt +++ b/paddle/fluid/operators/jitkernels/CMakeLists.txt @@ -7,7 +7,9 @@ set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) -cc_library(jit_kernel_base SRCS kernels.cc jitcode_base.cc DEPS ${JIT_KERNEL_DEPS}) +file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") +list(REMOVE_ITEM jit_kernel_cc_srcs jit_test.cc) +cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) add_subdirectory(refer) add_subdirectory(more) @@ -15,5 +17,5 @@ if(WITH_XBYAK) add_subdirectory(jitcode) endif() -cc_library(jit_kernel SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS}) +cc_library(jit_kernel SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h index a3582e5284..03c2100ca0 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h +++ b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h @@ -92,7 +92,7 @@ class JitCode : public JitBase, public Xbyak::CodeGenerator { for (int i = 0; i < num_g_abi_regs; ++i) { push(Xbyak::Reg64(g_abi_regs[i])); } - if (platform::jit::MayIUse(platform::jit::avx512f)) { + if (platform::MayIUse(platform::avx512f)) { mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); } } diff --git a/paddle/fluid/operators/jitkernels/kernels.cc b/paddle/fluid/operators/jitkernels/kernel_pool.cc similarity index 94% rename from paddle/fluid/operators/jitkernels/kernels.cc rename to paddle/fluid/operators/jitkernels/kernel_pool.cc index 35095220e3..9bb0ba349b 100644 --- a/paddle/fluid/operators/jitkernels/kernels.cc +++ b/paddle/fluid/operators/jitkernels/kernel_pool.cc @@ -12,7 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/jitkernels/kernels.h" +#include "paddle/fluid/operators/jitkernels/kernel_pool.h" #include // for shared_ptr #include #include diff --git a/paddle/fluid/operators/jitkernels/kernels.h b/paddle/fluid/operators/jitkernels/kernel_pool.h similarity index 100% rename from paddle/fluid/operators/jitkernels/kernels.h rename to paddle/fluid/operators/jitkernels/kernel_pool.h diff --git a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h b/paddle/fluid/operators/jitkernels/more/mkl/mkl.h index 7cb4334e50..75ed34ef48 100644 --- a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h +++ b/paddle/fluid/operators/jitkernels/more/mkl/mkl.h @@ -41,7 +41,7 @@ class VMulKernel VMulKernel() { this->func = VMul; } bool UseMe(int d) const override { if (std::is_same::value) { - return platform::jit::MayIUse(platform::jit::avx512f) && d > 512; + return platform::MayIUse(platform::avx512f) && d > 512; } else { return true; } diff --git a/paddle/fluid/operators/jitkernels/registry.h b/paddle/fluid/operators/jitkernels/registry.h index 62a0de3641..cd414bb096 100644 --- a/paddle/fluid/operators/jitkernels/registry.h +++ b/paddle/fluid/operators/jitkernels/registry.h @@ -18,7 +18,7 @@ #include #include #include "paddle/fluid/operators/jitkernels/kernel_base.h" -#include "paddle/fluid/operators/jitkernels/kernels.h" +#include "paddle/fluid/operators/jitkernels/kernel_pool.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/variant.h" // for UNUSED diff --git a/paddle/fluid/operators/jitkernels/test.cc b/paddle/fluid/operators/jitkernels/test.cc index d11c7afe9a..eb0d30eecd 100644 --- a/paddle/fluid/operators/jitkernels/test.cc +++ b/paddle/fluid/operators/jitkernels/test.cc @@ -19,7 +19,7 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/operators/jitkernels/kernels.h" +#include "paddle/fluid/operators/jitkernels/kernel_pool.h" // TODO(TJ): remove me #include "paddle/fluid/operators/jitkernels/registry.h" From e05e1d7d88edebf14a093aa5a31924ef0b650480 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Wed, 5 Dec 2018 12:45:34 +0000 Subject: [PATCH 137/719] fix bug in dist train on hs, test=develop --- paddle/fluid/operators/hierarchical_sigmoid_op.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index b326b58319..0dbcc442df 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -150,13 +150,12 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { "Output(W@Grad should not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Output(X@Grad should not be null."); - if (!ctx->Attrs().Get("is_sparse")) { - if (ctx->HasOutput(framework::GradVarName("Bias"))) { - ctx->SetOutputDim(framework::GradVarName("Bias"), - ctx->GetInputDim("Bias")); - } - ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); + + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); } + ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); } From 0492158da526ddb8b789e8d8dcf79679c831179d Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 5 Dec 2018 21:51:51 +0800 Subject: [PATCH 138/719] polish test=develop --- python/paddle/fluid/framework.py | 7 +++---- python/paddle/fluid/imperative/layers.py | 9 ++------- python/paddle/fluid/layer_helper.py | 17 +---------------- .../fluid/tests/unittests/test_imperative.py | 6 +++--- 4 files changed, 9 insertions(+), 30 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index e31b09fc7c..caf31ccf5c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -293,7 +293,6 @@ class Variable(core.VarBase): if is_new_var: self.desc.set_type(type) elif self.desc.type() != type: - # sys.stderr.write('%s vs %s\n' % (self.desc.type(), type)) raise ValueError("Variable {0} has been created before. The " "previous type is {1}; the new type is {2}. They" " are not matched".format(self.name, @@ -358,16 +357,16 @@ class Variable(core.VarBase): self.stop_gradient = stop_gradient self.is_data = is_data - def numpy(self): + def _numpy(self): scope = _imperative_tracer().get_scope(self.block.desc) tensor = core.get_variable_tensor(scope, self.desc.name()) return np.array(tensor) - def backward(self): + def _backward(self): scope = _imperative_tracer().get_scope(self.block.desc) self._run_backward(scope) - def grad(self): + def _gradient(self): return np.array(self._grad()) def __str__(self): diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index cb54a36a5e..1a28f7f4ae 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -35,13 +35,8 @@ class PyLayer(core.Layer): var_inputs = [] for x in inputs: - if isinstance(x, np.ndarray): - py_var = base.to_variable(x) - var_inputs.append(py_var) - elif isinstance(x, framework.Variable): - var_inputs.append(x) - else: - raise ValueError("not var or ndarray %s" % type(x)) + py_var = base.to_variable(x) + var_inputs.append(py_var) outputs = self.forward(var_inputs) return outputs diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 25fc843bf5..74b4a977db 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -49,23 +49,8 @@ class LayerHelper(object): def startup_program(self): return default_startup_program() - def _np_to_variable(self, x): - tensor = core.LoDTensor() - tensor.set(x, core.CPUPlace()) - return Variable( - self.main_program.current_block(), - type=core.VarDesc.VarType.LOD_TENSOR, - name=None, - shape=x.shape, - dtype=x.dtype) - def to_variable(self, x): - if isinstance(x, Variable): - return x - elif isinstance(x, np.ndarray): - return base.to_variable(x, self.main_program.current_block()) - else: - raise ValueError("inputs wrong type %s\n" % x) + return base.to_variable(x, self.main_program.current_block()) def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 5413bdc24e..b5b6305155 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -43,9 +43,9 @@ class TestImperative(unittest.TestCase): l = MyLayer() x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] self.assertIsNotNone(x) - sys.stderr.write("%s output: %s\n" % (x, x.numpy())) - x.backward() - sys.stderr.write("grad %s\n" % l._x_for_debug.grad()) + sys.stderr.write("%s output: %s\n" % (x, x._numpy())) + x._backward() + sys.stderr.write("grad %s\n" % l._x_for_debug._gradient()) if __name__ == '__main__': From 722b0a805f99015a65b19721b5ad0ca9420a3ba6 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 5 Dec 2018 15:44:41 +0000 Subject: [PATCH 139/719] fix bug of trt pool test=develop --- paddle/fluid/inference/tensorrt/convert/pool2d_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 343fd3f7c5..768318fb06 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -109,8 +109,8 @@ class Pool2dOpConverter : public OpConverter { } if (pool_type == "max") { - nvinfer1::DimsHW pre_pad(paddings[0], paddings[1]); - nvinfer1::DimsHW post_pad(paddings[0], paddings[1]); + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); if (ceil_mode) { // If ceil mode is true, we will pad the appropriate size to the input. DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, From 5026741b823509f007f47cd53a305e800c9ad3aa Mon Sep 17 00:00:00 2001 From: lujun Date: Thu, 6 Dec 2018 02:29:40 +0800 Subject: [PATCH 140/719] fix the bug for mac build. python -c error. test=develop --- paddle/scripts/paddle_build.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 912ba8f005..6299b166af 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -442,8 +442,6 @@ EOF make install -j 8 if [ "$1" == "cp27-cp27m" ]; then pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl - set -e - python -c "import paddle.fluid" elif [ "$1" == "cp35-cp35m" ]; then pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl elif [ "$1" == "cp36-cp36m" ]; then From 0f96c2e80f5dabd151322abcb9d2d5a4d1cf6665 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 5 Dec 2018 17:07:12 +0800 Subject: [PATCH 141/719] fix thread-safety bug test=develop --- paddle/fluid/platform/device_context.cc | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 5444fe3175..146a205832 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -120,14 +120,25 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { } void* allocate(size_t num_bytes) const override { + if (UNLIKELY(num_bytes == 0)) { + return nullptr; + } auto buf = paddle::memory::Alloc(place_, num_bytes, memory::Allocator::kScratchpad); void* retv = buf->ptr(); - allocations_[buf->ptr()] = std::move(buf); + { + std::lock_guard lock(mtx_); + allocations_.emplace(retv, std::move(buf)); + } return retv; } - void deallocate(void* buffer) const override { allocations_.erase(buffer); } + void deallocate(void* buffer) const override { + if (LIKELY(buffer)) { + std::lock_guard lock(mtx_); + allocations_.erase(buffer); + } + } void* scratchpad() const override { if (scratch_ == NULL) { @@ -153,6 +164,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { const cudaDeviceProp* device_prop_; // not owned; mutable void* scratch_; mutable unsigned int* semaphore_; + mutable std::mutex mtx_; // to protect allocations_ mutable std::unordered_map allocations_; }; From 549f165b593e6de3335e5f65d9b94c1d4ca410f8 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Thu, 6 Dec 2018 11:36:52 +0800 Subject: [PATCH 142/719] Speed conv_fusion_op for identity activation. (#14744) * Refine conv_fusion_op for identity activation. * Fix unit testing. test=develop --- paddle/fluid/operators/conv_fusion_op.cu.cc | 52 +++++++++++++------ .../tests/unittests/test_conv2d_fusion_op.py | 6 +++ 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 2c09ee7394..3235ad52b9 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto x_dims = framework::vectorize(input->dims()); auto f_dims = framework::vectorize(filter->dims()); - if (activation == "identity") { - // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is - // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. - algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - } else if (!exhaustive_search) { + if (!exhaustive_search) { CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, @@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - // ------------------- cudnn conv+bias+act forward -------------------- - ScalingParamType alpha1 = 1.0f; - ScalingParamType alpha2 = residual ? 1.0f : 0.0f; - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace, - workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, - cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + if ((activation == "identity") && + (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) && + (!residual)) { + // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is + // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. + // But test in some case, the speed is slower, change to use + // cudnnConvolutionForward and cudnnAddTensor + // ------------- cudnn conv forward and bias add --------------------- + ScalingParamType alpha = 1.0f, beta = 0.0f; + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + CUDNN_ENFORCE(platform::dynload::cudnnAddTensor( + handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, output_data)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + } else { + if (activation == "identity") { + algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + } + // ------------------- cudnn conv+bias+act forward -------------------- + ScalingParamType alpha1 = 1.0f; + ScalingParamType alpha2 = residual ? 1.0f : 0.0f; + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, + cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + } } }; #endif diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py index 9f3f2f3481..6cd71e39e4 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py @@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp): self.activation = 'identity' +class TestIdentityActivation(TestConv2dFusionOp): + def init_activation(self): + self.activation = 'identity' + self.add_residual_data = False + + class TestWithGroup(TestConv2dFusionOp): def init_group(self): self.groups = 3 From 570d89ec84296dd46725be4f854808e0f1fb5f1c Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Thu, 6 Dec 2018 16:52:59 +0800 Subject: [PATCH 143/719] add bpr_loss operator , test=develop --- paddle/fluid/operators/bpr_loss_op.cc | 149 ++++++++++++++++++ paddle/fluid/operators/bpr_loss_op.h | 142 +++++++++++++++++ python/paddle/fluid/layers/nn.py | 13 ++ .../fluid/tests/unittests/test_bpr_loss_op.py | 53 +++++++ 4 files changed, 357 insertions(+) create mode 100644 paddle/fluid/operators/bpr_loss_op.cc create mode 100644 paddle/fluid/operators/bpr_loss_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_bpr_loss_op.py diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc new file mode 100644 index 0000000000..3e6445dbc2 --- /dev/null +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/bpr_loss_op.h" + +namespace paddle { +namespace operators { + +class BprLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label_Pos"), + "Input(Label_Pos) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_Pos_dims = ctx->GetInputDim("Label_Pos"); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ( + rank, label_Pos_dims.size(), + "Input(X) and Input(Label_Pos) shall have the same rank."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_Pos_dims, 0, rank - 1), + "Input(X) and Input(Label_Pos) shall have the same shape " + "except the last dimension."); + + auto y_dims = x_dims; + y_dims[rank - 1] = 1; + ctx->SetOutputDim("Y", y_dims); + ctx->ShareLoD("X", /*->*/ "Y"); + } + + protected: + // Explicitly set that the data type of computation kernel of Seq-bpr + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } +}; + +class BprLossGradientOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label_Pos"), + "Input(Label_Pos) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) shoudl be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_pos_dims = ctx->GetInputDim("Label_Pos"); + auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(dy_dims.size(), rank, + "Input(Y@Grad) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ( + label_pos_dims.size(), rank, + "Input(Label_Pos) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_pos_dims, 0, rank - 1), + "The Input(X) and Input(Label_Pos) should have the same " + "shape except the last dimension."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(dy_dims, 0, rank - 1), + "The Input(X) and Input(Y@Grad) should have the same " + "shape except the last dimension."); + PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, + "The last dimension of Input(Y@Grad) should be 1."); + PADDLE_ENFORCE_EQ(label_pos_dims[rank - 1], 1, + " the last dimension of Input(Label_Pos) should be 1."); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + protected: + // Explicitly set that the data type of computation kernel of cross_entropy + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } +}; + +class BprLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), a tensor whose last dimension " + "size is equal to the number of classes. This input is a " + "real number."); + AddInput( + "Label_Pos", + "(Tensor), the tensor which represents the ground truth. It has the " + "same shape with 'X' except the last dimension. the last dimension " + "size is 1."); + AddOutput("Y", + "(Tensor, default Tensor), a tensor whose shape is same " + "with 'X' except that the last dimension size is 1. It " + "represents the sequence bpr loss."); + AddComment(R"DOC( +BprLoss Operator. + +This operator belongs to pairwise ranking loss. Label_pos is the desired item. +The loss at a given point in one seesion is defined as: +$Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$ + +Learn more details by reading paper . + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPUCtx = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp); +REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel, + ops::BprLossOpKernel); +REGISTER_OP_CPU_KERNEL(bpr_loss_grad, + ops::BprLossGradientOpKernel, + ops::BprLossGradientOpKernel); diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h new file mode 100644 index 0000000000..4103686de7 --- /dev/null +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct TolerableValue { + HOSTDEVICE T operator()(const T& x) const { + PADDLE_ASSERT(std::is_floating_point::value); + const T kApproInf = 1e20; + if (x == INFINITY) return kApproInf; + if (x == -INFINITY) return -kApproInf; + return x; + } +}; + +template +class BprLossOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* labels_Pos = ctx.Input("Label_Pos"); + auto* y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + int rank = x->dims().size(); + + Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); + Tensor labels_Pos_2d = framework::ReshapeToMatrix(*labels_Pos, rank - 1); + Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1); + + const framework::Tensor* prob = &x_2d; + const framework::Tensor* labels_pos = &labels_Pos_2d; + framework::Tensor* out = &y_2d; + + const int step_size = prob->dims()[0]; + const int class_num = prob->dims()[1]; + const T* prob_data = prob->data(); + T* loss_data = out->data(); + + const int64_t* label_pos_data = labels_pos->data(); + for (int i = 0; i < step_size; ++i) { + int lbl_pos = label_pos_data[i]; + PADDLE_ENFORCE_GE(lbl_pos, 0); + PADDLE_ENFORCE_LT(lbl_pos, class_num); + int index_pos = i * class_num + lbl_pos; + T sum = static_cast(0); + for (int j = 0; j < class_num; j++) { + if (j == lbl_pos) continue; + int index_neg = i * class_num + j; + sum += TolerableValue()(-std::log( + 1.0f + TolerableValue()( + std::exp(prob_data[index_neg] - prob_data[index_pos])))); + } + loss_data[i] = -sum / (class_num - 1); + } + } +}; + +template +class XeGradFunctor { + public: + XeGradFunctor(T* dx, + const T* dy, // NOLINT + const T* x, // NOLINT + const int64_t* label_pos, // NOLINT + size_t num_classes) + : dx_(dx), + dy_(dy), + x_(x), + label_pos_(label_pos), + num_classes_(num_classes) {} + + HOSTDEVICE void operator()(size_t sample_id) { + for (size_t x_offset = sample_id * num_classes_; + x_offset < (sample_id + 1) * num_classes_; ++x_offset) { + dx_[x_offset] = static_cast(0); + } + auto p_index = sample_id * num_classes_ + label_pos_[sample_id]; + for (size_t ni = 0; ni < num_classes_; ni++) { + if (label_pos_[sample_id] == ni) continue; + auto n_index = sample_id * num_classes_ + ni; + auto grad_ = + -dy_[sample_id] / + ((num_classes_ - 1) * + (1.0f + TolerableValue()(std::exp(x_[p_index] - x_[n_index])))); + dx_[p_index] += grad_; + dx_[n_index] -= grad_; + } + } + + private: + T* dx_; + const T* dy_; + const T* x_; + const int64_t* label_pos_; + size_t num_classes_; +}; + +template +class BprLossGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* label_pos = ctx.Input("Label_Pos"); + auto* dx = ctx.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(ctx.GetPlace()); + + int rank = x->dims().size(); + int64_t class_num = x->dims()[rank - 1]; + XeGradFunctor functor(dx_data, dy->data(), x->data(), + label_pos->data(), + static_cast(class_num)); + platform::ForRange for_range( + ctx.template device_context(), + static_cast(dy->numel())); + for_range(functor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4df74edfce..6d05ca8461 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -41,6 +41,7 @@ __all__ = [ 'crf_decoding', 'cos_sim', 'cross_entropy', + 'bpr_loss', 'square_error_cost', 'chunk_eval', 'sequence_conv', @@ -1175,6 +1176,18 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100): return out +def bpr_loss(input, label_pos): + + helper = LayerHelper('bpr_loss', **locals()) + out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='bpr_loss', + inputs={'X': [input], + 'Label_Pos': [label_pos]}, + outputs={'Y': [out]}) + return out + + def square_error_cost(input, label): """ **Square error cost layer** diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py new file mode 100644 index 0000000000..7e18913a03 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py @@ -0,0 +1,53 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest, randomize_probability + + +class TestBprLossOp1(OpTest): + """Test BprLoss with discrete one-hot labels. + """ + + def setUp(self): + self.op_type = "bpr_loss" + batch_size = 3 + class_num = 5 + X = randomize_probability(batch_size, class_num, dtype='float64') + label_pos = np.random.randint( + 0, class_num, (batch_size, 1), dtype="int64") + bpr_loss_result = [] + for i in range(batch_size): + sum = 0.0 + for j in range(class_num): + if j == label_pos[i][0]: + continue + sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label_pos[i][0]]))) + bpr_loss_result.append(-sum / (class_num - 1)) + bpr_loss = np.asmatrix([[x] for x in bpr_loss_result], dtype="float64") + self.inputs = {"X": X, "Label_Pos": label_pos} + self.outputs = {"Y": bpr_loss} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y", numeric_grad_delta=0.001) + + +if __name__ == "__main__": + unittest.main() From 4cb0100c8ea714e4ce7f8c0cd3c9ebc50aff9e35 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 6 Dec 2018 16:59:53 +0800 Subject: [PATCH 144/719] add prefetch in nce --- paddle/fluid/operators/nce_op.cc | 18 +++++ paddle/fluid/operators/nce_op.h | 67 ++++++++++++++++--- .../fluid/transpiler/distribute_transpiler.py | 2 +- 3 files changed, 78 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 9f97f7821d..06ff825fde 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -155,6 +155,24 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("is_sparse", "(boolean, default false) Sparse update.") .SetDefault(false); + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); + AddAttr>("custom_neg_classes", "This attribute only be used in unitest. Classes " "in this list wiil be used as negative classes " diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index f2ca6ec247..8f82f77f50 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -15,8 +15,10 @@ limitations under the License. */ #pragma once #include +#include #include #include +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -144,15 +146,64 @@ class NCEKernel : public framework::OpKernel { } // forward mul auto input_mat = EigenMatrix::From(*(context.Input("Input"))); - auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - Eigen::Tensor result = - (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * - weight_mat.chip(sample_labels_data[i], 0)) - .sum(); - sample_out_data[i] += result(0); - sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + + // for remote prefetch + auto epmap = context.Attr>("epmap"); + + if (!epmap.empty()) { + // if epmap is not empty, then the parameter will be fetched from remote + // parameter + // server + + std::vector labels; + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + labels.push_back(sample_labels_data[i]); + } + std::set st(labels.begin(), labels.end()); + labels.assign(st.begin(), st.end()); + + auto &local_scope = context.scope().NewScope(); + auto height_sections = context.Attr>("height_sections"); + auto table_names = context.Attr>("table_names"); + + framework::Variable *ids = local_scope.Var("Ids"); + framework::Variable *weight = local_scope.Var("Weight"); + +#ifdef PADDLE_WITH_DISTRIBUTE + operators::distributed::prefetch("Ids", "Weight", table_names, epmap, + height_sections, context); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); + + auto weight_mat = EigenMatrix::From(*(weight->Get())); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + std::vector::iterator it = + std::find(labels.begin(), labels.end(), sample_labels_data[i]); + int idx = std::distance(labels.begin(), it); + + Eigen::Tensor result = + (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(idx, 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } +#endif + } else { + auto weight_mat = + EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + Eigen::Tensor result = + (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(sample_labels_data[i], 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } } + // forward cost for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { out_data[i] = 0; diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 1d867d9194..817af602bd 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -239,7 +239,7 @@ class DistributeTranspiler(object): def _get_all_remote_sparse_update_op(self, main_program): sparse_update_ops = [] - sparse_update_op_types = ["lookup_table"] + sparse_update_op_types = ["lookup_table", "nce"] for op in main_program.global_block().ops: if op.type in sparse_update_op_types and op.attr( 'remote_prefetch') is True and not op.attr( From 627a6b8bacc5f4898c1c3c9018fd8e70ef95d8dc Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 6 Dec 2018 17:14:59 +0800 Subject: [PATCH 145/719] add prefetch in nce --- paddle/fluid/operators/nce_op.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 8f82f77f50..7397d9f473 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -26,6 +26,10 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sampler.h" #include "unsupported/Eigen/CXX11/Tensor" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -166,8 +170,8 @@ class NCEKernel : public framework::OpKernel { auto height_sections = context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - framework::Variable *ids = local_scope.Var("Ids"); - framework::Variable *weight = local_scope.Var("Weight"); + local_scope.Var("Ids"); + local_scope.Var("Weight"); #ifdef PADDLE_WITH_DISTRIBUTE operators::distributed::prefetch("Ids", "Weight", table_names, epmap, From 405b2486db8bbbafad58e4362ef2dc0be0f74c7e Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Tue, 4 Dec 2018 18:59:49 +0800 Subject: [PATCH 146/719] support loading from memory test=develop --- .../fluid/framework/executor_thread_worker.cc | 2 +- paddle/fluid/inference/analysis/argument.h | 1 + .../analysis/passes/ir_graph_build_pass.cc | 7 ++- .../analysis/passes/ir_graph_build_pass.h | 5 +- paddle/fluid/inference/api/analysis_config.cc | 11 ++++ .../fluid/inference/api/analysis_predictor.cc | 38 ++++++------ .../inference/api/paddle_analysis_config.h | 10 ++- paddle/fluid/inference/io.cc | 34 ++++++++--- paddle/fluid/inference/io.h | 8 ++- .../tests/api/analyzer_ner_tester.cc | 24 ++++++-- .../inference/tests/api/config_printer.h | 9 ++- paddle/fluid/operators/impl/CMakeLists.txt | 1 + paddle/fluid/operators/impl/load_combine.cc | 61 +++++++++++++++++++ paddle/fluid/operators/impl/load_combine.h | 33 ++++++++++ paddle/fluid/operators/load_combine_op.cc | 37 +++++++---- 15 files changed, 229 insertions(+), 52 deletions(-) create mode 100644 paddle/fluid/operators/impl/CMakeLists.txt create mode 100644 paddle/fluid/operators/impl/load_combine.cc create mode 100644 paddle/fluid/operators/impl/load_combine.h diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 4e4001e979..e3849931d4 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -97,7 +97,7 @@ void ExecutorThreadWorker::SetDevice() { static unsigned concurrency_cap = std::thread::hardware_concurrency(); int thread_id = this->thread_id_; - if (thread_id < concurrency_cap) { + if ((unsigned)thread_id < concurrency_cap) { unsigned proc = thread_id; cpu_set_t mask; diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 21203e2d9f..d205465abf 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -103,6 +103,7 @@ struct Argument { // Model specified with program and parameters files. DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); + DECL_ARGUMENT_FIELD(is_memory_load, IsMemoryLoad, bool); // The overall graph to work on. DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index 740030c3a8..31138d7342 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -46,7 +46,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { argument->model_params_path_valid()) { auto program = LoadModel(argument->model_program_path(), argument->model_params_path(), - argument->scope_ptr(), place); + argument->scope_ptr(), place, argument->is_memory_load()); argument->SetMainProgram(program.release()); } else { PADDLE_THROW( @@ -68,9 +68,10 @@ std::unique_ptr IrGraphBuildPass::LoadModel( std::unique_ptr IrGraphBuildPass::LoadModel( const std::string &program_path, const std::string ¶ms_path, - framework::Scope *scope, const platform::Place &place) { + framework::Scope *scope, const platform::Place &place, + bool is_memory_load) { framework::Executor exe(place); - return Load(&exe, scope, program_path, params_path); + return Load(&exe, scope, program_path, params_path, is_memory_load); } std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; } diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h index 271e64fce5..ae7de676d6 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h @@ -24,7 +24,7 @@ namespace inference { namespace analysis { /* - * Load program and parameter to memory from the disk. + * Load program and parameter to memory from the disk or directly from memory. */ class IrGraphBuildPass : public AnalysisPass { public: @@ -38,7 +38,8 @@ class IrGraphBuildPass : public AnalysisPass { const platform::Place &place); std::unique_ptr LoadModel( const std::string &program_path, const std::string ¶ms_path, - framework::Scope *scope, const platform::Place &place); + framework::Scope *scope, const platform::Place &place, + bool is_memory_load); std::string model_binary_str_; }; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index dd75f0d9a6..bfa1f1908c 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -53,6 +53,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + is_memory_load_ = other.is_memory_load_; if (use_gpu) { pass_builder_.reset(new GpuPassStrategy( @@ -80,6 +81,8 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + is_memory_load_ = other.is_memory_load_; + pass_builder_ = std::move(other.pass_builder_); } @@ -102,4 +105,12 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); } +void contrib::AnalysisConfig::SetProgBufferAndParamBuffer( + const char *prog_buffer, size_t prog_buffer_size, const char *param_buffer, + size_t param_buffer_size) { + prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size); + param_file = std::string(param_buffer, param_buffer + param_buffer_size); + is_memory_load_ = true; +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 391330a7c0..6091c5a625 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -304,20 +304,20 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, // NOTE All the members in AnalysisConfig should be copied to Argument. void AnalysisPredictor::OptimizeInferenceProgram() { + LOG(INFO) << "optimization program"; status_program_optimized_ = true; argument_.SetUseGPU(config_.use_gpu); argument_.SetGPUDeviceId(config_.device); + argument_.SetIsMemoryLoad(config_.is_memory_load_); // Analyze inference_program if (!config_.model_dir.empty()) { argument_.SetModelDir(config_.model_dir); - } else { - PADDLE_ENFORCE( - !config_.param_file.empty(), - "Either model_dir or (param_file, prog_file) should be set."); - PADDLE_ENFORCE(!config_.prog_file.empty()); + } else if (!config_.param_file.empty() && !config_.prog_file.empty()) { argument_.SetModelProgramPath(config_.prog_file); argument_.SetModelParamsPath(config_.param_file); + } else { + PADDLE_THROW("Either model_dir or (param_file, prog_file) should be set."); } if (config_.use_gpu && config_.use_tensorrt_) { @@ -448,20 +448,23 @@ bool AnalysisPredictor::LoadProgramDesc() { return false; } - std::string pb_content; - // Read binary - std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); - fin.seekg(0, std::ios::end); - - pb_content.resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(pb_content.at(0)), pb_content.size()); - fin.close(); - // Create ProgramDesc framework::proto::ProgramDesc proto; - proto.ParseFromString(pb_content); + if (!config_.is_memory_load()) { + std::string pb_content; + // Read binary + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + fin.seekg(0, std::ios::end); + pb_content.resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&(pb_content.at(0)), pb_content.size()); + fin.close(); + + proto.ParseFromString(pb_content); + } else { + proto.ParseFromString(config_.prog_file); + } inference_program_.reset(new framework::ProgramDesc(proto)); return true; } @@ -469,6 +472,7 @@ bool AnalysisPredictor::LoadProgramDesc() { bool AnalysisPredictor::LoadParameters() { PADDLE_ENFORCE_NOT_NULL(inference_program_.get(), "The inference program should be loaded first."); + const auto &global_block = inference_program_->MutableBlock(0); // create a temporary program to load parameters. diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index a09bd1cac2..4e9d5bc329 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -52,10 +52,15 @@ struct AnalysisConfig : public NativeConfig { bool use_tensorrt() const { return use_tensorrt_; } void EnableMKLDNN(); - // NOTE this is just for internal development, please not use it. - // NOT stable yet. bool use_mkldnn() const { return use_mkldnn_; } + // Specify the memory buffer of program and parameter + void SetProgBufferAndParamBuffer(const char* prog_buffer, + size_t prog_buffer_size, + const char* program_buffer, + size_t program_buffer_size); + bool is_memory_load() const { return is_memory_load_; } + friend class ::paddle::AnalysisPredictor; protected: @@ -64,6 +69,7 @@ struct AnalysisConfig : public NativeConfig { int tensorrt_workspace_size_; int tensorrt_max_batchsize_; std::unique_ptr pass_builder_; + bool is_memory_load_{false}; }; // Configurations for Anakin engine. diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 31f43bfdca..053c516f88 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/version.h" +#include "paddle/fluid/operators/impl/load_combine.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/pybind/pybind.h" @@ -69,7 +70,8 @@ bool IsPersistable(const framework::VarDesc* var) { void LoadPersistables(framework::Executor* executor, framework::Scope* scope, const framework::ProgramDesc& main_program, const std::string& dirname, - const std::string& param_filename) { + const std::string& param_filename, + bool is_memory_load = false) { const framework::BlockDesc& global_block = main_program.Block(0); framework::ProgramDesc* load_program = new framework::ProgramDesc(); @@ -108,6 +110,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, op->SetType("load_combine"); op->SetOutput("Out", paramlist); op->SetAttr("file_path", {param_filename}); + op->SetAttr("is_memory_load", {is_memory_load}); op->CheckAttrs(); } @@ -130,16 +133,23 @@ std::unique_ptr Load(framework::Executor* executor, "model version %ld is not supported.", main_program->Version()); - LoadPersistables(executor, scope, *main_program, dirname, ""); + // is_memory_load is false in seperate parameters. + LoadPersistables(executor, scope, *main_program, dirname, "", + false /* is_memory_load */); return main_program; } -std::unique_ptr Load( - framework::Executor* executor, framework::Scope* scope, - const std::string& prog_filename, const std::string& param_filename) { - std::string model_filename = prog_filename; +std::unique_ptr Load(framework::Executor* executor, + framework::Scope* scope, + const std::string& prog_filename, + const std::string& param_filename, + bool is_memory_load = false) { std::string program_desc_str; - ReadBinaryFile(model_filename, &program_desc_str); + if (!is_memory_load) { + ReadBinaryFile(prog_filename, &program_desc_str); + } else { + program_desc_str = prog_filename; + } std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); @@ -147,10 +157,18 @@ std::unique_ptr Load( "model version %ld is not supported.", main_program->Version()); - LoadPersistables(executor, scope, *main_program, "", param_filename); + LoadPersistables(executor, scope, *main_program, "", param_filename, + is_memory_load); return main_program; } +std::unique_ptr Load( + framework::Executor* executor, framework::Scope* scope, + const std::string& prog_filename, const std::string& param_filename) { + return Load(executor, scope, prog_filename, param_filename, + false /* is_memory_load */); +} + void SaveVars(const framework::Scope& scope, const std::vector& vars, const std::string& dirname, bool predicate) { diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index ab492577c1..20d635575d 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -30,7 +30,7 @@ void Init(const std::vector argv); void LoadPersistables(framework::Executor* executor, framework::Scope* scope, const framework::ProgramDesc& main_program, const std::string& dirname, - const std::string& param_filename); + const std::string& param_filename, bool is_memory_load); std::unique_ptr Load(framework::Executor* executor, framework::Scope* scope, @@ -41,6 +41,12 @@ std::unique_ptr Load(framework::Executor* executor, const std::string& prog_filename, const std::string& param_filename); +std::unique_ptr Load(framework::Executor* executor, + framework::Scope* scope, + const std::string& prog_filename, + const std::string& param_filename, + bool is_memory_load); + // Save the variables from a scope to disk. void SaveVars(const framework::Scope& scope, const std::vector& vars, const std::string& dirname, diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 3a5f844de3..aaa94c8937 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -93,9 +93,17 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(contrib::AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; +void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) { + if (memory_load) { + std::string buffer_prog, buffer_param; + ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog); + ReadBinaryFile(FLAGS_infer_model + "/param", &buffer_param); + cfg->SetProgBufferAndParamBuffer(&buffer_prog[0], buffer_prog.size(), + &buffer_param[0], buffer_param.size()); + } else { + cfg->prog_file = FLAGS_infer_model + "/__model__"; + cfg->param_file = FLAGS_infer_model + "/param"; + } cfg->use_gpu = false; cfg->device = 0; cfg->specify_input_name = true; @@ -114,9 +122,9 @@ void SetInput(std::vector> *inputs) { } // Easy for profiling independently. -TEST(Analyzer_Chinese_ner, profile) { +void profile(bool memory_load = false) { contrib::AnalysisConfig cfg; - SetConfig(&cfg); + SetConfig(&cfg, memory_load); std::vector outputs; std::vector> input_slots_all; @@ -138,6 +146,12 @@ TEST(Analyzer_Chinese_ner, profile) { } } +TEST(Analyzer_Chinese_ner, profile) { profile(); } + +TEST(Analyzer_Chinese_ner, profile_memory_load) { + profile(true /* memory_load */); +} + // Check the fuse status TEST(Analyzer_Chinese_ner, fuse_statis) { contrib::AnalysisConfig cfg; diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index 4231eef722..c07a27674e 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -49,8 +49,6 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { os << GenSpaces(num_spaces) << "device: " << config.device << "\n"; os << GenSpaces(num_spaces) << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n"; - os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; - os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; os << GenSpaces(num_spaces) << "specify_input_name: " << config.specify_input_name << "\n"; os << GenSpaces(num_spaces) @@ -65,6 +63,13 @@ std::ostream &operator<<(std::ostream &os, os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n"; num_spaces++; os << *reinterpret_cast(&config); + if (!config.is_memory_load()) { + os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; + os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; + } else { + os << GenSpaces(num_spaces) + << "prog_file and param_file: load from memory \n"; + } os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim << "\n"; os << GenSpaces(num_spaces) diff --git a/paddle/fluid/operators/impl/CMakeLists.txt b/paddle/fluid/operators/impl/CMakeLists.txt new file mode 100644 index 0000000000..1c0ff7c3d9 --- /dev/null +++ b/paddle/fluid/operators/impl/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(load_combine_impl SRCS load_combine.cc DEPS scope lod_tensor device_context op_registry data_type_transform) diff --git a/paddle/fluid/operators/impl/load_combine.cc b/paddle/fluid/operators/impl/load_combine.cc new file mode 100644 index 0000000000..454d583a10 --- /dev/null +++ b/paddle/fluid/operators/impl/load_combine.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/impl/load_combine.h" + +namespace paddle { +namespace operators { +namespace impl { + +void LoadParamsFromStream(const std::vector &out_var_names, + const paddle::platform::Place &place, + bool load_as_fp16, std::istream *buffer, + const paddle::framework::Scope *scope) { + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); + for (size_t i = 0; i < out_var_names.size(); i++) { + auto *out_var = scope->FindVar(out_var_names[i]); + + PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", + out_var_names[i]); + + auto *tensor = out_var->GetMutable(); + + // Get data from fin to tensor + DeserializeFromStream(*buffer, tensor, *dev_ctx); + + auto in_dtype = framework::ToDataType(tensor->type()); + auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; + + if (in_dtype != out_dtype) { + // convert to float16 tensor + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor fp16_tensor; + // copy LoD info to the new tensor + fp16_tensor.set_lod(tensor->lod()); + framework::TransDataType(in_kernel_type, out_kernel_type, *tensor, + &fp16_tensor); + + // reset output tensor + out_var->Clear(); + tensor = out_var->GetMutable(); + tensor->set_lod(fp16_tensor.lod()); + tensor->ShareDataWith(fp16_tensor); + } + } +} + +} // namespace impl +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/impl/load_combine.h b/paddle/fluid/operators/impl/load_combine.h new file mode 100644 index 0000000000..53ffcaf43a --- /dev/null +++ b/paddle/fluid/operators/impl/load_combine.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace impl { + +// Load parameters from a single stream. +void LoadParamsFromStream(const std::vector &out_var_names, + const platform::Place &place, bool load_as_fp16, + std::istream *buffer, const framework::Scope *scope); + +} // namespace impl +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index 0522a94195..8cb7c6c9c6 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -32,16 +32,26 @@ class LoadCombineOp : public framework::OperatorBase { const platform::Place &place) const override { auto filename = Attr("file_path"); auto load_as_fp16 = Attr("load_as_fp16"); - - std::ifstream fin(filename); - PADDLE_ENFORCE(static_cast(fin), - "Cannot open file %s for load_combine op", filename); - + auto is_memory_load = Attr("is_memory_load"); auto out_var_names = Outputs("Out"); PADDLE_ENFORCE_GT( static_cast(out_var_names.size()), 0, "The number of output variables should be greater than 0."); - + if (!is_memory_load) { + std::ifstream fin(filename); + PADDLE_ENFORCE(static_cast(fin), + "Cannot open file %s for load_combine op", filename); + LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); + } else { + PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory"); + std::stringstream fin(filename); + LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); + } + } + void LoadParamsFromBuffer( + const framework::Scope &scope, const platform::Place &place, + std::istream *buffer, bool load_as_fp16, + const std::vector &out_var_names) const { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); @@ -54,11 +64,10 @@ class LoadCombineOp : public framework::OperatorBase { auto *tensor = out_var->GetMutable(); // Error checking - PADDLE_ENFORCE(static_cast(fin), "Cannot read more from file %s", - filename); + PADDLE_ENFORCE(static_cast(buffer), "Cannot read more"); // Get data from fin to tensor - DeserializeFromStream(fin, tensor, dev_ctx); + DeserializeFromStream(*buffer, tensor, dev_ctx); auto in_dtype = framework::ToDataType(tensor->type()); auto out_dtype = @@ -103,11 +112,17 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { "LoDTensors will be loaded from \"file_path\".") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); + AddAttr("is_memory_load", + "(boolean, default false)" + "If true, file_path is in memory, and LoDTensors will be " + "loaded directly from memory") + .SetDefault(false); AddComment(R"DOC( LoadCombine Operator. -LoadCombine operator loads LoDTensor variables from a file. The file should -contain one or more LoDTensors serialized using the SaveCombine operator. The +LoadCombine operator loads LoDTensor variables from a file, which could be +loaded in memory already. The file should contain one or more LoDTensors +serialized using the SaveCombine operator. The LoadCombine operator applies a deserialization strategy to appropriately load the LodTensors, and this strategy complements the serialization strategy used in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled From 42359e88a409822581e42f64d21f675a90441cd3 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Thu, 6 Dec 2018 17:29:27 +0800 Subject: [PATCH 147/719] clean code test=develop --- .../fluid/inference/api/analysis_predictor.cc | 9 +-- paddle/fluid/inference/io.cc | 1 - paddle/fluid/operators/impl/CMakeLists.txt | 1 - paddle/fluid/operators/impl/load_combine.cc | 61 ------------------- paddle/fluid/operators/impl/load_combine.h | 33 ---------- 5 files changed, 5 insertions(+), 100 deletions(-) delete mode 100644 paddle/fluid/operators/impl/CMakeLists.txt delete mode 100644 paddle/fluid/operators/impl/load_combine.cc delete mode 100644 paddle/fluid/operators/impl/load_combine.h diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6091c5a625..f70c12dc87 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -304,7 +304,6 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, // NOTE All the members in AnalysisConfig should be copied to Argument. void AnalysisPredictor::OptimizeInferenceProgram() { - LOG(INFO) << "optimization program"; status_program_optimized_ = true; argument_.SetUseGPU(config_.use_gpu); @@ -313,11 +312,13 @@ void AnalysisPredictor::OptimizeInferenceProgram() { // Analyze inference_program if (!config_.model_dir.empty()) { argument_.SetModelDir(config_.model_dir); - } else if (!config_.param_file.empty() && !config_.prog_file.empty()) { + } else { + PADDLE_ENFORCE( + !config_.param_file.empty(), + "Either model_dir or (param_file, prog_file) should be set."); + PADDLE_ENFORCE(!config_.prog_file.empty()); argument_.SetModelProgramPath(config_.prog_file); argument_.SetModelParamsPath(config_.param_file); - } else { - PADDLE_THROW("Either model_dir or (param_file, prog_file) should be set."); } if (config_.use_gpu && config_.use_tensorrt_) { diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 053c516f88..060d6a89d4 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/version.h" -#include "paddle/fluid/operators/impl/load_combine.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/pybind/pybind.h" diff --git a/paddle/fluid/operators/impl/CMakeLists.txt b/paddle/fluid/operators/impl/CMakeLists.txt deleted file mode 100644 index 1c0ff7c3d9..0000000000 --- a/paddle/fluid/operators/impl/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -cc_library(load_combine_impl SRCS load_combine.cc DEPS scope lod_tensor device_context op_registry data_type_transform) diff --git a/paddle/fluid/operators/impl/load_combine.cc b/paddle/fluid/operators/impl/load_combine.cc deleted file mode 100644 index 454d583a10..0000000000 --- a/paddle/fluid/operators/impl/load_combine.cc +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/impl/load_combine.h" - -namespace paddle { -namespace operators { -namespace impl { - -void LoadParamsFromStream(const std::vector &out_var_names, - const paddle::platform::Place &place, - bool load_as_fp16, std::istream *buffer, - const paddle::framework::Scope *scope) { - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - for (size_t i = 0; i < out_var_names.size(); i++) { - auto *out_var = scope->FindVar(out_var_names[i]); - - PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", - out_var_names[i]); - - auto *tensor = out_var->GetMutable(); - - // Get data from fin to tensor - DeserializeFromStream(*buffer, tensor, *dev_ctx); - - auto in_dtype = framework::ToDataType(tensor->type()); - auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; - - if (in_dtype != out_dtype) { - // convert to float16 tensor - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor fp16_tensor; - // copy LoD info to the new tensor - fp16_tensor.set_lod(tensor->lod()); - framework::TransDataType(in_kernel_type, out_kernel_type, *tensor, - &fp16_tensor); - - // reset output tensor - out_var->Clear(); - tensor = out_var->GetMutable(); - tensor->set_lod(fp16_tensor.lod()); - tensor->ShareDataWith(fp16_tensor); - } - } -} - -} // namespace impl -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/impl/load_combine.h b/paddle/fluid/operators/impl/load_combine.h deleted file mode 100644 index 53ffcaf43a..0000000000 --- a/paddle/fluid/operators/impl/load_combine.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace operators { -namespace impl { - -// Load parameters from a single stream. -void LoadParamsFromStream(const std::vector &out_var_names, - const platform::Place &place, bool load_as_fp16, - std::istream *buffer, const framework::Scope *scope); - -} // namespace impl -} // namespace operators -} // namespace paddle From 7fa2e821e470411b75ba0f53a3759fa007391745 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 6 Dec 2018 17:53:05 +0800 Subject: [PATCH 148/719] add local scope in nce --- paddle/fluid/operators/nce_op.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 7397d9f473..afb14c3071 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -194,6 +194,8 @@ class NCEKernel : public framework::OpKernel { sample_out_data[i] += result(0); sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); } + + context.scope().DeleteScope(&local_scope); #endif } else { auto weight_mat = From 93551a3440d719f161b6b39309c90cdd19218d75 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Thu, 6 Dec 2018 17:54:19 +0800 Subject: [PATCH 149/719] update API.spec --- paddle/fluid/API.spec | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 2722ea078e..e273a852a9 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -32,13 +32,6 @@ paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.c paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) @@ -66,6 +59,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) +paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label_pos'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)) @@ -76,7 +70,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) +paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) @@ -182,7 +176,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -194,9 +188,6 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -301,7 +292,6 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) @@ -422,17 +412,3 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable -paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) -paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) -paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) -paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None) -paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None) -paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None) -paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)) -paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')) -paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')) -paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)) -paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) -paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None) -paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)) From b51df398749af98a40d7be49b2a7d1cc7bc3f128 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Thu, 6 Dec 2018 17:58:22 +0800 Subject: [PATCH 150/719] update , test=develop --- python/paddle/fluid/tests/unittests/test_bpr_loss_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py index 7e18913a03..2af6461aed 100644 --- a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py @@ -25,7 +25,7 @@ class TestBprLossOp1(OpTest): def setUp(self): self.op_type = "bpr_loss" - batch_size = 3 + batch_size = 4 class_num = 5 X = randomize_probability(batch_size, class_num, dtype='float64') label_pos = np.random.randint( From f6a877bc571d1db3d6de544e08e13adb0445dfe6 Mon Sep 17 00:00:00 2001 From: flame Date: Thu, 6 Dec 2018 18:35:08 +0800 Subject: [PATCH 151/719] add tool to visualize inference model (#14621) --- paddle/fluid/inference/utils/CMakeLists.txt | 5 ++ paddle/fluid/inference/utils/visualizer.cc | 92 +++++++++++++++++++++ paddle/fluid/inference/utils/visualizer.h | 42 ++++++++++ 3 files changed, 139 insertions(+) create mode 100644 paddle/fluid/inference/utils/visualizer.cc create mode 100644 paddle/fluid/inference/utils/visualizer.h diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index 2104e4ac72..cfb80fe6ec 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -1,2 +1,7 @@ cc_library(benchmark SRCS benchmark.cc DEPS enforce) cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) +cc_binary(visualizer SRCS visualizer.cc DEPS analysis + paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes) +if(WIN32) + target_link_libraries(visualizer shlwapi) +endif(WIN32) diff --git a/paddle/fluid/inference/utils/visualizer.cc b/paddle/fluid/inference/utils/visualizer.cc new file mode 100644 index 0000000000..040b6476fb --- /dev/null +++ b/paddle/fluid/inference/utils/visualizer.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/utils/visualizer.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" +#include "paddle/fluid/platform/init.h" + +DEFINE_string(model_dir, "", "model directory"); +DEFINE_string(model_program_path, "", "model program path"); +DEFINE_string(model_params_path, "", "model params path"); + +USE_PASS(graph_viz_pass); +USE_PASS(graph_to_program_pass); + +using paddle::inference::analysis::Argument; + +namespace paddle { +namespace inference { +namespace utils { + +void Visualizer::SetArgument(Argument *argument) { argument_ = argument; } + +bool Visualizer::Run() { + paddle::framework::InitDevices(false); + paddle::inference::analysis::Analyzer().Run(argument_); + + return true; +} + +} // namespace utils +} // namespace inference +} // namespace paddle + +// Generate a dot file describing the structure of graph. +// To use this tool, run command: ./visualizer [options...] +// Options: +// --model_dir: the directory of model +// --model_program_path: the path of program +// --model_params_path: the path of params +int main(int argc, char *argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + google::InitGoogleLogging(argv[0]); + + paddle::inference::analysis::Argument argument; + argument.SetUseGPU(false); + argument.SetUseTensorRT(false); + + if (FLAGS_model_dir.empty()) { + if (FLAGS_model_program_path.empty() || FLAGS_model_params_path.empty()) { + LOG(ERROR) << "Please set model_dir" + " or model_program_path and model_params_path"; + return -1; + } else { + argument.SetModelProgramPath(FLAGS_model_program_path); + argument.SetModelParamsPath(FLAGS_model_params_path); + } + } else { + argument.SetModelDir(FLAGS_model_dir); + } + + // Only 1 pass, default filename is 0_ir_origin.dot + // For more details, looking for paddle::inference::analysis::IRPassManager + argument.SetIrAnalysisPasses({"graph_viz_pass"}); + + std::unique_ptr scope{ + new paddle::framework::Scope()}; + argument.SetScopeNotOwned( + const_cast(scope.get())); + + paddle::inference::utils::Visualizer visualizer; + visualizer.SetArgument(&argument); + visualizer.Run(); + + return 0; +} diff --git a/paddle/fluid/inference/utils/visualizer.h b/paddle/fluid/inference/utils/visualizer.h new file mode 100644 index 0000000000..be532f92cf --- /dev/null +++ b/paddle/fluid/inference/utils/visualizer.h @@ -0,0 +1,42 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/inference/analysis/argument.h" + +namespace paddle { +namespace inference { +namespace utils { + +using paddle::inference::analysis::Argument; + +class Visualizer final { + public: + Visualizer() = default; + ~Visualizer() = default; + Visualizer(const Visualizer &) = delete; + Visualizer &operator=(const Visualizer &) = delete; + + void SetArgument(Argument *); + bool Run(); + + private: + Argument *argument_; +}; + +} // namespace utils +} // namespace inference +} // namespace paddle From 5f98d8003966ebda1dd144ce7aee8a996f608f62 Mon Sep 17 00:00:00 2001 From: wangguibao Date: Thu, 6 Dec 2018 20:49:43 +0800 Subject: [PATCH 152/719] AsyncExecutor bugfix: Tensor change to LoDTensor --- paddle/fluid/framework/data_feed.cc | 43 +++++++----------------- paddle/fluid/framework/data_feed.h | 31 +---------------- paddle/fluid/framework/data_feed_test.cc | 15 +++------ 3 files changed, 18 insertions(+), 71 deletions(-) diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 291d8ffc3c..9a4b115aee 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -33,11 +33,7 @@ void DataFeed::AddFeedVar(Variable* var, const std::string& name) { CheckInit(); for (size_t i = 0; i < use_slots_.size(); ++i) { if (name == use_slots_[i]) { - if (use_slots_is_dense_[i]) { - feed_vec_[i] = MixTensor(var->GetMutable()); - } else { - feed_vec_[i] = MixTensor(var->GetMutable()); - } + feed_vec_[i] = var->GetMutable(); } } } @@ -350,34 +346,21 @@ void MultiSlotDataFeed::PutToFeedVec( int total_instance = static_cast(offset.back()); if (type[0] == 'f') { // float const auto& feasign = ins_vec[i].GetFloatData(); - if (feed_vec_[i].IsDense()) { - int size_in_each_batch = total_instance / batch_size_; - float* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data( - {batch_size_, size_in_each_batch}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); - } else { - float* tensor_ptr = feed_vec_[i].GetLoDTensor()->mutable_data( - {total_instance, 1}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); - LoD data_lod{offset}; - feed_vec_[i].GetLoDTensor()->set_lod(data_lod); - } + float* tensor_ptr = feed_vec_[i]->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); } else if (type[0] == 'u') { // uint64 // no uint64_t type in paddlepaddle const auto& feasign = ins_vec[i].GetUint64Data(); - if (feed_vec_[i].IsDense()) { - int size_in_each_batch = total_instance / batch_size_; - int64_t* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data( - {batch_size_, size_in_each_batch}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); - } else { - int64_t* tensor_ptr = - feed_vec_[i].GetLoDTensor()->mutable_data( - {total_instance, 1}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); - LoD data_lod{offset}; - feed_vec_[i].GetLoDTensor()->set_lod(data_lod); - } + int64_t* tensor_ptr = feed_vec_[i]->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); + } + LoD data_lod{offset}; + feed_vec_[i]->set_lod(data_lod); + if (use_slots_is_dense_[i]) { + int dim = total_instance / batch_size_; + feed_vec_[i]->Resize({batch_size_, dim}); } } } diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index a7f8d1d317..7cc6919703 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -30,35 +30,6 @@ limitations under the License. */ namespace paddle { namespace framework { -// Pack Tensor type and LoDTensor type into MixTensor type, in order -// to record either Tensor or LoDTensor information at the same time. -class MixTensor { - public: - MixTensor() {} - explicit MixTensor(LoDTensor* lodtensor) { - is_dense_ = false; - lodtensor_ = lodtensor; - } - explicit MixTensor(Tensor* tensor) { - is_dense_ = true; - tensor_ = tensor; - } - bool IsDense() { return is_dense_; } - LoDTensor* GetLoDTensor() { - PADDLE_ENFORCE(!is_dense_, "Let a dense var return a LoDTensor ptr."); - return lodtensor_; - } - Tensor* GetTensor() { - PADDLE_ENFORCE(is_dense_, "Let a sparse var return a Tensor ptr."); - return tensor_; - } - - private: - bool is_dense_; - LoDTensor* lodtensor_; - Tensor* tensor_; -}; - // DataFeed is the base virtual class for all ohther DataFeeds. // It is used to read files and parse the data for subsequent trainer. // Example: @@ -133,7 +104,7 @@ class DataFeed { use_slots_index_; // -1: not used; >=0: the index of use_slots_ // The data read by DataFeed will be stored here - std::vector feed_vec_; + std::vector feed_vec_; // the batch size defined by user int default_batch_size_; diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc index 3974f8dbad..b3e9698715 100644 --- a/paddle/fluid/framework/data_feed_test.cc +++ b/paddle/fluid/framework/data_feed_test.cc @@ -152,19 +152,13 @@ void GetElemSetFromReader(std::vector* reader_elem_set, const auto& multi_slot_desc = data_feed_desc.multi_slot_desc(); std::map lodtensor_targets; - std::map tensor_targets; for (int i = 0; i < multi_slot_desc.slots_size(); ++i) { const auto& slot = multi_slot_desc.slots(i); if (slot.is_used()) { const auto& name = slot.name(); readers[idx]->AddFeedVar(scope->Var(name), name); - if (slot.is_dense()) { - tensor_targets[name] = - &scope->FindVar(name)->Get(); - } else { - lodtensor_targets[name] = - &scope->FindVar(name)->Get(); - } + lodtensor_targets[name] = + &scope->FindVar(name)->Get(); } } readers[idx]->Start(); @@ -175,8 +169,9 @@ void GetElemSetFromReader(std::vector* reader_elem_set, if (!slot.is_used()) { continue; } + const paddle::framework::LoDTensor* tens = + lodtensor_targets[slot.name()]; if (slot.is_dense()) { // dense branch - const paddle::framework::Tensor* tens = tensor_targets[slot.name()]; if (slot.type() == "uint64") { const int64_t* data = tens->data(); int batch_size = tens->dims()[0]; @@ -202,8 +197,6 @@ void GetElemSetFromReader(std::vector* reader_elem_set, PADDLE_THROW("Error type in proto file."); } } else { // sparse branch - const paddle::framework::LoDTensor* tens = - lodtensor_targets[slot.name()]; if (slot.type() == "uint64") { const int64_t* data = tens->data(); for (size_t i = 0; i < tens->NumElements(); ++i) { From 5a2cd4505b34af88a49ac5ca886b50a2e9b78430 Mon Sep 17 00:00:00 2001 From: wangguibao Date: Thu, 6 Dec 2018 20:55:21 +0800 Subject: [PATCH 153/719] AsyncExecutor bugfix: Tensor to LoDTensor test=develop --- paddle/fluid/framework/data_feed.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 9a4b115aee..a99cf53b41 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -297,6 +297,7 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector* instance) { "the data, please check if the data contains unresolvable " "characters.\nplease check this error line: %s", str); + if (idx != -1) { (*instance)[idx].Init(all_slots_type_[i]); if ((*instance)[idx].GetType()[0] == 'f') { // float @@ -333,6 +334,7 @@ void MultiSlotDataFeed::AddInstanceToInsVec( (*ins_vec)[i].InitOffset(); } } + for (size_t i = 0; i < instance.size(); ++i) { (*ins_vec)[i].AddIns(instance[i]); } @@ -344,6 +346,7 @@ void MultiSlotDataFeed::PutToFeedVec( const auto& type = ins_vec[i].GetType(); const auto& offset = ins_vec[i].GetOffset(); int total_instance = static_cast(offset.back()); + if (type[0] == 'f') { // float const auto& feasign = ins_vec[i].GetFloatData(); float* tensor_ptr = feed_vec_[i]->mutable_data( @@ -356,6 +359,7 @@ void MultiSlotDataFeed::PutToFeedVec( {total_instance, 1}, platform::CPUPlace()); memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); } + LoD data_lod{offset}; feed_vec_[i]->set_lod(data_lod); if (use_slots_is_dense_[i]) { From ce674b685f89a66ef0b9163b76b456a9580e5f41 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 6 Dec 2018 05:24:50 +0000 Subject: [PATCH 154/719] add readme doc and complete TODOs --- paddle/fluid/operators/jitkernels/README.md | 46 ++++++++++++++++++- .../fluid/operators/jitkernels/jitcode_base.h | 6 +-- .../fluid/operators/jitkernels/kernel_base.h | 13 ++++-- .../fluid/operators/jitkernels/kernel_pool.h | 14 +----- .../fluid/operators/jitkernels/more/mkl/mkl.h | 11 +---- .../fluid/operators/jitkernels/refer/refer.h | 4 +- paddle/fluid/operators/jitkernels/registry.h | 2 +- paddle/fluid/operators/jitkernels/test.cc | 8 ++-- 8 files changed, 65 insertions(+), 39 deletions(-) diff --git a/paddle/fluid/operators/jitkernels/README.md b/paddle/fluid/operators/jitkernels/README.md index 3401e9be53..fd6428b43e 100644 --- a/paddle/fluid/operators/jitkernels/README.md +++ b/paddle/fluid/operators/jitkernels/README.md @@ -1,4 +1,46 @@ -TBD +# JIT Kernel + +结合函数模板和JIT生成需要的kernel函数。 +这里的kernel是比Operator中kernel更小级别的算子单元,更侧重的是在不同硬件上的性能。 +目前仅支持CPU上的高性能计算。 + +## 目录结构 + +```txt +PaddlePaddle/Paddle/paddle/fluid/ +├── ... +├── operator/ +│ ├── .../ +└── jit/ + ├── ... + ├── jitcode/ + │ └── ... + |── more/ + │ ├── ... + │ ├── mkl/ + │ │ └── ... + │ └── openblas/ + │ └── ... + └── refer/ + └── ... +``` + +基础class都的根目录下,根目录下包括jitcode,more和refer。每个目录下都是一种实现,每种kernel算子都需要有reference的实现,其他的都是可选的。 +- jitcode: 代表使用jit生成的code,需要依赖xbyak。他关心的是性能。 +- refer:代表reference的实现,每种kernel算子都需要有在CPU上的reference的实现,他主要关心的算法逻辑。 +- more: 下面可以放入跟多实现,包括mkl,mkldnn,openblas等,也可以是自身已有的kernel组合。 -# Use me +## 动态获取 + +提供一个get方法,根据kernel类别获取,每种实现都有自己的使用范围,根据范围动态和当前条件选择需要的kernel函数。 + +## 测试 + +- 逻辑测试 + 所有实现都要与refer的code对比,需要满足精度要求 +- 性能测试 + +# 如何添加新的算子 +TBD +## Use me Add USE_JIT_KERNEL(yourname) to CMakefile. diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.h b/paddle/fluid/operators/jitkernels/jitcode_base.h index ffec62163a..de8aaf229f 100644 --- a/paddle/fluid/operators/jitkernels/jitcode_base.h +++ b/paddle/fluid/operators/jitkernels/jitcode_base.h @@ -62,11 +62,7 @@ class JitBase : public Kernel { }; template -std::unique_ptr CreateJitCode(Attr attr); //{ -// if (UseJitCode) { -// return make_unique(attr, CodeSize()); -// } -// } +std::unique_ptr CreateJitCode(Attr attr); } // namespace jitkernels } // namespace operators diff --git a/paddle/fluid/operators/jitkernels/kernel_base.h b/paddle/fluid/operators/jitkernels/kernel_base.h index eeaa0617cb..6fbb0f9f7e 100644 --- a/paddle/fluid/operators/jitkernels/kernel_base.h +++ b/paddle/fluid/operators/jitkernels/kernel_base.h @@ -21,6 +21,13 @@ namespace jitkernels { typedef enum { vmul = 0, vadd = 1, vsub, vexp } KernelType; +template +struct VMulTypes { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, const T*, T*, int); +}; + // Just for adding to kernel pool without template class Kernel { public: @@ -29,10 +36,10 @@ class Kernel { DISABLE_COPY_AND_ASSIGN(Kernel); }; -template // TODO(TJ): use tuple +template class KernelImpl : public Kernel { public: - using ELEMENT_TYPE = T; // TODO(TJ): remove me? + using ELEMENT_TYPE = T; virtual Func GetFunc() const { return func; } virtual bool UseMe(Attr attr) const = 0; @@ -40,7 +47,7 @@ class KernelImpl : public Kernel { Func func{nullptr}; }; -template // TODO(TJ): use tuple +template class ReferKernel : public KernelImpl { public: // Refer code can always be used diff --git a/paddle/fluid/operators/jitkernels/kernel_pool.h b/paddle/fluid/operators/jitkernels/kernel_pool.h index f398093dfe..901a891cb3 100644 --- a/paddle/fluid/operators/jitkernels/kernel_pool.h +++ b/paddle/fluid/operators/jitkernels/kernel_pool.h @@ -27,8 +27,6 @@ namespace paddle { namespace operators { namespace jitkernels { -// TODO(TJ): rename file to kernel_pool - template class JitCodePool { typedef std::unique_ptr JitBasePtr; @@ -54,14 +52,6 @@ class JitCodePool { DISABLE_COPY_AND_ASSIGN(JitCodePool); }; -// TODO(TJ): std::tuple -// template -// struct KernelAttr { -// typedef T data_type; -// typedef Func return_type; -// typedef Attr attr_type; -// }; - typedef std::unique_ptr KernelPtr; typedef std::unordered_map, KernelKey::Hash> KernelMap; @@ -120,7 +110,6 @@ inline Func GetRefer() { return nullptr; } -// TODO(TJ): make tuple? named KernelAttr template const Func Get(Attr attr) { @@ -130,8 +119,7 @@ const Func Get(Attr attr) { return codes.AllKernels().at(key)->template getCode(); } - if (std::is_same::value) { // TODO(TJ): float - // move to create + if (std::is_same::value) { auto p = CreateJitCode(attr); if (p) { auto f = p->template getCode(); diff --git a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h b/paddle/fluid/operators/jitkernels/more/mkl/mkl.h index 75ed34ef48..9cf032db43 100644 --- a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h +++ b/paddle/fluid/operators/jitkernels/more/mkl/mkl.h @@ -27,16 +27,9 @@ namespace mkl { template void VMul(const T* x, const T* y, T* z, int n); -// template -// struct VMulTypes{ -// typedef T date_type; -// typedef void (*func)(const T*, const T*, T*, int) func_type; -// typedef int attr_type; -// }; - template -class VMulKernel - : public KernelImpl { +class VMulKernel : public KernelImpl::func_type, + typename VMulTypes::attr_type> { public: VMulKernel() { this->func = VMul; } bool UseMe(int d) const override { diff --git a/paddle/fluid/operators/jitkernels/refer/refer.h b/paddle/fluid/operators/jitkernels/refer/refer.h index 163c6d73dc..796f58d401 100644 --- a/paddle/fluid/operators/jitkernels/refer/refer.h +++ b/paddle/fluid/operators/jitkernels/refer/refer.h @@ -29,8 +29,8 @@ void VMul(const T* x, const T* y, T* z, int n) { } template -class VMulKernel - : public ReferKernel { +class VMulKernel : public ReferKernel::func_type, + typename VMulTypes::attr_type> { public: VMulKernel() { this->func = VMul; } }; diff --git a/paddle/fluid/operators/jitkernels/registry.h b/paddle/fluid/operators/jitkernels/registry.h index cd414bb096..6d817461be 100644 --- a/paddle/fluid/operators/jitkernels/registry.h +++ b/paddle/fluid/operators/jitkernels/registry.h @@ -26,7 +26,7 @@ namespace paddle { namespace operators { namespace jitkernels { -// make_unique is supported from c++14 +// make_unique is supported since c++14 template inline std::unique_ptr make_unique(Args&&... args) { static_assert(!std::is_array::value, "T must not be array"); diff --git a/paddle/fluid/operators/jitkernels/test.cc b/paddle/fluid/operators/jitkernels/test.cc index eb0d30eecd..d27b5d1cba 100644 --- a/paddle/fluid/operators/jitkernels/test.cc +++ b/paddle/fluid/operators/jitkernels/test.cc @@ -69,10 +69,10 @@ TEST(JitKernel, vmul) { namespace jit = paddle::operators::jitkernels; // TODO(TJ): test more vector size for (int d = 1; d < 30; ++d) { - auto ref = jit::GetRefer(); - auto tgt = jit::Get(d); + auto ref = jit::GetRefer::func_type, + jit::VMulTypes::attr_type>(); + auto tgt = jit::Get::func_type, + jit::VMulTypes::attr_type, PlaceType>(d); EXPECT_TRUE(ref != nullptr); EXPECT_TRUE(tgt != nullptr); From c9de6f1b05aa428d5e6ad9c16db5c2ca8c12cdc7 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 6 Dec 2018 21:16:10 +0800 Subject: [PATCH 155/719] init parallel graph mode --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 2 + .../framework/details/all_reduce_op_handle.cc | 28 +++- .../fluid/framework/details/build_strategy.cc | 1 + .../details/computation_op_handle.cc | 12 +- .../framework/details/computation_op_handle.h | 1 + .../framework/details/execution_strategy.h | 2 +- .../details/multi_devices_graph_pass.cc | 8 +- .../fluid/framework/details/op_handle_base.cc | 3 +- .../fluid/framework/details/op_handle_base.h | 1 - .../details/parallel_ssa_graph_executor.cc | 66 ++++++++++ .../details/parallel_ssa_graph_executor.h | 51 +++++++ .../scope_buffered_ssa_graph_executor.cc | 41 +++--- .../scope_buffered_ssa_graph_executor.h | 5 +- .../details/threaded_ssa_graph_executor.h | 1 + paddle/fluid/framework/details/var_handle.cc | 2 +- paddle/fluid/framework/parallel_executor.cc | 124 +++++++++++++----- paddle/fluid/framework/parallel_executor.h | 2 + paddle/fluid/framework/scope.cc | 5 +- paddle/fluid/framework/threadpool.cc | 16 ++- paddle/fluid/framework/threadpool.h | 4 +- paddle/fluid/framework/threadpool_test.cc | 44 +++++++ .../fluid/operators/reader/blocking_queue.h | 3 + .../fluid/operators/reader/buffered_reader.cc | 5 + .../reader/create_double_buffer_reader_op.cc | 14 +- .../operators/reader/create_py_reader_op.cc | 2 + .../fluid/operators/reader/open_files_op.cc | 2 + paddle/fluid/platform/nccl_helper.h | 7 +- paddle/fluid/platform/profiler.cc | 12 +- paddle/fluid/pybind/pybind.cc | 24 ++-- 30 files changed, 399 insertions(+), 91 deletions(-) create mode 100644 paddle/fluid/framework/details/parallel_ssa_graph_executor.cc create mode 100644 paddle/fluid/framework/details/parallel_ssa_graph_executor.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index c701a2ad63..b419c8c292 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -177,7 +177,7 @@ else() endif() cc_library(parallel_executor SRCS parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor variable_helper) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 93288936fe..6524753322 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -54,6 +54,8 @@ cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUT cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) +cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) + cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context broadcast_op_handle) cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index e8bf53e160..ae17ea8a15 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -46,20 +46,27 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { + int64_t start_ts = GetTS(); + int64_t func_ts = GetTS(); + VLOG(5) << "all_reduce_op_handle::RunImpl start"; platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, // this is a distributed or inter-process call, find a better way. #ifdef PADDLE_WITH_CUDA if (NoDummyInputSize() == 1 && - local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) { + local_scopes_[0]->FindVar(NCCL_ID_VARNAME) == nullptr) { #else if (NoDummyInputSize() == 1) { #endif return; // No need to all reduce when GPU count = 1; } else { // Wait input done + start_ts = GetTS(); WaitInputVarGenerated(); + VLOG(5) << "all_reduce_op_handle wait input var spent: " + << GetTS() - start_ts << " (ns)."; + start_ts = GetTS(); auto in_var_handles = DynamicCast(this->Inputs()); auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE_EQ( @@ -100,6 +107,8 @@ void AllReduceOpHandle::RunImpl() { } int dev_id = boost::get(p).device; + VLOG(5) << "call allreduce: " << in_var_handles[i]->name_ + << " on dev: " << dev_id; auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; @@ -110,11 +119,20 @@ void AllReduceOpHandle::RunImpl() { }); } this->RunAndRecordEvent([&] { - platform::NCCLGroupGuard guard; - for (auto &call : all_reduce_calls) { - call(); + // TODO(Yancey1989): need allreduce operator to avoid this flag + if (nccl_ctxs_->need_group_call_) { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + } else { + // only used in executor_type == ParallalGraph, one thread one GPU + // TODO(Yancey1989): use allreduce operator to avoid this tricky. + PADDLE_ENFORCE(all_reduce_calls.size() == 1UL); + all_reduce_calls[0](); } }); + #else PADDLE_THROW("Not compiled with CUDA"); #endif @@ -144,6 +162,8 @@ void AllReduceOpHandle::RunImpl() { } } } + VLOG(5) << "all_reduce_op_handle Impl spent: " << GetTS() - func_ts + << " (ns)."; } std::string AllReduceOpHandle::Name() const { return "all_reduce"; } diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 1e1b945f63..04c1061536 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -118,6 +118,7 @@ std::unique_ptr BuildStrategy::Apply( std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { + VLOG(5) << "run pass: " << pass->Type(); if (pass->Type() == "multi_devices_pass") { pass->Erase("places"); pass->SetNotOwned>("places", &places); diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7ad1e40c60..35ba99a879 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -33,10 +33,18 @@ void ComputationOpHandle::RunImpl() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; - if (is_lock_and_record_event_free_) { + if (Name().compare("conv2d") || Name().compare("conv2d_grad")) { + int64_t start_ts = GetTS(); + auto varname = DynamicCast(this->Outputs())[0]->name_; run_func(); + VLOG(5) << Name() << "_op_handle: " << varname + << " spent: " << GetTS() - start_ts << " (ns)."; } else { - this->RunAndRecordEvent(run_func); + if (is_lock_and_record_event_free_) { + run_func(); + } else { + this->RunAndRecordEvent(run_func); + } } } diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 662a91d6b4..5346b56dd6 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -17,6 +17,7 @@ #include #include +#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 15c496130c..d3d5b6bf54 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -20,7 +20,7 @@ namespace framework { namespace details { struct ExecutionStrategy { - enum ExecutorType { kDefault = 0, kExperimental = 1 }; + enum ExecutorType { kDefault = 0, kExperimental = 1, kParallelGraph = 2 }; size_t num_threads_{0}; bool use_cuda_{true}; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index cbae5321d9..1bd238357a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -300,7 +300,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; - int num_trainers = Get(kNumTrainers); + // int num_trainers = Get(kNumTrainers); for (auto &node : nodes) { if (node->IsVar() && node->Var()) { @@ -329,6 +329,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unordered_map sharded_var_device; for (ir::Node *node : sorted_ops) { + VLOG(5) << "op name: " << node->Op()->Type(); if (boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { @@ -365,9 +366,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( // is true only for the op that scale the final scalar loss. // It also assumes backward op will always follow the forward op in // the block. + VLOG(5) << "this is loss scale op!"; is_forwarding = false; } else { int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); + VLOG(5) << "on device id: " << op_dev_id; if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { @@ -386,7 +389,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } - if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { + // if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { + if (!is_forwarding && nccl_ctxs_->contexts_.size() > 1) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. if (static_cast(boost::get(node->Op()->GetAttr( diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4822627ac3..d68d1ce71d 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -41,6 +41,7 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA + int64_t start_ts = 0; if (events_.empty() && use_cuda) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; @@ -52,7 +53,6 @@ void OpHandleBase::Run(bool use_cuda) { #else PADDLE_ENFORCE(!use_cuda); #endif - RunImpl(); } @@ -125,6 +125,7 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) { void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_CUDA if (!events_.empty()) { // Use event + VLOG(5) << "events not empty"; std::function method = callback; for (auto &p : dev_ctxes_) { method = [method, p, this]() { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index ba12ca3c61..88c78e0678 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -26,7 +26,6 @@ namespace framework { namespace details { constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; - // Wraps ir::Node and provide helper utilities. // It's responsible for populating necessary fields of ir::Node. class OpHandleBase { diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc new file mode 100644 index 0000000000..72beb74aa4 --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &places, + std::vector> graphs) + : strategy_(std::move(strategy)), + local_scopes_(std::move(local_scopes)), + places_(std::move(places)), + graphs_(std::move(graphs)), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + for (size_t i = 0; i < places.size(); ++i) { + std::vector scopes = {local_scopes_[i]}; + std::vector places = {places_[i]}; + executors_.emplace_back(new details::ThreadedSSAGraphExecutor( + strategy_, scopes, places, std::move(graphs_[i]))); + } +} + +FeedFetchList ParallelSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + std::vector> run_futures; + FeedFetchList fetch_data; + + for (size_t i = 0; i < places_.size(); ++i) { + auto call = [this, i] { + // FIXME(Yancey1989): need to fix fetch data failed. + std::vector empty; + executors_[i]->Run(empty); + }; + if (pool_) { + run_futures.emplace_back(pool_->enqueue(std::move(call))); + } else { + call(); + } + } + if (pool_) { + for (auto &f : run_futures) { + f.wait(); + } + } + return fetch_data; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h new file mode 100644 index 0000000000..c0ba1577f7 --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "ThreadPool.h" +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +class ParallelSSAGraphExecutor : public SSAGraphExecutor { + public: + ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &places, + std::vector> graphs); + ~ParallelSSAGraphExecutor() final = default; + const ir::Graph &Graph() const override { return *graphs_[0]; } + + FeedFetchList Run(const std::vector &fetch_tensors) override; + + private: + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::vector places_; + std::vector> graphs_; + std::unique_ptr<::ThreadPool> pool_; + + std::vector> executors_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 499246a985..abc6b9f559 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -27,39 +27,40 @@ namespace framework { namespace details { ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( ExecutionStrategy strategy, std::vector local_scopes, - std::vector var_infos, std::vector places, + std::vector> var_infos_list, + std::vector places, std::unique_ptr &&underlying_executor) : strategy_(std::move(strategy)), underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), - var_infos_(std::move(var_infos)), + var_infos_list_(std::move(var_infos_list)), places_(std::move(places)) {} FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { if (drop_scope_counter_ == 0) { // Create local scopes. - for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) { - auto &scope = *it; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &scope = local_scopes_[i]; Scope &local_scope = scope->NewScope(); *scope->Var(details::kLocalExecScopeName)->GetMutable() = &local_scope; - - for (auto &info : var_infos_) { - if (scope->FindVar(info.name_) != nullptr) { - continue; - } - - if (info.persistable_) { // Persistable - InitializeVariable(scope->Var(info.name_), info.type_); - } else { - InitializeVariable(local_scope.Var(info.name_), info.type_); + for (auto &var_infos : var_infos_list_) { + for (auto &info : var_infos) { + if (scope->FindVar(info.name_) != nullptr) { + continue; + } + if (info.persistable_) { // Persistable + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + InitializeVariable(local_scope.Var(info.name_), info.type_); + } } } } } std::vector fetch_data; - std::exception_ptr eptr; + std::exception_ptr eptr = nullptr; try { fetch_data = underlying_executor_->Run(fetch_tensors); } catch (...) { @@ -71,9 +72,13 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( #ifdef PADDLE_WITH_CUDA const std::string gc_name = "garbage_collector"; - DeviceGarbageCollectorMap *gc = - Graph().Has(gc_name) ? &(Graph().Get(gc_name)) - : nullptr; + DeviceGarbageCollectorMap *gc = nullptr; + // FIXME(Yancey1989): need to fix gc failed on parallel graph mode + if (strategy_.type_ != ExecutionStrategy::kParallelGraph) { + gc = Graph().Has(gc_name) + ? &(Graph().Get(gc_name)) + : nullptr; + } #endif if (!fetch_tensors.empty() || diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 5e87e0bf50..51230d4a42 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -38,7 +38,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { public: ScopeBufferedSSAGraphExecutor( ExecutionStrategy strategy, std::vector local_scopes, - std::vector var_infos, std::vector places, + std::vector> var_info_list, + std::vector places, std::unique_ptr&& underlying_executor); const ir::Graph& Graph() const override { @@ -53,7 +54,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy_; std::unique_ptr underlying_executor_; std::vector local_scopes_; - std::vector var_infos_; + std::vector> var_infos_list_; std::vector places_; }; } // namespace details diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 24da56c09e..b45afbc046 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -24,6 +24,7 @@ #include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc index 30da029ca2..7de6025a28 100644 --- a/paddle/fluid/framework/details/var_handle.cc +++ b/paddle/fluid/framework/details/var_handle.cc @@ -20,7 +20,7 @@ namespace details { VarHandleBase::~VarHandleBase() {} -VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); } +VarHandle::~VarHandle() { VLOG(5) << "deleting var handle " << DebugString(); } std::string VarHandle::DebugString() const { std::stringstream ss; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b98408ee77..ff3d76fb01 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -53,6 +54,7 @@ class ParallelExecutorPrivate { std::vector local_scopes_; Scope *global_scope_; // not owned std::unique_ptr executor_; + std::vector> executors_; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; @@ -84,6 +86,9 @@ ParallelExecutor::ParallelExecutor( PADDLE_ENFORCE(places.size() > 1, "If you set build_strategy.reduce with 'Reduce'," "the number of places must be greater than 1."); + PADDLE_ENFORCE(exec_strategy.type_ != ExecutionStrategy::kParallelGraph, + "You should set build_strategy.reduce with 'AllReduce' for " + "ParallelGraph executor type"); } // Step 1. Bcast the params to devs. @@ -106,31 +111,55 @@ ParallelExecutor::ParallelExecutor( // Bcast Parameters to all GPUs #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); - ncclUniqueId *nccl_id = nullptr; + std::unique_ptr nccl_id = nullptr; + bool need_group_call = true; if (nccl_id_var != nullptr) { - nccl_id = nccl_id_var->GetMutable(); + nccl_id.reset(nccl_id_var->GetMutable()); + } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + nccl_id.reset(new ncclUniqueId()); + PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id.get())); + *member_->global_scope_->Var(NCCL_ID_VARNAME) + ->GetMutable() = *nccl_id.get(); + need_group_call = false; + } else { + // init nccl_id in NCCLContextMap } + member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id, num_trainers, trainer_id)); + member_->places_, nccl_id.get(), num_trainers, trainer_id, + need_group_call)); #else PADDLE_THROW("Not compiled with CUDA"); #endif } - if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { BCastParamsToDevices(bcast_vars); } -// Startup Program has been run. All local scopes has correct parameters. + // Startup Program has been run. All local scopes has correct parameters. -// Step 2. Convert main_program to SSA form and dependency graph. Also, insert -// ncclOp + // Step 2. Convert main_program to SSA form and dependency graph. Also, insert + // ncclOp + std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, params, - member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); + if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + for (size_t i = 0; i < member_->places_.size(); ++i) { + std::unique_ptr graph = build_strategy.Apply( + main_program, {member_->places_[i]}, loss_var_name, params, + {member_->local_scopes_[i]}, member_->use_cuda_, + member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); + } + } else { + std::unique_ptr graph = build_strategy.Apply( + main_program, member_->places_, loss_var_name, params, + member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); + } auto max_memory_size = GetEagerDeletionThreshold(); - if (max_memory_size >= 0) { + // FIXME(Yancey1989): need to fix on parallel graph mode + if (max_memory_size >= 0 && + exec_strategy.type_ != ExecutionStrategy::kParallelGraph) { for (auto &place : member_->places_) { if (!platform::is_gpu_place(place)) continue; auto gpu_place = boost::get(place); @@ -143,40 +172,48 @@ ParallelExecutor::ParallelExecutor( } } if (!gcs_.empty()) { - auto ref_cnt_pass = - ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graph = ref_cnt_pass->Apply(std::move(graph)); - graph->SetNotOwned("garbage_collector", &gcs_); + for (size_t i = 0; i < graphs.size(); ++i) { + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); + ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); + ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + graphs[0] = ref_cnt_pass->Apply(std::move(graphs[i])); + graphs[0]->SetNotOwned("garbage_collector", &gcs_); + } } } #else std::unique_ptr graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_); + graphs.push_back(std::move(graph)); #endif // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars - std::vector var_infos; - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); + std::vector> var_infos_list; + for (size_t i = 0; i < graphs.size(); ++i) { + std::vector var_infos; + for (auto &node : graphs[i]->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } } + var_infos_list.emplace_back(std::move(var_infos)); } + // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { - size_t graph_num = ir::GraphNum(*graph); + size_t graph_num = ir::GraphNum(*graphs[0]); if (graph_num > 1) { LOG(WARNING) << "The number of graph should be only one, " "but the current graph has " - << ir::GraphNum(*graph) + << ir::GraphNum(*graphs[0]) << " sub_graphs. If you want to see the nodes of the " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "to specify the output dir. NOTES: if you not do training, " @@ -185,15 +222,42 @@ ParallelExecutor::ParallelExecutor( } if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + /** + for (size_t i = 0; i < member_->places_.size(); ++i) { + std::vector var_infos; + for (auto &node : graphs[i]->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } + } + + std::vector places = {member_->places_[i]}; + std::vector scopes = {member_->local_scopes_[i]}; + std::unique_ptr p(new + details::ThreadedSSAGraphExecutor( + exec_strategy, scopes, places, std::move(graphs[i]))); + + member_->executors_.push_back(std::move(p)); + + member_->executors_[i].reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, scopes, std::move(var_infos), places, + std::move(member_->executors_[i]))); + }**/ member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, places, std::move(graphs[0]))); + } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + member_->executor_.reset(new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, places, graphs)); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, places, std::move(graphs[0]))); } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, std::move(var_infos), + exec_strategy, member_->local_scopes_, std::move(var_infos_list), member_->places_, std::move(member_->executor_))); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ef09b98b2a..319701f1eb 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -20,6 +20,8 @@ limitations under the License. */ #include #include +#include "ThreadPool.h" + #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/executor.h" diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0d261dd7cc..873f68e42e 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -58,7 +58,10 @@ int64_t GetEagerDeletionThreshold() { (static_cast(1) << 30)); } -Scope::~Scope() { DropKids(); } +Scope::~Scope() { + VLOG(5) << "~Scope()"; + DropKids(); +} Scope& Scope::NewScope() const { SCOPE_LOCK_GUARD diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index fcec955360..7dc7430c55 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -48,9 +48,18 @@ void ThreadPool::Init() { ThreadPool::ThreadPool(int num_threads) : running_(true) { threads_.resize(num_threads); - for (auto& thread : threads_) { + for (int i = 0; i < num_threads; ++i) { + // for (auto& thread : threads_) { // TODO(Yancey1989): binding the thread on the specify CPU number - thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this))); + threads_[i].reset( + new std::thread(std::bind(&ThreadPool::TaskLoop, this, i))); + /** + sched_param sch; + int policy; + pthread_getschedparam(threads_[i]->native_handle(), &policy, &sch); + if (pthread_setschedparam(threads_[i]->native_handle(), SCHED_FIFO, &sch)) { + VLOG(1) << "Failed to setschedparam: " << errno; + }**/ } } @@ -68,7 +77,7 @@ ThreadPool::~ThreadPool() { } } -void ThreadPool::TaskLoop() { +void ThreadPool::TaskLoop(int i) { while (true) { Task task; @@ -89,7 +98,6 @@ void ThreadPool::TaskLoop() { task = std::move(tasks_.front()); tasks_.pop(); } - // run the task task(); } diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 7a51d18fbb..bd8c3cdee8 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include // NOLINT #include #include // NOLINT @@ -27,7 +28,6 @@ limitations under the License. */ namespace paddle { namespace framework { - struct ExceptionHandler { mutable std::future> future_; explicit ExceptionHandler( @@ -99,7 +99,7 @@ class ThreadPool { // The constructor starts threads to run TaskLoop, which retrieves // and runs tasks from the queue. - void TaskLoop(); + void TaskLoop(int i); // Init is called by GetInstance. static void Init(); diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc index 884d61e234..1257a76e3e 100644 --- a/paddle/fluid/framework/threadpool_test.cc +++ b/paddle/fluid/framework/threadpool_test.cc @@ -59,3 +59,47 @@ TEST(ThreadPool, ConcurrentRun) { } EXPECT_EQ(sum, ((n + 1) * n) / 2); } +static int64_t GetTS() { + struct timeval tp; + gettimeofday(&tp, NULL); + return tp.tv_sec * 1000000 + tp.tv_usec; +} + +void multi_call(std::function call) { + for (int i = 0; i < 500; ++i) { + call(); + } +} + +TEST(ThreadPool, PERFORMANCE) { + auto sum = [] { + int a = 0; + for (int i = 0; i < 1000; ++i) { + a += i; + } + }; + // framework::ThreadPool *pool = new framework::ThreadPool(2); + int64_t start = GetTS(); + for (int i = 0; i < 1000; ++i) { + // int64_t s = GetTS(); + framework::Async(std::move(sum)); + // pool->Run(std::move(sum)); + // VLOG(5) << "push to pool spent : " << GetTS() - s << " (us)."; + } + VLOG(5) << "pool spent: " << GetTS() - start << " (us)."; + start = GetTS(); + for (int i = 0; i < 1000; ++i) { + sum(); + } + VLOG(5) << "sequence call spent: " << GetTS() - start << " (us)."; + std::vector threads; + start = GetTS(); + for (int i = 0; i < 2; ++i) { + std::thread t(multi_call, std::ref(sum)); + threads.push_back(std::move(t)); + } + for (auto& thread : threads) { + thread.join(); + } + VLOG(5) << "two threads spent: " << GetTS() - start << " (us)."; +} diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 51b980acb5..10de11bfa5 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -67,9 +67,12 @@ class BlockingQueue { } bool Receive(T* elem) { + VLOG(1) << "blocking queue::Receive ..."; std::unique_lock lock(mutex_); receive_cv_.wait(lock, [&] { return !queue_.empty() || closed_; }); + VLOG(1) << "queue_.empty()=" << queue_.empty(); if (!queue_.empty()) { + if (elem == nullptr) VLOG(1) << "elem is nullptr"; PADDLE_ENFORCE_NOT_NULL(elem); *elem = queue_.front(); if (LIKELY(!speed_test_mode_)) { diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 26ff221dfa..2d66000f1f 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -58,7 +58,9 @@ void BufferedReader::ReadAsync(size_t i) { TensorVec &gpu = gpu_buffer_[i]; gpu.resize(cpu.size()); for (size_t i = 0; i < cpu.size(); ++i) { + VLOG(1) << "launch tensor copy from cpu to cpu, idx: " << i; framework::TensorCopySync(cpu[i], place_, &gpu[i]); + VLOG(1) << "done " << i; gpu[i].set_lod(cpu[i].lod()); } } @@ -80,11 +82,13 @@ void BufferedReader::StartImpl() { } void BufferedReader::ReadNextImpl(std::vector *out) { + VLOG(1) << "ReadNextImpl start on place: " << place_; if (position_.empty()) { out->clear(); return; } size_t i = position_.front().get(); + VLOG(1) << "position front: " << i; position_.pop(); if (i == -1UL) { @@ -101,6 +105,7 @@ void BufferedReader::ReadNextImpl(std::vector *out) { ReadAsync(prev_pos_); } prev_pos_ = i; + VLOG(1) << "success ReadNextImpl"; } } // namespace reader diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index ed719f91d0..924c92e0bf 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -25,9 +25,15 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); + VLOG(1) << "find var in scope: " << &scope; + auto* out_var = scope.FindVar(Output("Out")); + VLOG(1) << "var " << Output("Out") << " -> " << out_var; + auto* out = out_var->GetMutable(); + + // auto* out = scope.Var(Output("Out")) + // ->template GetMutable(); if (out->Get() != nullptr) { + VLOG(1) << Output("Out") << " is not nullptr."; return; } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) @@ -46,9 +52,11 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { sin >> num; place = platform::CUDAPlace(static_cast(num)); } - + VLOG(1) << "create buffered reader on " << place; out->Reset(framework::MakeDecoratedReader(underlying_reader, place, 2)); + VLOG(1) << "Reset Buffered Reader in var: " + << scope.FindVar(Input("UnderlyingReader")); } }; diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 901a92ab5b..093b0e56b3 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -28,8 +28,10 @@ class PyReader : public framework::FileReader { } void ReadNext(std::vector* out) override { + VLOG(1) << "come in PyReader::ReadNext function, out: " << out; bool success; *out = queue_->Pop(&success); + VLOG(1) << "call PyReader::ReadNext " << success; if (!success) out->clear(); } diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index 38223e0699..ae37a18725 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -115,10 +115,12 @@ class PreemptiveReaderContainer : public IReaderContainer { } void ReadNext(std::vector* out) override { + VLOG(1) << "flag"; if (!pending_.empty()) { auto future_it = complete_queue_.Pop(); FutureItem item = future_it->get(); if (item.exception_) { + VLOG(1) << "item has exception!!!"; for (auto it = futures_.begin(); it != futures_.end(); ++it) { if (it != future_it) { it->wait(); // Wait all other threads complete. diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 7c539d25f6..53de53f43d 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -82,12 +82,15 @@ struct NCCLContext { struct NCCLContextMap { std::unordered_map contexts_; std::vector order_; + bool need_group_call_; explicit NCCLContextMap(const std::vector &places, ncclUniqueId *nccl_id = nullptr, - size_t num_trainers = 1, size_t trainer_id = 0) { + size_t num_trainers = 1, size_t trainer_id = 0, + bool need_group_call = true) { PADDLE_ENFORCE(!places.empty()); order_.reserve(places.size()); + need_group_call_ = need_group_call; for (auto &p : places) { int dev_id = boost::get(p).device; order_.emplace_back(dev_id); @@ -102,7 +105,7 @@ struct NCCLContextMap { } std::unique_ptr comms(new ncclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. - if (num_trainers == 1) { + if (num_trainers == 1 && nccl_id != nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 998242fb4a..040a68f672 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/port.h" - #include #include #include @@ -25,9 +22,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA + #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/platform/device_tracer.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); @@ -173,8 +173,9 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) : is_enabled_(false), start_ns_(PosixInNsec()) { - std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; + std::lock_guard l(profiler_mu); + is_enabled_ = true; dev_ctx_ = dev_ctx; name_ = name; @@ -184,8 +185,9 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) } RecordEvent::~RecordEvent() { - std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; + VLOG(5) << "call ~RecordEvent"; + std::lock_guard l(profiler_mu); DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index fc7991d297..c313ed2a8b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -720,6 +720,11 @@ All parameter, weight, gradient are variables in Paddle. )DOC"); + py::enum_(exec_strategy, "ExecutorType") + .value("Default", ExecutionStrategy::ExecutorType::kDefault) + .value("Experimental", ExecutionStrategy::ExecutorType::kExperimental) + .value("ParallelGraph", ExecutionStrategy::ExecutorType::kParallelGraph); + exec_strategy.def(py::init()) .def_property( "num_threads", @@ -777,17 +782,14 @@ All parameter, weight, gradient are variables in Paddle. [](const ExecutionStrategy &self) { return self.dry_run_; }, [](ExecutionStrategy &self, bool dry_run) { self.dry_run_ = dry_run; - }); - - exec_strategy.def_property( - "use_experimental_executor", - [](const ExecutionStrategy &self) { - return self.type_ == ExecutionStrategy::kExperimental; - }, - [](ExecutionStrategy &self, bool experimental) { - self.type_ = experimental ? ExecutionStrategy::kExperimental - : ExecutionStrategy::kDefault; - }); + }) + .def_property( + "executor_type", + [](const ExecutionStrategy &self) { return self.type_; }, + [](ExecutionStrategy &self, ExecutionStrategy::ExecutorType type) { + self.type_ = type; + }, + R"DOC()DOC"); py::class_ build_strategy(pe, "BuildStrategy", R"DOC( BuildStrategy allows the user to more preciously control how to From 743cb840f1235911d163eb4927161cbfc0e803b5 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Thu, 6 Dec 2018 21:43:05 +0800 Subject: [PATCH 156/719] update with comments test=develop --- .../fluid/framework/executor_thread_worker.cc | 2 +- paddle/fluid/inference/analysis/argument.h | 2 +- .../analysis/passes/ir_graph_build_pass.cc | 10 +++-- .../analysis/passes/ir_graph_build_pass.h | 2 +- paddle/fluid/inference/api/analysis_config.cc | 13 ++++--- .../fluid/inference/api/analysis_predictor.cc | 7 ++-- .../inference/api/paddle_analysis_config.h | 10 ++--- paddle/fluid/inference/io.cc | 39 ++++++++++--------- paddle/fluid/inference/io.h | 11 +++--- .../tests/api/analyzer_ner_tester.cc | 4 +- .../inference/tests/api/config_printer.h | 2 +- paddle/fluid/operators/load_combine_op.cc | 6 +-- 12 files changed, 56 insertions(+), 52 deletions(-) diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index e3849931d4..3d53511615 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -97,7 +97,7 @@ void ExecutorThreadWorker::SetDevice() { static unsigned concurrency_cap = std::thread::hardware_concurrency(); int thread_id = this->thread_id_; - if ((unsigned)thread_id < concurrency_cap) { + if (static_cast(thread_id) < concurrency_cap) { unsigned proc = thread_id; cpu_set_t mask; diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index d205465abf..53cc7039f2 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -103,7 +103,7 @@ struct Argument { // Model specified with program and parameters files. DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); - DECL_ARGUMENT_FIELD(is_memory_load, IsMemoryLoad, bool); + DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); // The overall graph to work on. DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index 31138d7342..b8a045c18f 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -46,7 +46,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { argument->model_params_path_valid()) { auto program = LoadModel(argument->model_program_path(), argument->model_params_path(), - argument->scope_ptr(), place, argument->is_memory_load()); + argument->scope_ptr(), place, argument->model_from_memory()); argument->SetMainProgram(program.release()); } else { PADDLE_THROW( @@ -69,9 +69,13 @@ std::unique_ptr IrGraphBuildPass::LoadModel( std::unique_ptr IrGraphBuildPass::LoadModel( const std::string &program_path, const std::string ¶ms_path, framework::Scope *scope, const platform::Place &place, - bool is_memory_load) { + bool model_from_memory) { framework::Executor exe(place); - return Load(&exe, scope, program_path, params_path, is_memory_load); + if (!model_from_memory) { + return Load(&exe, scope, program_path, params_path); + } else { + return LoadFromMemory(&exe, scope, program_path, params_path); + } } std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; } diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h index ae7de676d6..adbde0433f 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h @@ -39,7 +39,7 @@ class IrGraphBuildPass : public AnalysisPass { std::unique_ptr LoadModel( const std::string &program_path, const std::string ¶ms_path, framework::Scope *scope, const platform::Place &place, - bool is_memory_load); + bool model_from_memory); std::string model_binary_str_; }; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index bfa1f1908c..384d1dc27d 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -53,7 +53,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; - is_memory_load_ = other.is_memory_load_; + model_from_memory_ = other.model_from_memory_; if (use_gpu) { pass_builder_.reset(new GpuPassStrategy( @@ -81,7 +81,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; - is_memory_load_ = other.is_memory_load_; + model_from_memory_ = other.model_from_memory_; pass_builder_ = std::move(other.pass_builder_); } @@ -105,12 +105,13 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); } -void contrib::AnalysisConfig::SetProgBufferAndParamBuffer( - const char *prog_buffer, size_t prog_buffer_size, const char *param_buffer, - size_t param_buffer_size) { +void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, + size_t prog_buffer_size, + const char *param_buffer, + size_t param_buffer_size) { prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size); param_file = std::string(param_buffer, param_buffer + param_buffer_size); - is_memory_load_ = true; + model_from_memory_ = true; } } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index f70c12dc87..84f7eca057 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -308,7 +308,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetUseGPU(config_.use_gpu); argument_.SetGPUDeviceId(config_.device); - argument_.SetIsMemoryLoad(config_.is_memory_load_); + argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program if (!config_.model_dir.empty()) { argument_.SetModelDir(config_.model_dir); @@ -451,11 +451,12 @@ bool AnalysisPredictor::LoadProgramDesc() { // Create ProgramDesc framework::proto::ProgramDesc proto; - if (!config_.is_memory_load()) { + if (!config_.model_from_memory()) { std::string pb_content; // Read binary std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + PADDLE_ENFORCE(static_cast(fin.is_open()), "Cannot open file %s", + filename); fin.seekg(0, std::ios::end); pb_content.resize(fin.tellg()); fin.seekg(0, std::ios::beg); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 4e9d5bc329..a08e3d027e 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -55,11 +55,9 @@ struct AnalysisConfig : public NativeConfig { bool use_mkldnn() const { return use_mkldnn_; } // Specify the memory buffer of program and parameter - void SetProgBufferAndParamBuffer(const char* prog_buffer, - size_t prog_buffer_size, - const char* program_buffer, - size_t program_buffer_size); - bool is_memory_load() const { return is_memory_load_; } + void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size, + const char* program_buffer, size_t program_buffer_size); + bool model_from_memory() const { return model_from_memory_; } friend class ::paddle::AnalysisPredictor; @@ -69,7 +67,7 @@ struct AnalysisConfig : public NativeConfig { int tensorrt_workspace_size_; int tensorrt_max_batchsize_; std::unique_ptr pass_builder_; - bool is_memory_load_{false}; + bool model_from_memory_{false}; }; // Configurations for Anakin engine. diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 060d6a89d4..24d15f12f9 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -70,7 +70,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, const framework::ProgramDesc& main_program, const std::string& dirname, const std::string& param_filename, - bool is_memory_load = false) { + bool model_from_memory = false) { const framework::BlockDesc& global_block = main_program.Block(0); framework::ProgramDesc* load_program = new framework::ProgramDesc(); @@ -109,7 +109,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, op->SetType("load_combine"); op->SetOutput("Out", paramlist); op->SetAttr("file_path", {param_filename}); - op->SetAttr("is_memory_load", {is_memory_load}); + op->SetAttr("model_from_memory", {model_from_memory}); op->CheckAttrs(); } @@ -132,23 +132,17 @@ std::unique_ptr Load(framework::Executor* executor, "model version %ld is not supported.", main_program->Version()); - // is_memory_load is false in seperate parameters. + // model_from_memory is false in seperate parameters. LoadPersistables(executor, scope, *main_program, dirname, "", - false /* is_memory_load */); + false /* model_from_memory */); return main_program; } -std::unique_ptr Load(framework::Executor* executor, - framework::Scope* scope, - const std::string& prog_filename, - const std::string& param_filename, - bool is_memory_load = false) { +std::unique_ptr Load( + framework::Executor* executor, framework::Scope* scope, + const std::string& prog_filename, const std::string& param_filename) { std::string program_desc_str; - if (!is_memory_load) { - ReadBinaryFile(prog_filename, &program_desc_str); - } else { - program_desc_str = prog_filename; - } + ReadBinaryFile(prog_filename, &program_desc_str); std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); @@ -157,15 +151,22 @@ std::unique_ptr Load(framework::Executor* executor, main_program->Version()); LoadPersistables(executor, scope, *main_program, "", param_filename, - is_memory_load); + false /* model_from_memory */); return main_program; } -std::unique_ptr Load( +std::unique_ptr LoadFromMemory( framework::Executor* executor, framework::Scope* scope, - const std::string& prog_filename, const std::string& param_filename) { - return Load(executor, scope, prog_filename, param_filename, - false /* is_memory_load */); + const std::string& prog_buffer, const std::string& param_buffer) { + std::unique_ptr main_program( + new framework::ProgramDesc(prog_buffer)); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), + "model version %ld is not supported.", + main_program->Version()); + + LoadPersistables(executor, scope, *main_program, "", param_buffer, + true /* model_filename */); + return main_program; } void SaveVars(const framework::Scope& scope, diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index 20d635575d..317ef9d93a 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -30,7 +30,8 @@ void Init(const std::vector argv); void LoadPersistables(framework::Executor* executor, framework::Scope* scope, const framework::ProgramDesc& main_program, const std::string& dirname, - const std::string& param_filename, bool is_memory_load); + const std::string& param_filename, + bool model_from_memory); std::unique_ptr Load(framework::Executor* executor, framework::Scope* scope, @@ -41,11 +42,9 @@ std::unique_ptr Load(framework::Executor* executor, const std::string& prog_filename, const std::string& param_filename); -std::unique_ptr Load(framework::Executor* executor, - framework::Scope* scope, - const std::string& prog_filename, - const std::string& param_filename, - bool is_memory_load); +std::unique_ptr LoadFromMemory( + framework::Executor* executor, framework::Scope* scope, + const std::string& prog_buffer, const std::string& param_buffer); // Save the variables from a scope to disk. void SaveVars(const framework::Scope& scope, diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index aaa94c8937..66d85420c5 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -98,8 +98,8 @@ void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) { std::string buffer_prog, buffer_param; ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog); ReadBinaryFile(FLAGS_infer_model + "/param", &buffer_param); - cfg->SetProgBufferAndParamBuffer(&buffer_prog[0], buffer_prog.size(), - &buffer_param[0], buffer_param.size()); + cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0], + buffer_param.size()); } else { cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->param_file = FLAGS_infer_model + "/param"; diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index c07a27674e..7046bce303 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -63,7 +63,7 @@ std::ostream &operator<<(std::ostream &os, os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n"; num_spaces++; os << *reinterpret_cast(&config); - if (!config.is_memory_load()) { + if (!config.model_from_memory()) { os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; } else { diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index 8cb7c6c9c6..9d1423915a 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -32,12 +32,12 @@ class LoadCombineOp : public framework::OperatorBase { const platform::Place &place) const override { auto filename = Attr("file_path"); auto load_as_fp16 = Attr("load_as_fp16"); - auto is_memory_load = Attr("is_memory_load"); + auto model_from_memory = Attr("model_from_memory"); auto out_var_names = Outputs("Out"); PADDLE_ENFORCE_GT( static_cast(out_var_names.size()), 0, "The number of output variables should be greater than 0."); - if (!is_memory_load) { + if (!model_from_memory) { std::ifstream fin(filename); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load_combine op", filename); @@ -112,7 +112,7 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { "LoDTensors will be loaded from \"file_path\".") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); - AddAttr("is_memory_load", + AddAttr("model_from_memory", "(boolean, default false)" "If true, file_path is in memory, and LoDTensors will be " "loaded directly from memory") From 42d1b3f7861becfe5f8179cf8c0902b006b5500c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 6 Dec 2018 23:22:38 +0800 Subject: [PATCH 157/719] Fix numpy bug on Ubuntu16 and Ubuntu18 which will cause segmentfault test=develop --- python/paddle/dataset/image.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index 19fc229e6f..645d54a260 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -32,11 +32,27 @@ the image layout as follows. from __future__ import print_function +import six import numpy as np -try: - import cv2 -except ImportError: - cv2 = None +# NOTE(minqiyang): this is an ugly fix for the numpy bug reported here +# https://github.com/numpy/numpy/issues/12497 +if six.PY3: + import subprocess + import sys + import_cv2_proc = subprocess.Popen([sys.executable, "-c", "import cv2"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = import_cv2_proc.communicate() + retcode = import_cv2_proc.poll() + if retcode != 0: + cv2 = None + else: + import cv2 +else: + try: + import cv2 + except ImportError: + cv2 = None import os import tarfile import six.moves.cPickle as pickle From 6217f42ab70a731034c7a8ca8ea885fc4807bcff Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 6 Dec 2018 23:28:45 +0800 Subject: [PATCH 158/719] Revert "Imperative" --- paddle/fluid/CMakeLists.txt | 1 - paddle/fluid/framework/feed_fetch_method.cc | 9 - paddle/fluid/framework/feed_fetch_method.h | 2 - paddle/fluid/framework/ir/graph.cc | 5 +- paddle/fluid/imperative/CMakeLists.txt | 3 - paddle/fluid/imperative/engine.cc | 53 ----- paddle/fluid/imperative/engine.h | 39 ---- paddle/fluid/imperative/layer.cc | 221 ------------------ paddle/fluid/imperative/layer.h | 102 -------- paddle/fluid/imperative/tracer.cc | 19 -- paddle/fluid/imperative/tracer.h | 128 ---------- paddle/fluid/pybind/CMakeLists.txt | 5 +- paddle/fluid/pybind/imperative.cc | 36 --- paddle/fluid/pybind/imperative.h | 53 ----- paddle/fluid/pybind/pybind.cc | 39 ---- python/paddle/fluid/__init__.py | 2 - python/paddle/fluid/framework.py | 54 +---- python/paddle/fluid/imperative/__init__.py | 25 -- python/paddle/fluid/imperative/base.py | 56 ----- python/paddle/fluid/imperative/layers.py | 44 ---- python/paddle/fluid/layer_helper.py | 23 +- python/paddle/fluid/layers/nn.py | 3 +- .../fluid/tests/unittests/test_imperative.py | 52 ----- python/setup.py.in | 1 - tools/print_signatures.py | 4 - 25 files changed, 19 insertions(+), 960 deletions(-) delete mode 100644 paddle/fluid/imperative/CMakeLists.txt delete mode 100644 paddle/fluid/imperative/engine.cc delete mode 100644 paddle/fluid/imperative/engine.h delete mode 100644 paddle/fluid/imperative/layer.cc delete mode 100644 paddle/fluid/imperative/layer.h delete mode 100644 paddle/fluid/imperative/tracer.cc delete mode 100644 paddle/fluid/imperative/tracer.h delete mode 100644 paddle/fluid/pybind/imperative.cc delete mode 100644 paddle/fluid/pybind/imperative.h delete mode 100644 python/paddle/fluid/imperative/__init__.py delete mode 100644 python/paddle/fluid/imperative/base.py delete mode 100644 python/paddle/fluid/imperative/layers.py delete mode 100644 python/paddle/fluid/tests/unittests/test_imperative.py diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 595454e90b..6b526f0103 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -1,7 +1,6 @@ add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) -add_subdirectory(imperative) add_subdirectory(operators) add_subdirectory(string) add_subdirectory(recordio) diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 6338be75a4..3e9353f5cf 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -16,9 +16,7 @@ limitations under the License. */ #include #include #include "glog/logging.h" -#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { @@ -55,12 +53,5 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, return tensor; } -LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) { - Variable* var = scope.FindVar(var_name); - PADDLE_ENFORCE(var, "%s no in scope", var_name); - PADDLE_ENFORCE(var->IsType(), "Only support lod tensor now."); - return *var->GetMutable(); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index 031f8e01aa..7f504bfd23 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -27,7 +27,5 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, size_t index); -LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name); - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8679118fe2..fc91564bba 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -38,8 +38,9 @@ void CheckProgram(const ProgramDesc &program) { switch (role_id) { case _INT(OpRole::kForward): if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) << "Cannot add backward operator before forward operator " - << op->Type(); + LOG(ERROR) + << "Cannot add backward operator before forward operator %s." + << op->Type(); } break; case _INT(OpRole::kBackward): diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt deleted file mode 100644 index 373d292b44..0000000000 --- a/paddle/fluid/imperative/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -cc_library(layer SRCS layer.cc DEPS proto_desc operator) -cc_library(tracer SRCS tracer.cc DEPS proto_desc) -cc_library(engine SRCS engine.cc) diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc deleted file mode 100644 index de7ab0e591..0000000000 --- a/paddle/fluid/imperative/engine.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/imperative/engine.h" - -#include // NOLINT -#include - -#include "glog/logging.h" - -namespace paddle { -namespace imperative { - -static std::once_flag init_engine; -static Engine* engine; - -class DummyEngine : public Engine { - public: - void Enqueue(Runnable* runnable) override { - queued_runnables_.push_back(runnable); - } - - size_t Size() const override { return queued_runnables_.size(); } - - void Sync() override { - for (Runnable* l : queued_runnables_) { - LOG(INFO) << "running " << reinterpret_cast(l); - } - queued_runnables_.clear(); - } - - private: - std::vector queued_runnables_; -}; - -Engine* GetEngine() { - std::call_once(init_engine, []() { engine = new DummyEngine(); }); - return engine; -} - -} // namespace imperative -} // namespace paddle diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h deleted file mode 100644 index a1dfa5bda3..0000000000 --- a/paddle/fluid/imperative/engine.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -namespace paddle { -namespace imperative { - -struct Runnable {}; - -class Engine { - public: - virtual ~Engine() {} - - virtual void Enqueue(Runnable* runnable) = 0; - - virtual size_t Size() const = 0; - - virtual void Sync() = 0; -}; - -Engine* GetEngine(); - -} // namespace imperative -} // namespace paddle diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc deleted file mode 100644 index 6125037680..0000000000 --- a/paddle/fluid/imperative/layer.cc +++ /dev/null @@ -1,221 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/imperative/layer.h" -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/string/printf.h" - -namespace paddle { -namespace imperative { - -using framework::Variable; - -void AddTo(Variable* src, Variable* dst) { - framework::LoDTensor* dst_tensor = dst->GetMutable(); - framework::LoDTensor* src_tensor = src->GetMutable(); - PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", - dst_tensor->numel(), src_tensor->numel()); - float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); - const float* src_data = src_tensor->data(); - for (size_t i = 0; i < src_tensor->numel(); ++i) { - dst_data[i] += src_data[i]; - } -} - -class Autograd { - public: - explicit Autograd(framework::Scope* scope) : scope_(scope) {} - - void RunBackward(VarBase* var) { - PADDLE_ENFORCE(var->pre_op_->op_desc_); - // TODO(panyx0718): Only create for vars that "require_grad" - (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_; - - std::deque ready; - ready.push_back(var->pre_op_); - - std::map dep_counts = ComputeDepCounts(var->pre_op_); - - while (!ready.empty()) { - OpBase* ready_op = ready.front(); - ready.pop_front(); - std::vector input_grads = ready_op->ApplyGrad(scope_); - - for (size_t i = 0; i < input_grads.size(); ++i) { - if (!input_grads[i]) continue; - OpBase* pre_op = ready_op->pre_ops_->at(i); - if (!pre_op) continue; - - dep_counts[pre_op] -= 1; - PADDLE_ENFORCE(dep_counts[pre_op] >= 0); - bool pre_op_ready = dep_counts[pre_op] == 0; - if (pre_op_ready) { - ready.push_back(pre_op); - } - } - } - } - - private: - std::map ComputeDepCounts(OpBase* op) { - std::map ret; - - std::deque queue; - queue.push_back(op); - std::unordered_set visited; - visited.insert(op); - while (!queue.empty()) { - OpBase* candidate = queue.front(); - queue.pop_front(); - for (OpBase* pre_op : *(candidate->pre_ops_)) { - if (!pre_op) continue; - if (visited.find(pre_op) == visited.end()) { - visited.insert(pre_op); - queue.push_back(pre_op); - } - ret[pre_op] += 1; - } - } - - return ret; - } - - framework::Scope* scope_; -}; - -framework::Variable* CreateVariable(const std::string& name, - const framework::DDim& dim, float val, - framework::Scope* scope, - bool random_name = true) { - std::string varname = name; - if (random_name) { - std::mt19937 rng; - rng.seed(std::random_device()()); - std::uniform_int_distribution dist6( - 1, std::numeric_limits::max()); - int id = dist6(rng); - varname = string::Sprintf("%s@%d", varname, id); - } - - VLOG(3) << "creating var " << varname; - framework::Variable* var = scope->Var(varname); - framework::LoDTensor* tensor = var->GetMutable(); - - float* data = tensor->mutable_data(dim, platform::CPUPlace()); - std::fill(data, data + tensor->numel(), val); - return var; -} - -framework::LoDTensor& VarBase::Grad() { - VLOG(3) << "get var grad " << var_desc_->Name(); - return *grads_->GetMutable(); -} - -void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { - VLOG(3) << "apply var grad " << var_desc_->Name() << " " - << grad->Get().data()[0]; - if (!grads_) { - grads_ = - CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()), - var_->Get().dims(), 0.0, scope); - } - AddTo(grad, grads_); - VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " " - << grads_->Get().data()[0]; -} - -std::vector OpBase::ApplyGrad(framework::Scope* scope) { - VLOG(3) << "op grad " << grad_op_desc_->Type(); - - for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { - if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { - // grad op inputs can be forward inputs, so not in grad_to_var. - continue; - } - VLOG(3) << "op grad in var " << grad_invar; - block_->FindRecursiveOrCreateVar(grad_invar); - framework::Variable* var = scope->Var(grad_invar); - const std::string& invar = grad_to_var_->at(grad_invar); - for (VarBase* varbase : *output_vars_) { - // Use the accumulated grads_ by sharing the input with grads_. - if (varbase->var_desc_->Name() == invar) { - var->GetMutable()->ShareDataWith( - varbase->grads_->Get()); - break; - } - } - } - - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - VLOG(3) << "grad outvar " << outvar; - block_->FindRecursiveOrCreateVar(outvar); - framework::Variable* var = scope->Var(outvar); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block_->FindVar(outvar); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } - } - } - grad_op_desc_->InferShape(*block_); - grad_op_desc_->InferVarType(block_); - std::unique_ptr opbase = - framework::OpRegistry::CreateOp(*grad_op_desc_); - - opbase->Run(*scope, platform::CPUPlace()); - - // `ret` matches exactly with `input_vars_` of forward op. - std::vector ret; - for (size_t i = 0; i < input_vars_->size(); ++i) { - bool found = false; - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - Variable* var = scope->FindVar(outvar); - VarBase* origin_var = (*input_vars_)[i]; - std::string orig_var = grad_to_var_->at(outvar); - PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); - VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; - origin_var->ApplyGrad(scope, var); - found = true; - ret.push_back(var); - // TODO(panyx0718): There might be another outvar with the same name. - // In that case, it doesn't matter the first one or the second one is - // used. - break; - } - if (!found) { - ret.push_back(nullptr); - } - } - return ret; -} - -void VarBase::RunBackward(framework::Scope* scope) { - grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()), - var_->Get().dims(), 1.0, scope, - false); - if (!pre_op_) return; - Autograd(scope).RunBackward(this); -} - -} // namespace imperative -} // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h deleted file mode 100644 index 85a71ca83d..0000000000 --- a/paddle/fluid/imperative/layer.h +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_desc.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace imperative { - -class OpBase; - -class VarBase { - public: - VarBase() - : pre_op_(nullptr), - pre_op_out_idx_(-1), - var_desc_(nullptr), - var_(nullptr), - grads_(nullptr) {} - - virtual ~VarBase() {} - - void ApplyGrad(framework::Scope* scope, framework::Variable* grad); - - void RunBackward(framework::Scope* scope); - - framework::LoDTensor& Grad(); - - OpBase* pre_op_; - int pre_op_out_idx_; - - framework::VarDesc* var_desc_; - framework::Variable* var_; - framework::Variable* grads_; -}; - -class OpBase { - public: - OpBase() - : input_vars_(new std::vector()), - output_vars_(new std::vector()), - pre_ops_(new std::vector()), - pre_ops_out_idx_(new std::vector()), - op_desc_(nullptr), - grad_op_desc_(nullptr) {} - - virtual ~OpBase() { - delete input_vars_; - delete output_vars_; - - delete pre_ops_; - delete pre_ops_out_idx_; - - if (grad_op_desc_) delete grad_op_desc_; - if (grad_to_var_) delete grad_to_var_; - } - - std::vector ApplyGrad(framework::Scope* scope); - - std::vector* input_vars_; - std::vector* output_vars_; - std::vector* pre_ops_; - std::vector* pre_ops_out_idx_; - framework::OpDesc* op_desc_; - - framework::OpDesc* grad_op_desc_; - std::unordered_map* grad_to_var_; - framework::BlockDesc* block_; -}; - -class Layer { - public: - virtual ~Layer() {} - - virtual std::vector Forward(const std::vector& inputs) { - std::vector vars; - return vars; - } - - virtual void Backward() { LOG(ERROR) << "To support customize"; } -}; - -} // namespace imperative -} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc deleted file mode 100644 index f64f9e72c4..0000000000 --- a/paddle/fluid/imperative/tracer.cc +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/imperative/tracer.h" - -namespace paddle { -namespace imperative {} // namespace imperative -} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h deleted file mode 100644 index 433d07c0e5..0000000000 --- a/paddle/fluid/imperative/tracer.h +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/imperative/engine.h" -#include "paddle/fluid/imperative/layer.h" - -namespace paddle { -namespace imperative { - -void CreateGradOp(const framework::OpDesc& op_desc, - const std::unordered_set& no_grad_set, - const std::vector& grad_sub_block, - framework::OpDesc** grad_op_desc, - std::unordered_map* grad_to_var) { - std::vector> grad_op_descs = - framework::OpInfoMap::Instance() - .Get(op_desc.Type()) - .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); - PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now."); - // TODO(panyx0718): Leak? - *grad_op_desc = grad_op_descs[0].release(); -} - -class Tracer { - public: - explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { - root_scope_ = new framework::Scope(); - scopes_[root_block_] = root_scope_; - } - - virtual ~Tracer() { delete root_scope_; } - - void Trace(OpBase* op, const std::vector& inputs, - const std::vector& outputs, - framework::BlockDesc* block) { - framework::Scope* scope = GetScope(block); - framework::OpDesc* op_desc = op->op_desc_; - VLOG(3) << "tracer tracing " << op_desc->Type(); - op_desc->InferShape(*block); - op_desc->InferVarType(block); - std::unique_ptr op_base = - framework::OpRegistry::CreateOp(*op_desc); - - *op->input_vars_ = inputs; - for (VarBase* input : inputs) { - const std::string vname = input->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - input->var_ = var; - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } - } - if (input->pre_op_) { - op->pre_ops_->push_back(input->pre_op_); - op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_); - } else { - op->pre_ops_->push_back(nullptr); - } - } - - *op->output_vars_ = outputs; - for (size_t i = 0; i < outputs.size(); ++i) { - const std::string vname = outputs[i]->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } - } - outputs[i]->var_ = var; - outputs[i]->pre_op_ = op; - outputs[i]->pre_op_out_idx_ = i; - } - op_base->Run(*scope, platform::CPUPlace()); - framework::OpDesc* grad_op_desc; - auto grad_to_var = new std::unordered_map(); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); - op->grad_op_desc_ = grad_op_desc; - op->grad_to_var_ = grad_to_var; - op->block_ = block; - } - - framework::Scope* GetScope(framework::BlockDesc* block) { - if (scopes_.find(block) != scopes_.end()) { - return scopes_.at(block); - } - framework::BlockDesc* parent_block = block->ParentBlock(); - PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); - framework::Scope* scope = &scopes_[parent_block]->NewScope(); - scopes_[block] = scope; - return scope; - } - - private: - std::map scopes_; - framework::BlockDesc* root_block_; - framework::Scope* root_scope_; -}; - -} // namespace imperative -} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b8954cb126..d602613fc8 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,7 +1,6 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc) - +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc deleted file mode 100644 index 34e9c897d9..0000000000 --- a/paddle/fluid/pybind/imperative.cc +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/pybind/imperative.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/imperative/tracer.h" - -namespace paddle { -namespace pybind { - -// Bind Methods -void BindTracer(pybind11::module *m) { - pybind11::class_(*m, "Tracer", "") - .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block) { - new (&self) imperative::Tracer(root_block); - }) - .def("trace", &imperative::Tracer::Trace) - .def("get_scope", &imperative::Tracer::GetScope, - pybind11::return_value_policy::reference); -} - -} // namespace pybind -} // namespace paddle diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h deleted file mode 100644 index 7a9d3a01ea..0000000000 --- a/paddle/fluid/pybind/imperative.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include -#include "paddle/fluid/imperative/layer.h" -#include "pybind11/pybind11.h" -#include "pybind11/stl.h" - -namespace paddle { -namespace pybind { - -class PyLayer : public imperative::Layer { - public: - using imperative::Layer::Layer; // Inherit constructors - - std::vector Forward( - const std::vector& inputs) override { - PYBIND11_OVERLOAD(std::vector, Layer, Forward, - inputs); // NOLINT - } - - void Backward() override { - PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT - } -}; - -class PyOpBase : public imperative::OpBase { - public: - using imperative::OpBase::OpBase; // Inherit constructors -}; - -class PyVarBase : public imperative::VarBase { - public: - using imperative::VarBase::VarBase; // Inherit constructors -}; - -void BindTracer(pybind11::module* m); - -} // namespace pybind -} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ea07372a28..fc7991d297 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -34,7 +34,6 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" -#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" @@ -46,7 +45,6 @@ limitations under the License. */ #include "paddle/fluid/pybind/async_executor_py.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/exception.h" -#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/recordio.h" @@ -102,42 +100,6 @@ PYBIND11_MODULE(core, m) { BindException(&m); - py::class_(m, "VarBase", R"DOC()DOC") - .def(py::init<>()) - .def("_run_backward", - [](imperative::VarBase &self, framework::Scope *scope) { - self.RunBackward(scope); - }) - .def("_grad", &imperative::VarBase::Grad) - .def_property( - "desc", - [](const imperative::VarBase &self) { return self.var_desc_; }, - [](imperative::VarBase &self, framework::VarDesc *var_desc) { - self.var_desc_ = var_desc; - }, - py::return_value_policy::reference); - - py::class_(m, "OpBase", R"DOC()DOC") - .def(py::init<>()) - .def_property( - "desc", [](const imperative::OpBase &self) { return self.op_desc_; }, - [](imperative::OpBase &self, framework::OpDesc *op_desc) { - if (op_desc) { - self.op_desc_ = op_desc; - } - }, - py::return_value_policy::reference); - - py::class_ layer(m, "Layer"); - layer.def(py::init<>()) - .def("forward", - [](imperative::Layer &self, - const std::vector &inputs) { - return self.Forward(inputs); - }) - .def("backward", &imperative::Layer::Backward); - BindTracer(&m); - py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) @@ -639,7 +601,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); - m.def("get_variable_tensor", framework::GetVariableTensor); m.def("_is_program_version_supported", IsProgramVersionSupported); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 52417a1eaf..2a53519188 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -34,7 +34,6 @@ from . import io from . import evaluator from . import initializer from . import layers -from . import imperative from . import contrib from . import nets from . import optimizer @@ -68,7 +67,6 @@ __all__ = framework.__all__ + executor.__all__ + \ 'initializer', 'layers', 'contrib', - 'imperative', 'transpiler', 'nets', 'optimizer', diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 9e6345f148..b156db53d2 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -18,7 +18,6 @@ import collections import contextlib import re import six -import sys import numpy as np @@ -50,16 +49,6 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() -_imperative_tracer_ = None - - -def _in_imperative_mode(): - return _imperative_tracer_ is not None - - -def _imperative_tracer(): - return _imperative_tracer_ - class NameScope(object): def __init__(self, name="", parent=None): @@ -213,7 +202,7 @@ def _debug_string_(proto, throw_on_error=True): return proto.__str__() -class Variable(core.VarBase): +class Variable(object): """ In Fluid, every input and output of an operator is a variable. In most cases, variables are used for holding different kinds of data or training @@ -277,7 +266,6 @@ class Variable(core.VarBase): stop_gradient=False, is_data=False, **kwargs): - core.VarBase.__init__(self) self.block = block self.error_clip = error_clip @@ -358,18 +346,6 @@ class Variable(core.VarBase): self.stop_gradient = stop_gradient self.is_data = is_data - def _numpy(self): - scope = _imperative_tracer().get_scope(self.block.desc) - tensor = core.get_variable_tensor(scope, self.desc.name()) - return np.array(tensor) - - def _backward(self): - scope = _imperative_tracer().get_scope(self.block.desc) - self._run_backward(scope) - - def _gradient(self): - return np.array(self._grad()) - def __str__(self): return self.to_string(True) @@ -516,7 +492,7 @@ class OpProtoHolder(object): } -class Operator(core.OpBase): +class Operator(object): """ In Fluid, all the operation are represented by Operator, and Operator is regarded as a build in an instruction of a Block. Users can use the @@ -572,7 +548,6 @@ class Operator(core.OpBase): inputs=None, outputs=None, attrs=None): - core.OpBase.__init__(self) self.block = block self.desc = desc # note: not add self.attrs here: @@ -612,7 +587,6 @@ class Operator(core.OpBase): return True return False - self.inputs = [] if inputs is not None: for in_proto in proto.inputs: found = find_name(inputs, in_proto.name) @@ -639,13 +613,6 @@ class Operator(core.OpBase): else: self.desc.set_input(in_proto.name, []) - for inp in inputs.values(): - if isinstance(inp, Variable): - self.inputs.append(inp) - elif isinstance(inp, list) or isinstance(inp, tuple): - self.inputs.extend(inp[:]) - - self.outputs = [] if outputs is not None: given = set() need = set() @@ -674,12 +641,6 @@ class Operator(core.OpBase): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) - for out in outputs.values(): - if isinstance(out, Variable): - self.outputs.append(out) - elif isinstance(out, list) or isinstance(out, tuple): - self.outputs.extend(out[:]) - if op_attrs is not None: if not isinstance(op_attrs, dict): raise TypeError("'attrs' should be a dict.") @@ -1245,8 +1206,6 @@ class Block(object): """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) - if _in_imperative_mode(): - _imperative_tracer().trace(op, op.inputs, op.outputs, self.desc) self.ops.append(op) return op @@ -2250,12 +2209,3 @@ def _get_var(name, program=None): assert isinstance(program, Program) return program.global_block().var(name) - - -@contextlib.contextmanager -def _imperative_guard(tracer): - global _imperative_tracer_ - tmp_trace = _imperative_tracer_ - _imperative_tracer_ = tracer - yield - _imperative_tracer_ = tmp_trace diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py deleted file mode 100644 index 922308b6b1..0000000000 --- a/python/paddle/fluid/imperative/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -from . import base -from .base import * - -from . import layers -from .layers import * - -__all__ = [] -__all__ += layers.__all__ -__all__ += base.__all__ diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py deleted file mode 100644 index 15d38ddb56..0000000000 --- a/python/paddle/fluid/imperative/base.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import contextlib -import numpy as np - -from paddle.fluid import core -from paddle.fluid import framework - -__all__ = ['enabled', 'guard', 'to_variable'] - - -def enabled(): - return framework._in_imperative_mode() - - -@contextlib.contextmanager -def guard(): - train = framework.Program() - startup = framework.Program() - tracer = core.Tracer(train.current_block().desc) - with framework.program_guard(train, startup): - with framework.unique_name.guard(): - with framework._imperative_guard(tracer): - yield - - -def to_variable(value, block=None): - if isinstance(value, np.ndarray): - if not block: - block = framework.default_main_program().current_block() - py_var = framework.Variable( - block, - type=core.VarDesc.VarType.LOD_TENSOR, - name=None, - shape=value.shape, - dtype=value.dtype) - scope = framework._imperative_tracer().get_scope(block.desc) - var = scope.var(py_var.name) - tensor = var.get_tensor() - tensor.set(value, core.CPUPlace()) - return py_var - elif isinstance(value, framework.Variable): - return value - else: - raise ValueError("Unsupported type %s" % type(value)) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py deleted file mode 100644 index 1a28f7f4ae..0000000000 --- a/python/paddle/fluid/imperative/layers.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import contextlib -import sys -import numpy as np - -from paddle.fluid import core -from paddle.fluid import framework -from paddle.fluid.imperative import base - -__all__ = ['PyLayer'] - - -class PyLayer(core.Layer): - def __init__(self): - pass - - def __call__(self, inputs): - # TODO(panyx0718): Support declarative mode as well. - assert base.enabled() - if not isinstance(inputs, list) and not isinstance(inputs, tuple): - inputs = [inputs] - - var_inputs = [] - for x in inputs: - py_var = base.to_variable(x) - var_inputs.append(py_var) - outputs = self.forward(var_inputs) - return outputs - - def forward(self, inputs): - return [] diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 74b4a977db..dc317de9ab 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -17,13 +17,10 @@ from __future__ import print_function import copy import itertools import six -import sys -import numpy as np from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from . import unique_name from paddle.fluid.initializer import Constant, Xavier -from paddle.fluid.imperative import base from .param_attr import ParamAttr, WeightNormParamAttr from . import core from six.moves import zip @@ -49,21 +46,23 @@ class LayerHelper(object): def startup_program(self): return default_startup_program() - def to_variable(self, x): - return base.to_variable(x, self.main_program.current_block()) - def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) def multiple_input(self, input_param_name='input'): inputs = self.kwargs.get(input_param_name, []) - ret = [] - if isinstance(inputs, list) or isinstance(inputs, tuple): - for inp in inputs: - ret.append(self.to_variable(inp)) + type_error = TypeError( + "Input of {0} layer should be Variable or sequence of Variable". + format(self.layer_type)) + if isinstance(inputs, Variable): + inputs = [inputs] + elif not isinstance(inputs, list) and not isinstance(inputs, tuple): + raise type_error else: - ret.append(self.to_variable(inputs)) - return ret + for each in inputs: + if not isinstance(each, Variable): + raise type_error + return inputs def input(self, input_param_name='input'): inputs = self.multiple_input(input_param_name) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index fac7538a6a..4833212d31 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6623,8 +6623,7 @@ def relu(x, name=None): helper = LayerHelper('relu', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out}) + helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out}) return out diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py deleted file mode 100644 index b5b6305155..0000000000 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import sys -import numpy as np - -import paddle.fluid as fluid -from paddle.fluid import core - - -class MyLayer(fluid.imperative.PyLayer): - def __init__(self): - super(MyLayer, self).__init__() - - def forward(self, inputs): - x = fluid.layers.relu(inputs[0]) - self._x_for_debug = x - return [fluid.layers.elementwise_mul(x, x)] - - -class TestImperative(unittest.TestCase): - def test_layer(self): - with fluid.imperative.guard(): - cl = core.Layer() - cl.forward([]) - l = fluid.imperative.PyLayer() - l.forward([]) - - def test_layer_in_out(self): - with fluid.imperative.guard(): - l = MyLayer() - x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] - self.assertIsNotNone(x) - sys.stderr.write("%s output: %s\n" % (x, x._numpy())) - x._backward() - sys.stderr.write("grad %s\n" % l._x_for_debug._gradient()) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 0eb69cdb5c..5aee26b638 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -101,7 +101,6 @@ packages=['paddle', 'paddle.dataset', 'paddle.reader', 'paddle.fluid', - 'paddle.fluid.imperative', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', diff --git a/tools/print_signatures.py b/tools/print_signatures.py index 7e61dde0a4..5c5266f904 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -27,8 +27,6 @@ import pydoc member_dict = collections.OrderedDict() -experimental_namespace = {"paddle.fluid.imperative"} - def visit_member(parent_name, member): cur_name = ".".join([parent_name, member.__name__]) @@ -53,8 +51,6 @@ def visit_member(parent_name, member): def visit_all_module(mod): - if (mod.__name__ in experimental_namespace): - return for member_name in ( name for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod)) From aebc175cd466e4e256e19d16dd3d2ad6f37dcdd7 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Fri, 7 Dec 2018 09:01:14 +0800 Subject: [PATCH 159/719] add nccl2 dist tests (#14755) * add nccl2 dist tests test=develop * fix dist_base test=develop * fix tests test=develop * fix test on mac test=develop --- .../fluid/tests/unittests/dist_save_load.py | 4 +- .../fluid/tests/unittests/test_dist_base.py | 135 +++++++++++++++--- .../fluid/tests/unittests/test_dist_mnist.py | 13 ++ .../tests/unittests/test_dist_save_load.py | 2 +- .../tests/unittests/test_dist_transpiler.py | 1 + .../fluid/transpiler/distribute_transpiler.py | 25 +--- 6 files changed, 138 insertions(+), 42 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py index cf62817956..faec535042 100644 --- a/python/paddle/fluid/tests/unittests/dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/dist_save_load.py @@ -102,7 +102,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): if args.mem_opt: fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) - if args.is_dist: + if args.update_method == "pserver": t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, args.trainers, @@ -147,7 +147,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): def get_data(): origin_batch = next(reader_generator) - if args.is_dist and args.use_reader_alloc: + if args.update_method == "pserver" and args.use_reader_alloc: new_batch = [] for offset, item in enumerate(origin_batch): if offset % 2 == args.trainer_id: diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 160969c63f..0a43f53658 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -76,12 +76,24 @@ class TestDistRunnerBase(object): if args.mem_opt: fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) - if args.is_dist: + if args.update_method == "pserver": t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, args.trainers, args.sync_mode, args.dc_asgd) trainer_prog = t.get_trainer_program() + elif args.update_method == "nccl2": + # transpile for nccl2 + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + nccl2_t = fluid.DistributeTranspiler(config=config) + nccl2_t.transpile( + args.trainer_id, + program=fluid.default_main_program(), + startup_program=fluid.default_startup_program(), + trainers=args.endpoints, + current_endpoint=args.current_endpoint) + trainer_prog = fluid.default_main_program() else: trainer_prog = fluid.default_main_program() @@ -110,11 +122,20 @@ class TestDistRunnerBase(object): len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") mypass.set_int("num_repeats", args.batch_merge_repeat) + if args.update_method == "nccl2": + num_trainers = len(args.endpoints.split(",")) + trainer_id = args.trainer_id + else: + num_trainers = 1 + trainer_id = 0 + exe = fluid.ParallelExecutor( args.use_cuda, loss_name=avg_cost.name, exec_strategy=strategy, - build_strategy=build_stra) + build_strategy=build_stra, + num_trainers=num_trainers, + trainer_id=trainer_id) feed_var_list = [ var for var in trainer_prog.global_block().vars.values() @@ -126,7 +147,7 @@ class TestDistRunnerBase(object): def get_data(): origin_batch = next(reader_generator) - if args.is_dist and args.use_reader_alloc: + if args.update_method != "local" and args.use_reader_alloc: new_batch = [] for offset, item in enumerate(origin_batch): if offset % 2 == args.trainer_id: @@ -151,7 +172,11 @@ def runtime_main(test_class): parser.add_argument( '--role', type=str, required=True, choices=['pserver', 'trainer']) parser.add_argument('--endpoints', type=str, required=False, default="") - parser.add_argument('--is_dist', action='store_true') + parser.add_argument( + '--update_method', + type=str, + default="local", + choices=["pserver", "nccl2", "local"]) parser.add_argument('--trainer_id', type=int, required=False, default=0) parser.add_argument('--trainers', type=int, required=False, default=1) parser.add_argument( @@ -170,7 +195,7 @@ def runtime_main(test_class): args = parser.parse_args() model = test_class() - if args.role == "pserver" and args.is_dist: + if args.role == "pserver" and args.update_method == "pserver": model.run_pserver(args) else: model.run_trainer(args) @@ -208,6 +233,7 @@ class TestDistBase(unittest.TestCase): self._use_reduce = False self._dc_asgd = False # must use with async mode self._use_reader_alloc = True + self._nccl2_mode = False self._setup_config() self._after_setup_config() @@ -218,7 +244,7 @@ class TestDistBase(unittest.TestCase): def start_pserver(self, model_file, check_error_log, required_envs): ps0_ep, ps1_ep = self._ps_endpoints.split(",") - ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" + ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --update_method pserver" ps0_cmd = ps_cmd % \ (self._python_interp, model_file, self._ps_endpoints, ps0_ep, self._trainers) @@ -270,7 +296,8 @@ class TestDistBase(unittest.TestCase): else: env_local = {'CPU_NUM': '1'} - envs.update(env_local) + env_local.update(envs) + print("local_cmd: {}, env: {}".format(cmd, env_local)) if check_error_log: err_log = open("/tmp/trainer.err.log", "wb") @@ -278,13 +305,13 @@ class TestDistBase(unittest.TestCase): cmd.split(" "), stdout=subprocess.PIPE, stderr=err_log, - env=envs) + env=env_local) else: local_proc = subprocess.Popen( cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE, - env=envs) + env=env_local) local_out, local_err = local_proc.communicate() @@ -303,7 +330,7 @@ class TestDistBase(unittest.TestCase): ps0_ep, ps1_ep = self._ps_endpoints.split(",") - tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist" + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver" tr0_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, 0, ps0_ep, self._trainers) @@ -335,8 +362,8 @@ class TestDistBase(unittest.TestCase): env0.update(envs) env1.update(envs) - print("tr0_cmd:{}".format(tr0_cmd)) - print("tr1_cmd:{}".format(tr1_cmd)) + print("tr0_cmd: {}, env: {}".format(tr0_cmd, env0)) + print("tr1_cmd: {}, env: {}".format(tr1_cmd, env1)) tr0_pipe = open("/tmp/tr0_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb") @@ -357,12 +384,9 @@ class TestDistBase(unittest.TestCase): # close trainer file tr0_pipe.close() tr1_pipe.close() - ps0_pipe.close() ps1_pipe.close() - # FIXME: use terminate() instead of sigkill. - os.kill(ps0.pid, signal.SIGKILL) - os.kill(ps1.pid, signal.SIGKILL) + ps0.terminate() ps1.terminate() @@ -372,7 +396,71 @@ class TestDistBase(unittest.TestCase): sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out)) sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) - # return tr0_losses, tr1_losses + return pickle.loads(tr0_out), pickle.loads(tr1_out) + + def _run_cluster_nccl2(self, model, envs, check_error_log): + # NOTE: we reuse ps_endpoints as nccl2 worker endpoints + worker_endpoints = self._ps_endpoints.split(",") + w0_ep, w1_ep = worker_endpoints + + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2" + tr0_cmd = tr_cmd % \ + (self._python_interp, model, self._ps_endpoints, + 0, w0_ep) + tr1_cmd = tr_cmd % \ + (self._python_interp, model, self._ps_endpoints, + 1, w1_ep) + + if self._mem_opt: + tr0_cmd += " --mem_opt" + tr1_cmd += " --mem_opt" + if self._use_reduce: + tr0_cmd += " --use_reduce" + tr1_cmd += " --use_reduce" + if self._use_reader_alloc: + tr0_cmd += " --use_reader_alloc" + tr1_cmd += " --use_reader_alloc" + if self.__use_cuda: + tr0_cmd += " --use_cuda" + tr1_cmd += " --use_cuda" + env0 = {"CUDA_VISIBLE_DEVICES": "0"} + env1 = {"CUDA_VISIBLE_DEVICES": "1"} + else: + env0 = {'CPU_NUM': '1'} + env1 = {'CPU_NUM': '1'} + + env0.update(envs) + env1.update(envs) + + print("tr0_cmd:{}, env: {}".format(tr0_cmd, env0)) + print("tr1_cmd:{}, env: {}".format(tr1_cmd, env1)) + tr0_pipe = open("/tmp/tr0_err.log", "wb") + tr1_pipe = open("/tmp/tr1_err.log", "wb") + + tr0_proc = subprocess.Popen( + tr0_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=tr0_pipe, + env=env0) + tr1_proc = subprocess.Popen( + tr1_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=tr1_pipe, + env=env1) + + tr0_out, tr0_err = tr0_proc.communicate() + tr1_out, tr1_err = tr1_proc.communicate() + + # close trainer file + tr0_pipe.close() + tr1_pipe.close() + + # print log + sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) + sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + sys.stderr.write('trainer 0 stdout: %s\n' % tr0_out) + sys.stderr.write('trainer 1 stdout: %s\n' % tr1_out) + return pickle.loads(tr0_out), pickle.loads(tr1_out) def check_with_place(self, @@ -387,20 +475,25 @@ class TestDistBase(unittest.TestCase): "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_fraction_of_gpu_memory_to_use": "0.15", "FLAGS_cudnn_deterministic": "1", - "http_proxy": "" + "http_proxy": "", + "NCCL_P2P_DISABLE": "1" } required_envs.update(need_envs) if check_error_log: - required_envs["GLOG_v"] = "7" + required_envs["GLOG_v"] = "3" required_envs["GLOG_logtostderr"] = "1" local_losses\ = self._run_local(model_file, required_envs, check_error_log) - tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs, - check_error_log) + if self._nccl2_mode: + tr0_losses, tr1_losses = self._run_cluster_nccl2( + model_file, required_envs, check_error_log) + else: + tr0_losses, tr1_losses = self._run_cluster( + model_file, required_envs, check_error_log) for step_id in range(RUN_STEP): local_loss = local_losses[step_id] diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 81eb651878..630bed198f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -26,6 +26,19 @@ class TestDistMnist2x2(TestDistBase): self.check_with_place("dist_mnist.py", delta=1e-5) +class TestDistMnistNCCL2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_mnist.py", delta=1) + + class TestDistMnist2x2Lars(TestDistBase): def _setup_config(self): self._sync_mode = True diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py index ea2b554dac..4588ca7c17 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py @@ -44,7 +44,7 @@ class TestDistSaveLoadDense2x2(TestDistBase): required_envs.update(need_envs) if check_error_log: - required_envs["GLOG_v"] = "7" + required_envs["GLOG_v"] = "3" required_envs["GLOG_logtostderr"] = "1" model_dir = tempfile.mkdtemp() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 194387bc98..d9ad4e2e2c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -769,6 +769,7 @@ class TestNCCL2Transpile(TranspilerTest): config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" + config.wait_port = False t = fluid.DistributeTranspiler(config=config) t.transpile( 0, diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 1d867d9194..3b898af706 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -142,6 +142,7 @@ class DistributeTranspilerConfig(object): # supported modes: pserver, nccl2 mode = "pserver" print_log = False + wait_port = True class DistributeTranspiler(object): @@ -171,7 +172,6 @@ class DistributeTranspiler(object): trainer_id = 0 trainers = 4 role = os.getenv("PADDLE_TRAINING_ROLE") - t = fluid.DistributeTranspiler() t.transpile( trainer_id, pservers=pserver_endpoints, trainers=trainers) @@ -214,13 +214,16 @@ class DistributeTranspiler(object): trainer_id, trainers, current_endpoint, - startup_program=None): + startup_program=None, + wait_port=True): if not startup_program: startup_program = default_startup_program() if trainer_id >= 0: worker_endpoints = trainers.split(",") # send NCCL_ID to others or recv from trainer 0 worker_endpoints.remove(current_endpoint) + if trainer_id == 0 and wait_port: + wait_server_ready(worker_endpoints) nccl_id_var = startup_program.global_block().create_var( name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) @@ -306,7 +309,8 @@ class DistributeTranspiler(object): trainer_id, trainers, current_endpoint, - startup_program=startup_program) + startup_program=startup_program, + wait_port=self.config.wait_port) return self.trainer_num = trainers @@ -652,9 +656,6 @@ class DistributeTranspiler(object): # NOTE: assume blocks of the same variable is not distributed # on the same pserver, only change param/grad varnames for # trainers to fetch. - sys.stderr.write("get_pserver_program() is deprecated, call \ -get_pserver_programs() to get pserver main and startup \ -in a single call.") # step1 pserver_program = Program() pserver_program.random_seed = self.origin_program.random_seed @@ -922,18 +923,6 @@ in a single call.") Returns: Program: parameter server side startup program. """ - sys.stderr.write("get_startup_program() is deprecated, call \ -get_pserver_programs() to get pserver main and startup \ -in a single call.") - if pserver_program != None: - sys.stderr.write("passing pserver_program to get_startup_program() \ -is deprecated, you can use new API get_pserver_programs() to \ -get both pserver main program and startup program.") - if startup_program != None: - sys.stderr.write("passing startup_program to get_startup_program() \ -is deprecated, use fluid.program_guard() or pass this argument \ -to transpile() call.") - s_prog = Program() orig_s_prog = self.startup_program s_prog.random_seed = orig_s_prog.random_seed From 155328a488f12b91555cf31c9b456f9fc3db9ebc Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Fri, 7 Dec 2018 10:15:41 +0800 Subject: [PATCH 160/719] Clean Code test=develop --- paddle/fluid/operators/conv_mkldnn_op.cc | 124 +++++++++-------------- paddle/fluid/operators/conv_op.cc | 12 +-- 2 files changed, 53 insertions(+), 83 deletions(-) diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 786bdb10a2..154ff2bb20 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -28,6 +28,46 @@ using mkldnn::stream; using platform::to_void_cast; using platform::GetMKLDNNFormat; +inline void GetWeightsTz(std::vector& weights_tz, int groups, // NOLINT + bool is_conv3d) { + if (groups > 1) { + if (is_conv3d) { + int output = weights_tz[0]; + int input = weights_tz[1]; + int dimension = weights_tz[2]; + int height = weights_tz[3]; + int width = weights_tz[4]; + weights_tz.resize(6); + weights_tz[0] = groups; + weights_tz[1] = output / groups; + weights_tz[2] = input; + weights_tz[3] = dimension; + weights_tz[4] = height; + weights_tz[5] = width; + } else { + int output = weights_tz[0]; + int input = weights_tz[1]; + int height = weights_tz[2]; + int width = weights_tz[3]; + weights_tz.resize(5); + weights_tz[0] = groups; + weights_tz[1] = output / groups; + weights_tz[2] = input; + weights_tz[3] = height; + weights_tz[4] = width; + } + } +} + +inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format, + int groups, bool is_conv3d) { + if (is_conv3d) { + return (groups == 1) ? format : mkldnn::memory::format::goidhw; + } else { + return (groups == 1) ? format : mkldnn::memory::format::goihw; + } +} + template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -53,7 +93,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { filter->format() != memory::format::format_undef, "Wrong layout/format set for Filter tensor"); PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, - "Input must be with 4 or 5dimensions, i.e. NCHW or NCDHW"); + "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW"); if (bias) { @@ -87,33 +127,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector weights_tz = paddle::framework::vectorize2int(filter->dims()); int g = std::max(groups, 1); - if (g > 1) { - if (is_conv3d) { - int o = weights_tz[0]; - int i = weights_tz[1]; - int d = weights_tz[2]; - int h = weights_tz[3]; - int w = weights_tz[4]; - weights_tz.resize(6); - weights_tz[0] = g; - weights_tz[1] = o / g; - weights_tz[2] = i; - weights_tz[3] = d; - weights_tz[4] = h; - weights_tz[5] = w; - } else { - int o = weights_tz[0]; - int i = weights_tz[1]; - int h = weights_tz[2]; - int w = weights_tz[3]; - weights_tz.resize(5); - weights_tz[0] = g; - weights_tz[1] = o / g; - weights_tz[2] = i; - weights_tz[3] = h; - weights_tz[4] = w; - } - } + GetWeightsTz(weights_tz, g, is_conv3d); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); // Get unique name for storing MKLDNN primitives @@ -126,12 +140,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_format = input->format(); mkldnn::memory::format weights_format = - (g == 1) ? filter->format() : mkldnn::memory::format::goihw; - - if (is_conv3d) { - weights_format = - (g == 1) ? filter->format() : mkldnn::memory::format::goidhw; - } + GetWeightsFormat(filter->format(), g, is_conv3d); auto user_src_md = platform::MKLDNNMemDesc( {src_tz}, platform::MKLDNNGetDataType(), src_format); @@ -146,15 +155,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); - weights_format = - (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw; - if (is_conv3d) { chosen_memory_format = platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); - weights_format = - (g == 1) ? chosen_memory_format : mkldnn::memory::format::goidhw; } + weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d); auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); @@ -397,43 +402,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::vector weights_tz = paddle::framework::vectorize2int(filter->dims()); int g = std::max(groups, 1); - if (g > 1) { - if (is_conv3d) { - int o = weights_tz[0]; - int i = weights_tz[1]; - int d = weights_tz[2]; - int h = weights_tz[3]; - int w = weights_tz[4]; - weights_tz.resize(6); - weights_tz[0] = g; - weights_tz[1] = o / g; - weights_tz[2] = i; - weights_tz[3] = d; - weights_tz[4] = h; - weights_tz[5] = w; - } else { - int o = weights_tz[0]; - int i = weights_tz[1]; - int h = weights_tz[2]; - int w = weights_tz[3]; - weights_tz.resize(5); - weights_tz[0] = g; - weights_tz[1] = o / g; - weights_tz[2] = i; - weights_tz[3] = h; - weights_tz[4] = w; - } - } + GetWeightsTz(weights_tz, g, is_conv3d); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); auto src_format = input->format(); mkldnn::memory::format weights_format = - (g == 1) ? filter->format() : mkldnn::memory::format::goihw; - - if (is_conv3d) { - weights_format = - (g == 1) ? filter->format() : mkldnn::memory::format::goidhw; - } + GetWeightsFormat(filter->format(), g, is_conv3d); // Get an unique name from "argument" name of "Output" variable // as well as attributes of primitive to be created @@ -461,15 +435,11 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); - weights_format = - (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw; - if (is_conv3d) { chosen_memory_format = platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); - weights_format = - (g == 1) ? chosen_memory_format : mkldnn::memory::format::goidhw; } + weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d); auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 9abba5f512..d7b8766288 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -134,14 +134,14 @@ void Conv2DOpMaker::Make() { "The format of output tensor is X (one-dimensional) of size equal" "to the number of output channels. Only used with MKL-DNN.") .AsDispensable(); - AddOutput("Output", - "(Tensor) The output tensor of convolution operator. " - "The format of output tensor is also NCHW."); AddInput("ResidualData", "(Tensor) Tensor with residual data " "to which convolution output will be added." "Used with fuse_residual_connection fusion.") .AsDispensable(); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator. " + "The format of output tensor is also NCHW."); AddAttr>("strides", "(vector default:{1, 1}), the " "strides(h_stride, w_stride) of " @@ -251,14 +251,14 @@ void Conv3DOpMaker::Make() { "is the width of the filter." "If the groups attribute is greater than 1, C equals the number of " "input image channels divided by the groups."); - AddOutput("Output", - "(Tensor) The output tensor of convolution operator." - "The format of output tensor is also NCDHW."); AddInput("ResidualData", "(Tensor) Tensor with residual data " "to which convolution output will be added." "Used with fuse_residual_connection fusion.") .AsDispensable(); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator." + "The format of output tensor is also NCDHW."); AddAttr>("strides", "(vector, default:{1, 1, 1}), the " "strides(d_stride, h_stride, w_stride) of " From 387bac46b5e4d95e2888773975d1b6c3a906a588 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:09:43 +0000 Subject: [PATCH 161/719] refine code test=develop --- .../framework/details/eager_deletion_pass.cc | 10 +- .../fluid/framework/details/op_graph_view.cc | 2 + .../framework/details/reference_count_pass.cc | 14 +- .../details/reference_count_pass_helper.h | 10 +- .../scope_buffered_ssa_graph_executor.cc | 8 +- .../scope_buffered_ssa_graph_executor.h | 2 +- paddle/fluid/framework/executor.cc | 14 +- paddle/fluid/framework/executor.h | 6 +- paddle/fluid/framework/parallel_executor.cc | 153 ++++++++++-------- .../fluid/operators/controlflow/while_op.cc | 10 +- 10 files changed, 122 insertions(+), 107 deletions(-) diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 3a1b37e533..85991c71e6 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -31,10 +31,11 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( const auto &vars = graph->Get(kGraphVars); auto &ref_cnts = - Get>(kCurReferenceCount); + Get>(kRuntimeReferenceCount); const auto &last_live_ops = Get>(kLastLiveOpsOfVars); - auto &gcs = Get(kGarbageCollector); + auto &gcs = Get(kGarbageCollector); + const auto &places = Get>(kAllPlaces); ref_cnts = std::vector(vars.size()); @@ -58,7 +59,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); auto *eager_deletion_op = new EagerDeletionOpHandle( eager_deletion_node, op->GetScope(), op->GetPlace(), - std::move(var_names), gcs[op->GetScopeIdx()].get(), + std::move(var_names), gcs.at(places[op->GetScopeIdx()]).get(), &(ref_cnts[op->GetScopeIdx()])); auto it = std::find_if( @@ -90,6 +91,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( REGISTER_PASS(eager_deletion_pass, paddle::framework::details::EagerDeletionPass) - .RequirePassAttr(paddle::framework::details::kCurReferenceCount) + .RequirePassAttr(paddle::framework::details::kRuntimeReferenceCount) .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::details::kAllPlaces) .RequirePassAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc index 4838c4198f..b6b5ad42c4 100644 --- a/paddle/fluid/framework/details/op_graph_view.cc +++ b/paddle/fluid/framework/details/op_graph_view.cc @@ -23,6 +23,8 @@ namespace details { OpGraphView::OpGraphView(const std::vector &ops) { Build(ops); } void OpGraphView::Build(const std::vector &ops) { + preceding_ops_.clear(); + pending_ops_.clear(); for (auto &op : ops) { preceding_ops_[op]; pending_ops_[op]; diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 0c096e0980..f2c9dfb524 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,22 +29,22 @@ namespace paddle { namespace framework { namespace details { -class OpConnectionDetector { +class OpRelationDetector { public: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; - explicit OpConnectionDetector(const std::vector &all_ops) + explicit OpRelationDetector(const std::vector &all_ops) : graph_(all_ops) {} template - OpSet MaxNoDepOps(const OpSet &op_set) { - if (op_set.size() <= 1) return op_set; + OpSet MaxNoDepOps(const OpSet &op_set) const { using KeyType = typename OpSet::key_type; static_assert( std::is_base_of::type>::value, - "Key type of OpSet must be or derived of OpHandleBase"); + "Key type of OpSet must be OpHandleBase, or derived of OpHandleBase"); + if (op_set.size() <= 1) return op_set; std::vector ops(op_set.begin(), op_set.end()); OpSet ret; auto rels = GetRelations(ops); @@ -59,7 +59,7 @@ class OpConnectionDetector { private: std::vector> GetRelations( - const std::vector ops) { + const std::vector ops) const { std::unordered_map op_to_idx; for (size_t i = 0; i < ops.size(); ++i) { PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); @@ -144,7 +144,7 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( last_live_ops_of_vars = std::vector(vars.size()); ref_cnts = std::vector(vars.size()); - OpConnectionDetector detector(ir::FilterByNodeWrapper(*graph)); + OpRelationDetector detector(ir::FilterByNodeWrapper(*graph)); for (size_t i = 0; i < vars.size(); ++i) { for (auto &name_var_pair : vars[i]) { diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index 77846f7bdf..eb534f9701 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -33,12 +34,13 @@ using ReferenceCountMap = std::unordered_map; using AtomicReferenceCountMap = std::unordered_map>; -using GarbageCollectorList = - std::vector>>; +using GarbageCollectorMap = + std::map>>; -const char kGlobalReferenceCount[] = "reference_count"; -const char kCurReferenceCount[] = "current_reference_count"; +const char kGlobalReferenceCount[] = "global_reference_count"; +const char kRuntimeReferenceCount[] = "runtime_reference_count"; const char kGarbageCollector[] = "garbage_collector"; +const char kAllPlaces[] = "all_places"; using LastLiveOpsOfVars = std::unordered_map>; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index da5e277f27..b8775fc329 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -32,15 +32,15 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( var_infos_(std::move(var_infos)), places_(std::move(places)) { if (Graph().Has(details::kGarbageCollector)) { - gc_ = &(Graph().Get(details::kGarbageCollector)); + gc_ = &(Graph().Get(details::kGarbageCollector)); } } void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() { if (gc_) { - for (auto &gc : *gc_) { - gc->Wait(); - gc->Reset(); + for (auto &gc_pair : *gc_) { + gc_pair.second->Wait(); + gc_pair.second->Reset(); } } } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 4d52183a20..6086a219e0 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -60,7 +60,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { std::vector var_infos_; std::vector places_; - GarbageCollectorList* gc_{nullptr}; + GarbageCollectorMap* gc_{nullptr}; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index f443c2d8cf..04425a5983 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -56,13 +56,7 @@ static std::unordered_map GetNonPersistableReferenceCounts( type != proto::VarType::LOD_TENSOR_ARRAY) { continue; } - - auto it = ref_cnts.find(name); - if (it != ref_cnts.end()) { - ++it->second; - } else { - ref_cnts[name] = 1; - } + ++ref_cnts[name]; } } }; @@ -79,8 +73,8 @@ ExecutorPrepareContext::ExecutorPrepareContext( const std::vector& skip_ref_cnt_vars) : prog_(prog), block_id_(block_id) { if (GetEagerDeletionThreshold() >= 0) { - ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), - skip_ref_cnt_vars); + global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), + skip_ref_cnt_vars); } } @@ -443,7 +437,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, if (gc) { DeleteUnusedTensors(*local_scope, op.get(), gc.get(), - &(ctx->cur_ref_cnts_)); + &(ctx->runtime_ref_cnts_)); } } diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 412ebd1904..5a040ac641 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -34,14 +34,14 @@ struct ExecutorPrepareContext { ~ExecutorPrepareContext(); - void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; } + void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; } const framework::ProgramDesc& prog_; size_t block_id_; std::vector> ops_; - std::unordered_map ref_cnts_; - std::unordered_map cur_ref_cnts_; + std::unordered_map global_ref_cnts_; + std::unordered_map runtime_ref_cnts_; }; class Executor { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 3d466e44a1..dfd031f119 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -51,11 +51,22 @@ class ParallelExecutorPrivate { } } - void ResetRuntimeReferenceCount() { - for (size_t i = 0; i < rt_ref_cnts_.size(); ++i) { - for (auto &pair : rt_ref_cnts_[i]) { - rt_cur_ref_cnts_[i][pair.first] = pair.second; + std::unique_ptr PrepareGCAndRefCnts( + std::unique_ptr graph, size_t max_memory_size); + + inline bool HasGarbageCollectors() const { return !gcs_.empty(); } + + void ResetRuntimeReferenceCount(const std::vector &fetch_tensors, + const std::string &fetched_var_name) { + for (size_t i = 0; i < runtime_ref_cnts_.size(); ++i) { + for (auto &pair : global_ref_cnts_[i]) { + runtime_ref_cnts_[i][pair.first] = pair.second; + } + + for (auto &fetch_name : fetch_tensors) { + runtime_ref_cnts_[i].erase(fetch_name); } + runtime_ref_cnts_[i].erase(fetched_var_name); } } @@ -71,14 +82,75 @@ class ParallelExecutorPrivate { bool use_cuda_; bool use_all_reduce_; - // rt_ref_cnts_ is only initialized when ParallelExecutor constructs, and then - // keeps unchanged - // Before each iteration, rt_cur_ref_cnts_ is reset to ref_cnts_ - std::vector rt_ref_cnts_; - std::vector rt_cur_ref_cnts_; - details::GarbageCollectorList gcs_; + // global_ref_cnts_ is only initialized when ParallelExecutor constructs, and + // then keeps unchanged + // Before each iteration, runtime_ref_cnts_ is reset to global_ref_cnts_ + std::vector global_ref_cnts_; + std::vector runtime_ref_cnts_; + details::GarbageCollectorMap gcs_; }; +std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( + std::unique_ptr graph, size_t max_memory_size) { + for (size_t i = 0; i < places_.size(); ++i) { + auto &place = places_[i]; + if (gcs_.count(place) > 0) { + continue; + } +#ifdef PADDLE_WITH_CUDA + GarbageCollector *gc = nullptr; + if (platform::is_gpu_place(place)) { + if (IsFastEagerDeletionModeEnabled()) { + gc = new UnsafeFastGPUGarbageCollector( + boost::get(place), max_memory_size); + } else { + gc = new StreamGarbageCollector( + boost::get(place), max_memory_size); + } + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; + } else if (platform::is_cpu_place(place)) { +#endif + gc = new CPUGarbageCollector( + boost::get(place), max_memory_size); + VLOG(10) << "Created GarbageCollector at " << place; +#ifdef PADDLE_WITH_CUDA + } +#endif + + if (gc) { + gcs_[place] = std::unique_ptr>(gc); + } + } + + if (gcs_.empty()) { + std::vector last_live_ops_of_vars; + + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, + &global_ref_cnts_); + ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + graph = ref_cnt_pass->Apply(std::move(graph)); + VLOG(10) << "ReferenceCountPass Applied"; + + auto eager_deletion_pass = + ir::PassRegistry::Instance().Get("eager_deletion_pass"); + eager_deletion_pass->SetNotOwned(details::kRuntimeReferenceCount, + &runtime_ref_cnts_); + eager_deletion_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); + graph = eager_deletion_pass->Apply(std::move(graph)); + VLOG(10) << "EagerDeletionPass Applied"; + + graph->SetNotOwned(details::kGarbageCollector, &gcs_); + } + + return graph; +} + std::vector &ParallelExecutor::GetLocalScopes() { return member_->local_scopes_; } @@ -153,54 +225,8 @@ ParallelExecutor::ParallelExecutor( auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { - size_t place_num = member_->places_.size(); - for (size_t i = 0; i < place_num; ++i) { - auto &place = member_->places_[i]; -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(place)) { - if (IsFastEagerDeletionModeEnabled()) { - member_->gcs_.emplace_back(new UnsafeFastGPUGarbageCollector( - boost::get(place), max_memory_size)); - } else { - member_->gcs_.emplace_back(new StreamGarbageCollector( - boost::get(place), max_memory_size)); - } - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; - } else if (platform::is_cpu_place(place)) { -#endif - member_->gcs_.emplace_back(new CPUGarbageCollector( - boost::get(place), max_memory_size)); - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; -#ifdef PADDLE_WITH_CUDA - } -#endif - } - } - - if (!member_->gcs_.empty()) { - std::vector last_live_ops_of_vars; - - auto ref_cnt_pass = - ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, - &(member_->rt_ref_cnts_)); - ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - graph = ref_cnt_pass->Apply(std::move(graph)); - VLOG(10) << "ReferenceCountPass Applied"; - - auto eager_deletion_pass = - ir::PassRegistry::Instance().Get("eager_deletion_pass"); - eager_deletion_pass->SetNotOwned(details::kCurReferenceCount, - &(member_->rt_cur_ref_cnts_)); - eager_deletion_pass->SetNotOwned(details::kGarbageCollector, - &(member_->gcs_)); - eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - graph = eager_deletion_pass->Apply(std::move(graph)); - VLOG(10) << "EagerDeletionPass Applied"; - - graph->SetNotOwned(details::kGarbageCollector, &(member_->gcs_)); + graph = member_->PrepareGCAndRefCnts(std::move(graph), + static_cast(max_memory_size)); } // Step 3. Create vars in each scope. Passes may also create new vars. @@ -316,15 +342,8 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { platform::RecordBlock b(0); - if (!member_->gcs_.empty()) { - member_->ResetRuntimeReferenceCount(); - size_t n = member_->rt_ref_cnts_.size(); - for (size_t i = 0; i < n; ++i) { - for (auto &fetch_name : fetch_tensors) { - member_->rt_cur_ref_cnts_[i].erase(fetch_name); - } - member_->rt_cur_ref_cnts_[i].erase(fetched_var_name); - } + if (member_->HasGarbageCollectors()) { + member_->ResetRuntimeReferenceCount(fetch_tensors, fetched_var_name); } auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index da7cad82d8..06920a47ee 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -74,9 +74,7 @@ class WhileOp : public framework::OperatorBase { bool is_test = Attr("is_test"); auto &skip_vars = Attr>(kSkipEagerDeletionVars); - if (framework::GetEagerDeletionThreshold() >= 0) { - VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); - } + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data()[0]) { @@ -144,9 +142,7 @@ class WhileGradOp : public framework::OperatorBase { auto *program = block->Program(); auto &skip_vars = Attr>(kSkipEagerDeletionVars); - if (framework::GetEagerDeletionThreshold() >= 0) { - VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); - } + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = @@ -369,7 +365,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); - /* The followi_ng codes are used in eager deletion mode */ + /* The following codes are used in eager deletion mode */ std::unordered_set bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { std::unordered_set fwd_skip_vars; From 644baa2e45b64f5a52e237ca1981cb30a5043e0c Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:30:17 +0000 Subject: [PATCH 162/719] fix code bug in CPU compilation test=develop --- paddle/fluid/framework/parallel_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfd031f119..fd2bcb8848 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,8 +97,8 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( if (gcs_.count(place) > 0) { continue; } -#ifdef PADDLE_WITH_CUDA GarbageCollector *gc = nullptr; +#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { if (IsFastEagerDeletionModeEnabled()) { gc = new UnsafeFastGPUGarbageCollector( From 8095fb5e686d3e32f1838dfe7fbf4d0108ef1f25 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:30:17 +0000 Subject: [PATCH 163/719] fix code bug in CPU compilation test=develop --- paddle/fluid/framework/parallel_executor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfd031f119..e51b1f1f73 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,8 +97,8 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( if (gcs_.count(place) > 0) { continue; } -#ifdef PADDLE_WITH_CUDA GarbageCollector *gc = nullptr; +#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { if (IsFastEagerDeletionModeEnabled()) { gc = new UnsafeFastGPUGarbageCollector( @@ -122,7 +122,7 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( } } - if (gcs_.empty()) { + if (!gcs_.empty()) { std::vector last_live_ops_of_vars; auto ref_cnt_pass = From 2538ef64f1ead35d4b2c84d68d427dce8a92cb48 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 7 Dec 2018 13:22:19 +0800 Subject: [PATCH 164/719] Revert "Revert "Imperative"" --- paddle/fluid/CMakeLists.txt | 1 + paddle/fluid/framework/feed_fetch_method.cc | 9 + paddle/fluid/framework/feed_fetch_method.h | 2 + paddle/fluid/framework/ir/graph.cc | 5 +- paddle/fluid/imperative/CMakeLists.txt | 3 + paddle/fluid/imperative/engine.cc | 53 +++++ paddle/fluid/imperative/engine.h | 39 ++++ paddle/fluid/imperative/layer.cc | 221 ++++++++++++++++++ paddle/fluid/imperative/layer.h | 102 ++++++++ paddle/fluid/imperative/tracer.cc | 19 ++ paddle/fluid/imperative/tracer.h | 128 ++++++++++ paddle/fluid/pybind/CMakeLists.txt | 5 +- paddle/fluid/pybind/imperative.cc | 36 +++ paddle/fluid/pybind/imperative.h | 53 +++++ paddle/fluid/pybind/pybind.cc | 39 ++++ python/paddle/fluid/__init__.py | 2 + python/paddle/fluid/framework.py | 54 ++++- python/paddle/fluid/imperative/__init__.py | 25 ++ python/paddle/fluid/imperative/base.py | 56 +++++ python/paddle/fluid/imperative/layers.py | 44 ++++ python/paddle/fluid/layer_helper.py | 23 +- python/paddle/fluid/layers/nn.py | 3 +- .../fluid/tests/unittests/test_imperative.py | 52 +++++ python/setup.py.in | 1 + tools/print_signatures.py | 4 + 25 files changed, 960 insertions(+), 19 deletions(-) create mode 100644 paddle/fluid/imperative/CMakeLists.txt create mode 100644 paddle/fluid/imperative/engine.cc create mode 100644 paddle/fluid/imperative/engine.h create mode 100644 paddle/fluid/imperative/layer.cc create mode 100644 paddle/fluid/imperative/layer.h create mode 100644 paddle/fluid/imperative/tracer.cc create mode 100644 paddle/fluid/imperative/tracer.h create mode 100644 paddle/fluid/pybind/imperative.cc create mode 100644 paddle/fluid/pybind/imperative.h create mode 100644 python/paddle/fluid/imperative/__init__.py create mode 100644 python/paddle/fluid/imperative/base.py create mode 100644 python/paddle/fluid/imperative/layers.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative.py diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 6b526f0103..595454e90b 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) +add_subdirectory(imperative) add_subdirectory(operators) add_subdirectory(string) add_subdirectory(recordio) diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 3e9353f5cf..6338be75a4 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -16,7 +16,9 @@ limitations under the License. */ #include #include #include "glog/logging.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { @@ -53,5 +55,12 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, return tensor; } +LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) { + Variable* var = scope.FindVar(var_name); + PADDLE_ENFORCE(var, "%s no in scope", var_name); + PADDLE_ENFORCE(var->IsType(), "Only support lod tensor now."); + return *var->GetMutable(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index 7f504bfd23..031f8e01aa 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, size_t index); +LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index fc91564bba..8679118fe2 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -38,9 +38,8 @@ void CheckProgram(const ProgramDesc &program) { switch (role_id) { case _INT(OpRole::kForward): if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) - << "Cannot add backward operator before forward operator %s." - << op->Type(); + LOG(ERROR) << "Cannot add backward operator before forward operator " + << op->Type(); } break; case _INT(OpRole::kBackward): diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt new file mode 100644 index 0000000000..373d292b44 --- /dev/null +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -0,0 +1,3 @@ +cc_library(layer SRCS layer.cc DEPS proto_desc operator) +cc_library(tracer SRCS tracer.cc DEPS proto_desc) +cc_library(engine SRCS engine.cc) diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc new file mode 100644 index 0000000000..de7ab0e591 --- /dev/null +++ b/paddle/fluid/imperative/engine.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/engine.h" + +#include // NOLINT +#include + +#include "glog/logging.h" + +namespace paddle { +namespace imperative { + +static std::once_flag init_engine; +static Engine* engine; + +class DummyEngine : public Engine { + public: + void Enqueue(Runnable* runnable) override { + queued_runnables_.push_back(runnable); + } + + size_t Size() const override { return queued_runnables_.size(); } + + void Sync() override { + for (Runnable* l : queued_runnables_) { + LOG(INFO) << "running " << reinterpret_cast(l); + } + queued_runnables_.clear(); + } + + private: + std::vector queued_runnables_; +}; + +Engine* GetEngine() { + std::call_once(init_engine, []() { engine = new DummyEngine(); }); + return engine; +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h new file mode 100644 index 0000000000..a1dfa5bda3 --- /dev/null +++ b/paddle/fluid/imperative/engine.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace paddle { +namespace imperative { + +struct Runnable {}; + +class Engine { + public: + virtual ~Engine() {} + + virtual void Enqueue(Runnable* runnable) = 0; + + virtual size_t Size() const = 0; + + virtual void Sync() = 0; +}; + +Engine* GetEngine(); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc new file mode 100644 index 0000000000..6125037680 --- /dev/null +++ b/paddle/fluid/imperative/layer.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/layer.h" +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace imperative { + +using framework::Variable; + +void AddTo(Variable* src, Variable* dst) { + framework::LoDTensor* dst_tensor = dst->GetMutable(); + framework::LoDTensor* src_tensor = src->GetMutable(); + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", + dst_tensor->numel(), src_tensor->numel()); + float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); + const float* src_data = src_tensor->data(); + for (size_t i = 0; i < src_tensor->numel(); ++i) { + dst_data[i] += src_data[i]; + } +} + +class Autograd { + public: + explicit Autograd(framework::Scope* scope) : scope_(scope) {} + + void RunBackward(VarBase* var) { + PADDLE_ENFORCE(var->pre_op_->op_desc_); + // TODO(panyx0718): Only create for vars that "require_grad" + (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_; + + std::deque ready; + ready.push_back(var->pre_op_); + + std::map dep_counts = ComputeDepCounts(var->pre_op_); + + while (!ready.empty()) { + OpBase* ready_op = ready.front(); + ready.pop_front(); + std::vector input_grads = ready_op->ApplyGrad(scope_); + + for (size_t i = 0; i < input_grads.size(); ++i) { + if (!input_grads[i]) continue; + OpBase* pre_op = ready_op->pre_ops_->at(i); + if (!pre_op) continue; + + dep_counts[pre_op] -= 1; + PADDLE_ENFORCE(dep_counts[pre_op] >= 0); + bool pre_op_ready = dep_counts[pre_op] == 0; + if (pre_op_ready) { + ready.push_back(pre_op); + } + } + } + } + + private: + std::map ComputeDepCounts(OpBase* op) { + std::map ret; + + std::deque queue; + queue.push_back(op); + std::unordered_set visited; + visited.insert(op); + while (!queue.empty()) { + OpBase* candidate = queue.front(); + queue.pop_front(); + for (OpBase* pre_op : *(candidate->pre_ops_)) { + if (!pre_op) continue; + if (visited.find(pre_op) == visited.end()) { + visited.insert(pre_op); + queue.push_back(pre_op); + } + ret[pre_op] += 1; + } + } + + return ret; + } + + framework::Scope* scope_; +}; + +framework::Variable* CreateVariable(const std::string& name, + const framework::DDim& dim, float val, + framework::Scope* scope, + bool random_name = true) { + std::string varname = name; + if (random_name) { + std::mt19937 rng; + rng.seed(std::random_device()()); + std::uniform_int_distribution dist6( + 1, std::numeric_limits::max()); + int id = dist6(rng); + varname = string::Sprintf("%s@%d", varname, id); + } + + VLOG(3) << "creating var " << varname; + framework::Variable* var = scope->Var(varname); + framework::LoDTensor* tensor = var->GetMutable(); + + float* data = tensor->mutable_data(dim, platform::CPUPlace()); + std::fill(data, data + tensor->numel(), val); + return var; +} + +framework::LoDTensor& VarBase::Grad() { + VLOG(3) << "get var grad " << var_desc_->Name(); + return *grads_->GetMutable(); +} + +void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { + VLOG(3) << "apply var grad " << var_desc_->Name() << " " + << grad->Get().data()[0]; + if (!grads_) { + grads_ = + CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()), + var_->Get().dims(), 0.0, scope); + } + AddTo(grad, grads_); + VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " " + << grads_->Get().data()[0]; +} + +std::vector OpBase::ApplyGrad(framework::Scope* scope) { + VLOG(3) << "op grad " << grad_op_desc_->Type(); + + for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { + if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { + // grad op inputs can be forward inputs, so not in grad_to_var. + continue; + } + VLOG(3) << "op grad in var " << grad_invar; + block_->FindRecursiveOrCreateVar(grad_invar); + framework::Variable* var = scope->Var(grad_invar); + const std::string& invar = grad_to_var_->at(grad_invar); + for (VarBase* varbase : *output_vars_) { + // Use the accumulated grads_ by sharing the input with grads_. + if (varbase->var_desc_->Name() == invar) { + var->GetMutable()->ShareDataWith( + varbase->grads_->Get()); + break; + } + } + } + + for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { + VLOG(3) << "grad outvar " << outvar; + block_->FindRecursiveOrCreateVar(outvar); + framework::Variable* var = scope->Var(outvar); + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block_->FindVar(outvar); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + } + grad_op_desc_->InferShape(*block_); + grad_op_desc_->InferVarType(block_); + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc_); + + opbase->Run(*scope, platform::CPUPlace()); + + // `ret` matches exactly with `input_vars_` of forward op. + std::vector ret; + for (size_t i = 0; i < input_vars_->size(); ++i) { + bool found = false; + for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { + Variable* var = scope->FindVar(outvar); + VarBase* origin_var = (*input_vars_)[i]; + std::string orig_var = grad_to_var_->at(outvar); + PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); + VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; + origin_var->ApplyGrad(scope, var); + found = true; + ret.push_back(var); + // TODO(panyx0718): There might be another outvar with the same name. + // In that case, it doesn't matter the first one or the second one is + // used. + break; + } + if (!found) { + ret.push_back(nullptr); + } + } + return ret; +} + +void VarBase::RunBackward(framework::Scope* scope) { + grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()), + var_->Get().dims(), 1.0, scope, + false); + if (!pre_op_) return; + Autograd(scope).RunBackward(this); +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h new file mode 100644 index 0000000000..85a71ca83d --- /dev/null +++ b/paddle/fluid/imperative/layer.h @@ -0,0 +1,102 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace imperative { + +class OpBase; + +class VarBase { + public: + VarBase() + : pre_op_(nullptr), + pre_op_out_idx_(-1), + var_desc_(nullptr), + var_(nullptr), + grads_(nullptr) {} + + virtual ~VarBase() {} + + void ApplyGrad(framework::Scope* scope, framework::Variable* grad); + + void RunBackward(framework::Scope* scope); + + framework::LoDTensor& Grad(); + + OpBase* pre_op_; + int pre_op_out_idx_; + + framework::VarDesc* var_desc_; + framework::Variable* var_; + framework::Variable* grads_; +}; + +class OpBase { + public: + OpBase() + : input_vars_(new std::vector()), + output_vars_(new std::vector()), + pre_ops_(new std::vector()), + pre_ops_out_idx_(new std::vector()), + op_desc_(nullptr), + grad_op_desc_(nullptr) {} + + virtual ~OpBase() { + delete input_vars_; + delete output_vars_; + + delete pre_ops_; + delete pre_ops_out_idx_; + + if (grad_op_desc_) delete grad_op_desc_; + if (grad_to_var_) delete grad_to_var_; + } + + std::vector ApplyGrad(framework::Scope* scope); + + std::vector* input_vars_; + std::vector* output_vars_; + std::vector* pre_ops_; + std::vector* pre_ops_out_idx_; + framework::OpDesc* op_desc_; + + framework::OpDesc* grad_op_desc_; + std::unordered_map* grad_to_var_; + framework::BlockDesc* block_; +}; + +class Layer { + public: + virtual ~Layer() {} + + virtual std::vector Forward(const std::vector& inputs) { + std::vector vars; + return vars; + } + + virtual void Backward() { LOG(ERROR) << "To support customize"; } +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc new file mode 100644 index 0000000000..f64f9e72c4 --- /dev/null +++ b/paddle/fluid/imperative/tracer.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/tracer.h" + +namespace paddle { +namespace imperative {} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h new file mode 100644 index 0000000000..433d07c0e5 --- /dev/null +++ b/paddle/fluid/imperative/tracer.h @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/imperative/engine.h" +#include "paddle/fluid/imperative/layer.h" + +namespace paddle { +namespace imperative { + +void CreateGradOp(const framework::OpDesc& op_desc, + const std::unordered_set& no_grad_set, + const std::vector& grad_sub_block, + framework::OpDesc** grad_op_desc, + std::unordered_map* grad_to_var) { + std::vector> grad_op_descs = + framework::OpInfoMap::Instance() + .Get(op_desc.Type()) + .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); + PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now."); + // TODO(panyx0718): Leak? + *grad_op_desc = grad_op_descs[0].release(); +} + +class Tracer { + public: + explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { + root_scope_ = new framework::Scope(); + scopes_[root_block_] = root_scope_; + } + + virtual ~Tracer() { delete root_scope_; } + + void Trace(OpBase* op, const std::vector& inputs, + const std::vector& outputs, + framework::BlockDesc* block) { + framework::Scope* scope = GetScope(block); + framework::OpDesc* op_desc = op->op_desc_; + VLOG(3) << "tracer tracing " << op_desc->Type(); + op_desc->InferShape(*block); + op_desc->InferVarType(block); + std::unique_ptr op_base = + framework::OpRegistry::CreateOp(*op_desc); + + *op->input_vars_ = inputs; + for (VarBase* input : inputs) { + const std::string vname = input->var_desc_->Name(); + framework::Variable* var = scope->Var(vname); + input->var_ = var; + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block->FindVar(vname); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + if (input->pre_op_) { + op->pre_ops_->push_back(input->pre_op_); + op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_); + } else { + op->pre_ops_->push_back(nullptr); + } + } + + *op->output_vars_ = outputs; + for (size_t i = 0; i < outputs.size(); ++i) { + const std::string vname = outputs[i]->var_desc_->Name(); + framework::Variable* var = scope->Var(vname); + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block->FindVar(vname); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + outputs[i]->var_ = var; + outputs[i]->pre_op_ = op; + outputs[i]->pre_op_out_idx_ = i; + } + op_base->Run(*scope, platform::CPUPlace()); + framework::OpDesc* grad_op_desc; + auto grad_to_var = new std::unordered_map(); + CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); + op->grad_op_desc_ = grad_op_desc; + op->grad_to_var_ = grad_to_var; + op->block_ = block; + } + + framework::Scope* GetScope(framework::BlockDesc* block) { + if (scopes_.find(block) != scopes_.end()) { + return scopes_.at(block); + } + framework::BlockDesc* parent_block = block->ParentBlock(); + PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); + framework::Scope* scope = &scopes_[parent_block]->NewScope(); + scopes_[block] = scope; + return scope; + } + + private: + std::map scopes_; + framework::BlockDesc* root_block_; + framework::Scope* root_scope_; +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index d602613fc8..b8954cb126 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,7 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler) -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc) + if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc new file mode 100644 index 0000000000..34e9c897d9 --- /dev/null +++ b/paddle/fluid/pybind/imperative.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/imperative/tracer.h" + +namespace paddle { +namespace pybind { + +// Bind Methods +void BindTracer(pybind11::module *m) { + pybind11::class_(*m, "Tracer", "") + .def("__init__", + [](imperative::Tracer &self, framework::BlockDesc *root_block) { + new (&self) imperative::Tracer(root_block); + }) + .def("trace", &imperative::Tracer::Trace) + .def("get_scope", &imperative::Tracer::GetScope, + pybind11::return_value_policy::reference); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h new file mode 100644 index 0000000000..7a9d3a01ea --- /dev/null +++ b/paddle/fluid/pybind/imperative.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include +#include "paddle/fluid/imperative/layer.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +class PyLayer : public imperative::Layer { + public: + using imperative::Layer::Layer; // Inherit constructors + + std::vector Forward( + const std::vector& inputs) override { + PYBIND11_OVERLOAD(std::vector, Layer, Forward, + inputs); // NOLINT + } + + void Backward() override { + PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT + } +}; + +class PyOpBase : public imperative::OpBase { + public: + using imperative::OpBase::OpBase; // Inherit constructors +}; + +class PyVarBase : public imperative::VarBase { + public: + using imperative::VarBase::VarBase; // Inherit constructors +}; + +void BindTracer(pybind11::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index fc7991d297..ea07372a28 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" @@ -45,6 +46,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/async_executor_py.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/recordio.h" @@ -100,6 +102,42 @@ PYBIND11_MODULE(core, m) { BindException(&m); + py::class_(m, "VarBase", R"DOC()DOC") + .def(py::init<>()) + .def("_run_backward", + [](imperative::VarBase &self, framework::Scope *scope) { + self.RunBackward(scope); + }) + .def("_grad", &imperative::VarBase::Grad) + .def_property( + "desc", + [](const imperative::VarBase &self) { return self.var_desc_; }, + [](imperative::VarBase &self, framework::VarDesc *var_desc) { + self.var_desc_ = var_desc; + }, + py::return_value_policy::reference); + + py::class_(m, "OpBase", R"DOC()DOC") + .def(py::init<>()) + .def_property( + "desc", [](const imperative::OpBase &self) { return self.op_desc_; }, + [](imperative::OpBase &self, framework::OpDesc *op_desc) { + if (op_desc) { + self.op_desc_ = op_desc; + } + }, + py::return_value_policy::reference); + + py::class_ layer(m, "Layer"); + layer.def(py::init<>()) + .def("forward", + [](imperative::Layer &self, + const std::vector &inputs) { + return self.Forward(inputs); + }) + .def("backward", &imperative::Layer::Backward); + BindTracer(&m); + py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) @@ -601,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); + m.def("get_variable_tensor", framework::GetVariableTensor); m.def("_is_program_version_supported", IsProgramVersionSupported); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2a53519188..52417a1eaf 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -34,6 +34,7 @@ from . import io from . import evaluator from . import initializer from . import layers +from . import imperative from . import contrib from . import nets from . import optimizer @@ -67,6 +68,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'initializer', 'layers', 'contrib', + 'imperative', 'transpiler', 'nets', 'optimizer', diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b156db53d2..9e6345f148 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -18,6 +18,7 @@ import collections import contextlib import re import six +import sys import numpy as np @@ -49,6 +50,16 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() +_imperative_tracer_ = None + + +def _in_imperative_mode(): + return _imperative_tracer_ is not None + + +def _imperative_tracer(): + return _imperative_tracer_ + class NameScope(object): def __init__(self, name="", parent=None): @@ -202,7 +213,7 @@ def _debug_string_(proto, throw_on_error=True): return proto.__str__() -class Variable(object): +class Variable(core.VarBase): """ In Fluid, every input and output of an operator is a variable. In most cases, variables are used for holding different kinds of data or training @@ -266,6 +277,7 @@ class Variable(object): stop_gradient=False, is_data=False, **kwargs): + core.VarBase.__init__(self) self.block = block self.error_clip = error_clip @@ -346,6 +358,18 @@ class Variable(object): self.stop_gradient = stop_gradient self.is_data = is_data + def _numpy(self): + scope = _imperative_tracer().get_scope(self.block.desc) + tensor = core.get_variable_tensor(scope, self.desc.name()) + return np.array(tensor) + + def _backward(self): + scope = _imperative_tracer().get_scope(self.block.desc) + self._run_backward(scope) + + def _gradient(self): + return np.array(self._grad()) + def __str__(self): return self.to_string(True) @@ -492,7 +516,7 @@ class OpProtoHolder(object): } -class Operator(object): +class Operator(core.OpBase): """ In Fluid, all the operation are represented by Operator, and Operator is regarded as a build in an instruction of a Block. Users can use the @@ -548,6 +572,7 @@ class Operator(object): inputs=None, outputs=None, attrs=None): + core.OpBase.__init__(self) self.block = block self.desc = desc # note: not add self.attrs here: @@ -587,6 +612,7 @@ class Operator(object): return True return False + self.inputs = [] if inputs is not None: for in_proto in proto.inputs: found = find_name(inputs, in_proto.name) @@ -613,6 +639,13 @@ class Operator(object): else: self.desc.set_input(in_proto.name, []) + for inp in inputs.values(): + if isinstance(inp, Variable): + self.inputs.append(inp) + elif isinstance(inp, list) or isinstance(inp, tuple): + self.inputs.extend(inp[:]) + + self.outputs = [] if outputs is not None: given = set() need = set() @@ -641,6 +674,12 @@ class Operator(object): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) + for out in outputs.values(): + if isinstance(out, Variable): + self.outputs.append(out) + elif isinstance(out, list) or isinstance(out, tuple): + self.outputs.extend(out[:]) + if op_attrs is not None: if not isinstance(op_attrs, dict): raise TypeError("'attrs' should be a dict.") @@ -1206,6 +1245,8 @@ class Block(object): """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) + if _in_imperative_mode(): + _imperative_tracer().trace(op, op.inputs, op.outputs, self.desc) self.ops.append(op) return op @@ -2209,3 +2250,12 @@ def _get_var(name, program=None): assert isinstance(program, Program) return program.global_block().var(name) + + +@contextlib.contextmanager +def _imperative_guard(tracer): + global _imperative_tracer_ + tmp_trace = _imperative_tracer_ + _imperative_tracer_ = tracer + yield + _imperative_tracer_ = tmp_trace diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py new file mode 100644 index 0000000000..922308b6b1 --- /dev/null +++ b/python/paddle/fluid/imperative/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import base +from .base import * + +from . import layers +from .layers import * + +__all__ = [] +__all__ += layers.__all__ +__all__ += base.__all__ diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py new file mode 100644 index 0000000000..15d38ddb56 --- /dev/null +++ b/python/paddle/fluid/imperative/base.py @@ -0,0 +1,56 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import contextlib +import numpy as np + +from paddle.fluid import core +from paddle.fluid import framework + +__all__ = ['enabled', 'guard', 'to_variable'] + + +def enabled(): + return framework._in_imperative_mode() + + +@contextlib.contextmanager +def guard(): + train = framework.Program() + startup = framework.Program() + tracer = core.Tracer(train.current_block().desc) + with framework.program_guard(train, startup): + with framework.unique_name.guard(): + with framework._imperative_guard(tracer): + yield + + +def to_variable(value, block=None): + if isinstance(value, np.ndarray): + if not block: + block = framework.default_main_program().current_block() + py_var = framework.Variable( + block, + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=value.shape, + dtype=value.dtype) + scope = framework._imperative_tracer().get_scope(block.desc) + var = scope.var(py_var.name) + tensor = var.get_tensor() + tensor.set(value, core.CPUPlace()) + return py_var + elif isinstance(value, framework.Variable): + return value + else: + raise ValueError("Unsupported type %s" % type(value)) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py new file mode 100644 index 0000000000..1a28f7f4ae --- /dev/null +++ b/python/paddle/fluid/imperative/layers.py @@ -0,0 +1,44 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import sys +import numpy as np + +from paddle.fluid import core +from paddle.fluid import framework +from paddle.fluid.imperative import base + +__all__ = ['PyLayer'] + + +class PyLayer(core.Layer): + def __init__(self): + pass + + def __call__(self, inputs): + # TODO(panyx0718): Support declarative mode as well. + assert base.enabled() + if not isinstance(inputs, list) and not isinstance(inputs, tuple): + inputs = [inputs] + + var_inputs = [] + for x in inputs: + py_var = base.to_variable(x) + var_inputs.append(py_var) + outputs = self.forward(var_inputs) + return outputs + + def forward(self, inputs): + return [] diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index dc317de9ab..74b4a977db 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -17,10 +17,13 @@ from __future__ import print_function import copy import itertools import six +import sys +import numpy as np from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from . import unique_name from paddle.fluid.initializer import Constant, Xavier +from paddle.fluid.imperative import base from .param_attr import ParamAttr, WeightNormParamAttr from . import core from six.moves import zip @@ -46,23 +49,21 @@ class LayerHelper(object): def startup_program(self): return default_startup_program() + def to_variable(self, x): + return base.to_variable(x, self.main_program.current_block()) + def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) def multiple_input(self, input_param_name='input'): inputs = self.kwargs.get(input_param_name, []) - type_error = TypeError( - "Input of {0} layer should be Variable or sequence of Variable". - format(self.layer_type)) - if isinstance(inputs, Variable): - inputs = [inputs] - elif not isinstance(inputs, list) and not isinstance(inputs, tuple): - raise type_error + ret = [] + if isinstance(inputs, list) or isinstance(inputs, tuple): + for inp in inputs: + ret.append(self.to_variable(inp)) else: - for each in inputs: - if not isinstance(each, Variable): - raise type_error - return inputs + ret.append(self.to_variable(inputs)) + return ret def input(self, input_param_name='input'): inputs = self.multiple_input(input_param_name) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4833212d31..fac7538a6a 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6623,7 +6623,8 @@ def relu(x, name=None): helper = LayerHelper('relu', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out}) + helper.append_op( + type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out}) return out diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py new file mode 100644 index 0000000000..b5b6305155 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -0,0 +1,52 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import sys +import numpy as np + +import paddle.fluid as fluid +from paddle.fluid import core + + +class MyLayer(fluid.imperative.PyLayer): + def __init__(self): + super(MyLayer, self).__init__() + + def forward(self, inputs): + x = fluid.layers.relu(inputs[0]) + self._x_for_debug = x + return [fluid.layers.elementwise_mul(x, x)] + + +class TestImperative(unittest.TestCase): + def test_layer(self): + with fluid.imperative.guard(): + cl = core.Layer() + cl.forward([]) + l = fluid.imperative.PyLayer() + l.forward([]) + + def test_layer_in_out(self): + with fluid.imperative.guard(): + l = MyLayer() + x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] + self.assertIsNotNone(x) + sys.stderr.write("%s output: %s\n" % (x, x._numpy())) + x._backward() + sys.stderr.write("grad %s\n" % l._x_for_debug._gradient()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 5aee26b638..0eb69cdb5c 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -101,6 +101,7 @@ packages=['paddle', 'paddle.dataset', 'paddle.reader', 'paddle.fluid', + 'paddle.fluid.imperative', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', diff --git a/tools/print_signatures.py b/tools/print_signatures.py index 5c5266f904..7e61dde0a4 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -27,6 +27,8 @@ import pydoc member_dict = collections.OrderedDict() +experimental_namespace = {"paddle.fluid.imperative"} + def visit_member(parent_name, member): cur_name = ".".join([parent_name, member.__name__]) @@ -51,6 +53,8 @@ def visit_member(parent_name, member): def visit_all_module(mod): + if (mod.__name__ in experimental_namespace): + return for member_name in ( name for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod)) From b653ed05163e9f6d47208d5f46bee18ec57a2645 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 7 Dec 2018 13:53:31 +0800 Subject: [PATCH 165/719] add prefetch and remvoe selectedrows of bias --- paddle/fluid/operators/nce_op.cc | 8 +-- paddle/fluid/operators/nce_op.h | 47 ++++----------- python/paddle/fluid/layers/nn.py | 9 ++- .../tests/unittests/test_dist_transpiler.py | 59 +++++++++++++++++-- .../fluid/transpiler/distribute_transpiler.py | 3 +- 5 files changed, 75 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 06ff825fde..0a0be24a54 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -243,24 +243,20 @@ class NCEOpGradVarTypeInference : public framework::VarTypeInference { void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front(); - auto bias_grad = op_desc.Output(framework::GradVarName("Bias")).front(); auto attr = op_desc.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { - VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad + VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to SelectedRows"; block->Var(weight_grad) ->SetType(framework::proto::VarType::SELECTED_ROWS); - block->Var(bias_grad)->SetType(framework::proto::VarType::SELECTED_ROWS); } else { - VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad + VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to LoDTensor"; block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR); - block->Var(bias_grad)->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType()); - block->Var(bias_grad)->SetDataType(block->Var("Input")->GetDataType()); } }; diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index afb14c3071..6567b6534a 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -297,18 +297,19 @@ class NCEGradKernel : public framework::OpKernel { sample_grad_data[i] *= d_out_data[sample_idx]; } + // get d_bias + auto d_bias = context.Output(framework::GradVarName("Bias")); + if (d_bias != nullptr) { + T *d_bias_data = d_bias->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + } + } + bool is_sparse = context.Attr("is_sparse"); if (!is_sparse) { - // get d_bias - auto d_bias = context.Output(framework::GradVarName("Bias")); - if (d_bias != nullptr) { - T *d_bias_data = d_bias->mutable_data(context.GetPlace()); - std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; - } - } // get d_w auto d_w = context.Output(framework::GradVarName("Weight")); if (d_w != nullptr) { @@ -330,34 +331,6 @@ class NCEGradKernel : public framework::OpKernel { std::set st(labels.begin(), labels.end()); labels.assign(st.begin(), st.end()); - auto *bias_var = context.InputVar("Bias"); - DDim bias_dim; - if (bias_var->IsType()) { - bias_dim = context.Input("Bias")->dims(); - } else if (bias_var->IsType()) { - auto *table_t = context.Input("Bias"); - bias_dim = table_t->value().dims(); - } else { - PADDLE_THROW( - "The parameter Bias of a NCE_OP " - "must be either LoDTensor or SelectedRows"); - } - - auto d_bias = - context.Output(framework::GradVarName("Bias")); - d_bias->set_rows(labels); - d_bias->set_height(bias_dim[0]); - - d_bias->mutable_value()->Resize( - {static_cast(labels.size()), bias_dim[1]}); - T *d_bias_data = - d_bias->mutable_value()->mutable_data(context.GetPlace()); - std::fill(d_bias_data, d_bias_data + labels.size(), 0.0); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_bias_data[d_bias->Index(sample_labels_data[i])] += - sample_grad_data[i]; - } - auto *table_var = context.InputVar("Weight"); DDim table_dim; if (table_var->IsType()) { diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 28b8ae895a..9401ffc2b1 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -24,7 +24,7 @@ from ..initializer import Normal, Constant from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ -from .tensor import concat +from .tensor import concat, assign from . import utils from .. import unique_name from functools import reduce @@ -4770,12 +4770,17 @@ def nce(input, else: num_neg_samples = int(num_neg_samples) + remote_prefetch = False + if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'): + remote_prefetch = True + attrs = { 'num_total_classes': int(num_total_classes), 'num_neg_samples': num_neg_samples, 'seed': seed, 'sampler': sampler, - 'is_sparse': is_sparse + 'is_sparse': is_sparse, + 'remote_prefetch': remote_prefetch } helper.append_op( diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 194387bc98..48bac52654 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -14,14 +14,15 @@ from __future__ import print_function +import traceback import math +import collections +import six import unittest +import numpy as np + import paddle.fluid as fluid -from paddle.fluid.transpiler.distribute_transpiler import delete_ops -import traceback -import collections -import six class TranspilerTest(unittest.TestCase): @@ -823,5 +824,55 @@ class TestRemoteLookupTable(TestDistLookupTableBase): self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) +# test for remote prefetch +class TestRemoteNce(TestDistLookupTableBase): + def network_with_table(self, is_sparse, is_distributed): + + num_total_classes = 20 + sampler = "uniform" + nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32') + + input = fluid.layers.data(name="input", shape=[10], dtype="float32") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + w_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 10], + dtype='float32', + name='nce_w', + initializer=fluid.initializer.ConstantInitializer()) + b_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 1], + dtype='float32', + name='nce_b', + initializer=fluid.initializer.ConstantInitializer()) + + cost = fluid.layers.nce(input=input, + label=label, + num_total_classes=num_total_classes, + sampler=sampler, + custom_dist=nid_freq_arr.tolist(), + sample_weight=None, + param_attr='nce_w', + bias_attr='nce_b', + seed=1, + num_neg_samples=5, + is_sparse=is_sparse) + avg_cost = fluid.layers.mean(cost) + # optimizer + optimizer = fluid.optimizer.Adam(learning_rate=0.003) + optimizer.minimize(avg_cost) + + def net_conf(self): + import os + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + for op in trainer.blocks[0].ops: + if op.type == "recv": + pass + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 817af602bd..9c526a0d8e 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -242,8 +242,7 @@ class DistributeTranspiler(object): sparse_update_op_types = ["lookup_table", "nce"] for op in main_program.global_block().ops: if op.type in sparse_update_op_types and op.attr( - 'remote_prefetch') is True and not op.attr( - 'is_distributed'): + 'remote_prefetch') is True: sparse_update_ops.append(op) return sparse_update_ops From 240d974ac5777b5e9d6c950bec91fe41e0aed093 Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Fri, 7 Dec 2018 14:14:46 +0800 Subject: [PATCH 166/719] Clean Code test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 1 - .../ir/conv3d_bias_mkldnn_fuse_pass.cc | 18 ------------ .../ir/conv3d_bias_mkldnn_fuse_pass.h | 29 ------------------- .../ir/conv_bias_mkldnn_fuse_pass.cc | 2 ++ .../framework/ir/conv_bias_mkldnn_fuse_pass.h | 7 +++++ .../framework/ir/graph_pattern_detector.cc | 24 +++++++-------- .../fluid/operators/activation_mkldnn_op.cc | 4 +++ 7 files changed, 24 insertions(+), 61 deletions(-) delete mode 100644 paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.cc delete mode 100644 paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0bbfe3c0e2..883575e41d 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -46,7 +46,6 @@ if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) pass_library(conv_bias_mkldnn_fuse_pass inference) - pass_library(conv3d_bias_mkldnn_fuse_pass inference) pass_library(conv_relu_mkldnn_fuse_pass inference) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference) endif() diff --git a/paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.cc deleted file mode 100644 index e2968ddf60..0000000000 --- a/paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.cc +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.h" - -REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass, - paddle::framework::ir::Conv3DBiasFusePass); diff --git a/paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.h deleted file mode 100644 index 5afe2cc61e..0000000000 --- a/paddle/fluid/framework/ir/conv3d_bias_mkldnn_fuse_pass.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h" - -namespace paddle { -namespace framework { -namespace ir { -/* -* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp. -*/ -class Conv3DBiasFusePass : public ConvBiasFusePass { - public: - bool is_conv3d() const override { return true; } -}; -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc index c3ad3a0f18..d4a701e0b1 100644 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc @@ -137,3 +137,5 @@ std::unique_ptr ConvBiasFusePass::ApplyImpl( } // namespace paddle REGISTER_PASS(conv_bias_mkldnn_fuse_pass, paddle::framework::ir::ConvBiasFusePass); +REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass, + paddle::framework::ir::Conv3DBiasFusePass); diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h index c3b58bf582..f3ad9f1c2b 100644 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h @@ -32,6 +32,13 @@ class ConvBiasFusePass : public FusePassBase { std::unique_ptr ApplyImpl(std::unique_ptr graph) const; const std::string name_scope_{"conv_bias_mkldnn_fuse"}; }; +/* +* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp. +*/ +class Conv3DBiasFusePass : public ConvBiasFusePass { + public: + bool is_conv3d() const override { return true; } +}; } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index ed99d9883b..0118019df2 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1031,25 +1031,23 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( PDNode *patterns::ConvBias::operator()( paddle::framework::ir::PDNode *conv_input, bool is_conv3d) { + std::string type = is_conv3d ? "conv3d" : "conv2d"; // Create Operators - conv_input->assert_is_op_input(is_conv3d ? "conv3d" : "conv2d", "Input"); - auto *conv_op = pattern->NewNode(conv_repr()) - ->assert_is_op(is_conv3d ? "conv3d" : "conv2d"); + conv_input->assert_is_op_input(type, "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(type); auto *eltiwse_op = pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); // Create variables // Filter - auto *conv_weight_var = - pattern->NewNode(conv_weight_repr()) - ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input(is_conv3d ? "conv3d" : "conv2d", "Filter"); + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input(type, "Filter"); // intermediate variable, will be removed in the IR after fuse. - auto *conv_out_var = - pattern->NewNode(conv_out_repr()) - ->AsIntermediate() - ->assert_is_only_output_of_op(is_conv3d ? "conv3d" : "conv2d") - ->assert_is_op_input("elementwise_add"); + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op(type) + ->assert_is_op_input("elementwise_add"); // Bias stored in elementwise_add auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr()) ->AsInput() diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc index 7fa81e185e..e16b6f78d1 100644 --- a/paddle/fluid/operators/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/activation_mkldnn_op.cc @@ -100,6 +100,10 @@ void eltwise_forward(const framework::ExecutionContext &ctx, const T *x_data = x->data(); T *y_data = y->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE( + x->dims().size() == 2 || x->dims().size() == 3 || x->dims().size() == 4, + "Input dim must be with 2, 3 or 4"); + std::vector src_tz = framework::vectorize2int(x->dims()); auto src_format = From 575ae7c6c3133df589cfe6c1a9d9e45e6bfc99c5 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Fri, 7 Dec 2018 14:30:10 +0800 Subject: [PATCH 167/719] refine pslib inferface & fix some bugs --- CMakeLists.txt | 4 +- paddle/fluid/framework/async_executor.cc | 29 +++++-- paddle/fluid/framework/async_executor.h | 6 +- .../fluid/framework/executor_thread_worker.cc | 31 ++++---- paddle/fluid/pybind/async_executor_py.cc | 4 +- python/paddle/fluid/async_executor.py | 11 ++- python/paddle/fluid/distributed/downpour.py | 2 +- python/paddle/fluid/distributed/ps_pb2.py | 78 +++++++++---------- 8 files changed, 100 insertions(+), 65 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6fd8dd1dfa..5b5bf6c5b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -217,7 +217,7 @@ include(cupti) include(external/gzstream) endif (NOT WIN32) include(external/libmct) -#include(external/pslib_brpc) +include(external/pslib_brpc) include(external/pslib) if(WITH_DISTRIBUTE) @@ -280,7 +280,7 @@ set(EXTERNAL_LIBS zlib ${PYTHON_LIBRARIES} pslib - #pslib_brpc + pslib_brpc libmct ) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 292b05c588..7685883dd5 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -65,18 +65,35 @@ void PrepareReaders(std::vector>& readers, // NOLINT readers[0]->SetFileList(filelist); } -void AsyncExecutor::ConfigPslib(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index) { +void AsyncExecutor::InitServer(const std::string& dist_desc, int index) { _pslib_ptr = std::shared_ptr(new paddle::distributed::PSlib()); - _pslib_ptr->init_and_config(dist_desc, host_sign_list, node_num, index);//TODO done + _pslib_ptr->init_server(dist_desc, index);//TODO done + + InitParamConfig(); } -void AsyncExecutor::StartServer() { +void AsyncExecutor::InitWorker(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index) { + _pslib_ptr = std::shared_ptr(new paddle::distributed::PSlib()); + _pslib_ptr->init_worker(dist_desc, host_sign_list.data(), node_num, index);//TODO done + InitParamConfig(); - _pslib_ptr->run_server(); +} + +uint64_t AsyncExecutor::StartServer() { + return _pslib_ptr->run_server(); +} + +void AsyncExecutor::GatherServers(std::vector& host_sign_list, int node_num) { + _pslib_ptr->gather_servers(host_sign_list.data(), node_num); } void AsyncExecutor::InitParamConfig() { - _param_config.fea_dim = _pslib_ptr->get_param()->trainer_param().sparse_table(0).feature_dim(); //TODO + for (int i = 0; i < _pslib_ptr->get_param()->server_param().downpour_server_param().downpour_table_param_size(); ++i) { + if (_pslib_ptr->get_param()->server_param().downpour_server_param().downpour_table_param(i).table_class().find("SparseTable") != -1) { + _param_config.fea_dim = _pslib_ptr->get_param()->server_param().downpour_server_param().downpour_table_param(i).accessor().fea_dim(); //TODO + break; + } + } _param_config.slot_dim = _param_config.fea_dim - 2; //TODO _param_config.tmp_push_dense_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().pull_dense_per_batch()); _param_config.tmp_push_sparse_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); @@ -176,6 +193,7 @@ void AsyncExecutor::PrepareDenseThread() { param.dense_params = &_param_config.dense_variable_name; _pull_dense_thread = std::shared_ptr(new DensePullThread(param)); + _pull_dense_thread->start(); } @@ -238,6 +256,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, fetch_var_names, root_scope_, thidx, debug); } + // start executing ops in multiple threads for (int thidx = 0; thidx < actual_thread_num; ++thidx) { threads.push_back( diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 21e4a66fce..90d6b46b2f 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -63,9 +63,11 @@ class AsyncExecutor { const std::vector& fetch_names, const bool debug = false); //void ConfigPslib(const char* dist_desc, uint64_t* host_sign_list, int node_num, int index); - void ConfigPslib(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index); + void InitServer(const std::string& dist_desc, int index); + void InitWorker(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index); //void ConfigWorker() {} - void StartServer(); + uint64_t StartServer(); + void GatherServers(std::vector& host_sign_list, int node_num); void InitModel(); void SaveModel(const std::string& path); void InitParamConfig(); diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index f7c05e400d..e0ee9c11c9 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -345,9 +345,12 @@ void AsyncExecutorThreadWorker::TrainOneNetwork() { if (op->Type().find("sgd") != std::string::npos) { continue; } + if (op->Type().find("lookup_table") != std::string::npos || + op->Type().find("lookup_table_grad") != std::string::npos) { + continue; + } op->Run(*thread_scope_, place_); } - UpdateParams(); } @@ -416,8 +419,8 @@ void AsyncExecutorThreadWorker::UpdateParams() { for (auto i: _param_config->dense_table_id) { PushDense(i); } - int32_t tmp_push_dense_wait_times = _param_config->tmp_push_dense_wait_times; //TODO - int32_t tmp_push_sparse_wait_times = _param_config->tmp_push_sparse_wait_times; //TODO + int32_t tmp_push_dense_wait_times = -1;//_param_config->tmp_push_dense_wait_times; //TODO + int32_t tmp_push_sparse_wait_times = -1;//_param_config->tmp_push_sparse_wait_times; //TODO static uint32_t push_dense_wait_times = static_cast(tmp_push_dense_wait_times); static uint32_t push_sparse_wait_times = static_cast(tmp_push_sparse_wait_times); @@ -430,7 +433,6 @@ void AsyncExecutorThreadWorker::UpdateParams() { if (tmp_push_dense_wait_times == -1) { _push_dense_status.resize(0); } - if (_push_sparse_status.size() >= push_sparse_wait_times) { for (auto& t : _push_sparse_status) { t.wait(); @@ -440,7 +442,6 @@ void AsyncExecutorThreadWorker::UpdateParams() { if (tmp_push_sparse_wait_times == -1) { _push_sparse_status.resize(0); } - //for (auto dense_table_id : GlobalConfig::instance().dense_table_id) {//TODO for (auto dense_table_id: _param_config->dense_table_id) { _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id); @@ -451,8 +452,8 @@ void AsyncExecutorThreadWorker::UpdateParams() { void AsyncExecutorThreadWorker::PushDense(int table_id) { std::vector regions; //auto& variables = GlobalConfig::instance().dense_gradient_variable_name[table_id]; - std::vector variables; - for (auto& t : variables) { + //std::vector variables; + for (auto& t : _param_config->dense_gradient_variable_name[table_id]) { Variable* var = thread_scope_->FindVar(t); CHECK(var != nullptr) << "var[" << t << "] not found"; LoDTensor* tensor = var->GetMutable(); @@ -469,7 +470,6 @@ void AsyncExecutorThreadWorker::PushDense(int table_id) { void AsyncExecutorThreadWorker::PullSparse(int table_id) { - auto& features = _features[table_id]; auto& feature_value = _feature_value[table_id]; auto fea_dim = _param_config->fea_dim; //TODO @@ -477,7 +477,6 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) { features.clear(); features.resize(0); features.reserve(MAX_FEASIGN_NUM); - const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); // slot_idx = 0 is label TODO for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { @@ -493,14 +492,14 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) { features.push_back(static_cast(ids[i])); } } - check_pull_push_memory(features, feature_value, fea_dim); std::vector pull_feature_value; for (auto i = 0u; i < features.size(); ++i) { pull_feature_value.push_back(feature_value[i].data()); } - + for (int i = 0; i < features.size(); ++i) { + } auto status = _pslib_ptr->_worker_ptr->pull_sparse( pull_feature_value.data(), table_id, features.data(), features.size()); _pull_sparse_status.push_back(std::move(status)); @@ -532,10 +531,15 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) { LoDTensor* tensor = var->GetMutable(); int64_t* ids = tensor->data(); int len = tensor->numel(); - Variable* var_emb = thread_scope_->FindVar(_param_config->slot_input_vec[table_id][slot_idx - 1]); LoDTensor* tensor_emb = var_emb->GetMutable(); - float* ptr = tensor_emb->data(); + float* ptr = tensor_emb->mutable_data({len, slot_dim}, platform::CPUPlace()); + memset(ptr, 0, sizeof(float) * len * slot_dim); + auto& tensor_lod = tensor->lod()[0]; + + LoD data_lod{tensor_lod}; + tensor_emb->set_lod(data_lod); + //float* ptr = tensor_emb->data(); for (auto index = 0u; index < len; ++index){ //if (_current_train_job.use_cvm_feature()) { @@ -576,7 +580,6 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { //} const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - // slot_idx = 0 is label TODO for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc index 63fd06224f..eca46fbad5 100644 --- a/paddle/fluid/pybind/async_executor_py.cc +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -48,8 +48,10 @@ void BindAsyncExecutor(py::module* m) { new framework::AsyncExecutor(scope, place)); })) .def("run_from_files", &framework::AsyncExecutor::RunFromFile) - .def("config_pslib", &framework::AsyncExecutor::ConfigPslib) + .def("init_server", &framework::AsyncExecutor::InitServer) + .def("init_worker", &framework::AsyncExecutor::InitWorker) .def("start_server", &framework::AsyncExecutor::StartServer) + .def("gather_servers", &framework::AsyncExecutor::GatherServers) .def("init_model", &framework::AsyncExecutor::InitModel) .def("save_model", &framework::AsyncExecutor::SaveModel); } // end BindAsyncExecutor diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index c5863eb9e0..f667ff2424 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -158,8 +158,17 @@ class AsyncExecutor(object): return + def init_server(self, filename, index): + self.executor.init_server(filename, index) + + def init_worker(self, filename, ips, nodes_cnt, index): + self.executor.init_worker(filename, ips, nodes_cnt, index) + def start_server(self): - self.executor.start_server() + return self.executor.start_server() + + def gather_servers(self, ips, nodes_cnt): + self.executor.gather_servers(ips, nodes_cnt) def init_model(self): self.executor.init_model() diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 093792d5d6..3d940b62b0 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -56,7 +56,7 @@ class DownpourSGD(object): params_grads[0], params_grads[1]) ps_param = pslib.PSParameter() ps_param.server_param.CopyFrom(server.get_desc()) - ps_param.worker_param.CopyFrom(worker.get_desc()) + ps_param.trainer_param.CopyFrom(worker.get_desc()) # Todo(guru4elephant): figure out how to support more sparse parameters # currently only support lookup_table worker_skipped_ops = ["lookup_table", "lookup_table_grad"] diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py index 0ef34d6e18..f33ec50f7d 100644 --- a/python/paddle/fluid/distributed/ps_pb2.py +++ b/python/paddle/fluid/distributed/ps_pb2.py @@ -20,7 +20,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( name='ps.proto', package='paddle', syntax='proto2', - serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xbc\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1c\n\x14pull_dense_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\x91\x01\n\x16ServerServiceParameter\x12\x14\n\x0cserver_class\x18\x01 \x01(\t\x12\x14\n\x0c\x63lient_class\x18\x02 \x01(\t\x12\x15\n\rservice_class\x18\x03 \x01(\t\x12\x19\n\x11start_server_port\x18\x04 \x01(\r\x12\x19\n\x11server_thread_num\x18\x05 \x01(\r\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') + serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xbc\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1c\n\x14pull_dense_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd1\x01\n\x16ServerServiceParameter\x12(\n\x0cserver_class\x18\x01 \x01(\t:\x12\x41\x62\x61\x63usBrpcPsServer\x12(\n\x0c\x63lient_class\x18\x02 \x01(\t:\x12\x41\x62\x61\x63usBrpcPsClient\x12&\n\rservice_class\x18\x03 \x01(\t:\x0f\x41\x62\x61\x63usPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') ) _sym_db.RegisterFileDescriptor(DESCRIPTOR) @@ -41,8 +41,8 @@ _TABLETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3198, - serialized_end=3250, + serialized_start=3262, + serialized_end=3314, ) _sym_db.RegisterEnumDescriptor(_TABLETYPE) @@ -108,8 +108,8 @@ _PSCMDID = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3253, - serialized_end=3570, + serialized_start=3317, + serialized_end=3634, ) _sym_db.RegisterEnumDescriptor(_PSCMDID) @@ -148,8 +148,8 @@ _FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3166, - serialized_end=3196, + serialized_start=3230, + serialized_end=3260, ) _sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) @@ -531,35 +531,35 @@ _SERVERSERVICEPARAMETER = _descriptor.Descriptor( _descriptor.FieldDescriptor( name='server_class', full_name='paddle.ServerServiceParameter.server_class', index=0, number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=True, default_value=_b("AbacusBrpcPsServer").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='client_class', full_name='paddle.ServerServiceParameter.client_class', index=1, number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=True, default_value=_b("AbacusBrpcPsClient").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='service_class', full_name='paddle.ServerServiceParameter.service_class', index=2, number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), + has_default_value=True, default_value=_b("AbacusPsService").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='start_server_port', full_name='paddle.ServerServiceParameter.start_server_port', index=3, number=4, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, + has_default_value=True, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='server_thread_num', full_name='paddle.ServerServiceParameter.server_thread_num', index=4, number=5, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, + has_default_value=True, default_value=12, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), @@ -576,7 +576,7 @@ _SERVERSERVICEPARAMETER = _descriptor.Descriptor( oneofs=[ ], serialized_start=1134, - serialized_end=1279, + serialized_end=1343, ) @@ -641,8 +641,8 @@ _TABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1282, - serialized_end=1473, + serialized_start=1346, + serialized_end=1537, ) @@ -721,8 +721,8 @@ _TABLEACCESSORPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1476, - serialized_end=1845, + serialized_start=1540, + serialized_end=1909, ) @@ -794,8 +794,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1848, - serialized_end=2054, + serialized_start=1912, + serialized_end=2118, ) @@ -839,8 +839,8 @@ _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2056, - serialized_end=2139, + serialized_start=2120, + serialized_end=2203, ) @@ -898,8 +898,8 @@ _PSREQUESTMESSAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2141, - serialized_end=2242, + serialized_start=2205, + serialized_end=2306, ) @@ -950,8 +950,8 @@ _SPARSESGDRULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2244, - serialized_end=2363, + serialized_start=2308, + serialized_end=2427, ) @@ -1009,8 +1009,8 @@ _DENSESGDRULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2366, - serialized_end=2591, + serialized_start=2430, + serialized_end=2655, ) @@ -1068,8 +1068,8 @@ _ADAMSGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2594, - serialized_end=2728, + serialized_start=2658, + serialized_end=2792, ) @@ -1106,8 +1106,8 @@ _NAIVESGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2730, - serialized_end=2796, + serialized_start=2794, + serialized_end=2860, ) @@ -1137,8 +1137,8 @@ _SUMMARYSGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2798, - serialized_end=2857, + serialized_start=2862, + serialized_end=2921, ) @@ -1168,8 +1168,8 @@ _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2859, - serialized_end=2905, + serialized_start=2923, + serialized_end=2969, ) @@ -1213,8 +1213,8 @@ _PSRESPONSEMESSAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2907, - serialized_end=2980, + serialized_start=2971, + serialized_end=3044, ) @@ -1287,8 +1287,8 @@ _FSCLIENTPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2983, - serialized_end=3196, + serialized_start=3047, + serialized_end=3260, ) _PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER From cb8a24be14f04c23fbc206d8c8537ff365b4e6bc Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 7 Dec 2018 16:11:16 +0800 Subject: [PATCH 168/719] clean code --- .../framework/details/all_reduce_op_handle.cc | 12 +------ .../details/computation_op_handle.cc | 12 ++----- .../fluid/framework/details/op_handle_base.cc | 2 -- .../details/parallel_ssa_graph_executor.cc | 13 +++---- .../details/parallel_ssa_graph_executor.h | 4 +-- paddle/fluid/framework/parallel_executor.cc | 35 ++++--------------- paddle/fluid/platform/nccl_helper.h | 2 +- 7 files changed, 20 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index ae17ea8a15..ae20338746 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -46,9 +46,6 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - int64_t start_ts = GetTS(); - int64_t func_ts = GetTS(); - VLOG(5) << "all_reduce_op_handle::RunImpl start"; platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, @@ -62,11 +59,7 @@ void AllReduceOpHandle::RunImpl() { return; // No need to all reduce when GPU count = 1; } else { // Wait input done - start_ts = GetTS(); WaitInputVarGenerated(); - VLOG(5) << "all_reduce_op_handle wait input var spent: " - << GetTS() - start_ts << " (ns)."; - start_ts = GetTS(); auto in_var_handles = DynamicCast(this->Inputs()); auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE_EQ( @@ -107,8 +100,6 @@ void AllReduceOpHandle::RunImpl() { } int dev_id = boost::get(p).device; - VLOG(5) << "call allreduce: " << in_var_handles[i]->name_ - << " on dev: " << dev_id; auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; @@ -118,6 +109,7 @@ void AllReduceOpHandle::RunImpl() { ncclSum, comm, stream)); }); } + this->RunAndRecordEvent([&] { // TODO(Yancey1989): need allreduce operator to avoid this flag if (nccl_ctxs_->need_group_call_) { @@ -162,8 +154,6 @@ void AllReduceOpHandle::RunImpl() { } } } - VLOG(5) << "all_reduce_op_handle Impl spent: " << GetTS() - func_ts - << " (ns)."; } std::string AllReduceOpHandle::Name() const { return "all_reduce"; } diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 35ba99a879..7ad1e40c60 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -33,18 +33,10 @@ void ComputationOpHandle::RunImpl() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; - if (Name().compare("conv2d") || Name().compare("conv2d_grad")) { - int64_t start_ts = GetTS(); - auto varname = DynamicCast(this->Outputs())[0]->name_; + if (is_lock_and_record_event_free_) { run_func(); - VLOG(5) << Name() << "_op_handle: " << varname - << " spent: " << GetTS() - start_ts << " (ns)."; } else { - if (is_lock_and_record_event_free_) { - run_func(); - } else { - this->RunAndRecordEvent(run_func); - } + this->RunAndRecordEvent(run_func); } } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index d68d1ce71d..4914e0a5ad 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -41,7 +41,6 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA - int64_t start_ts = 0; if (events_.empty() && use_cuda) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; @@ -125,7 +124,6 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) { void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_CUDA if (!events_.empty()) { // Use event - VLOG(5) << "events not empty"; std::function method = callback; for (auto &p : dev_ctxes_) { method = [method, p, this]() { diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 72beb74aa4..dfb40721d8 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -21,19 +21,20 @@ namespace details { ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::vector> graphs) + std::vector> &&graphs) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), - graphs_(std::move(graphs)), - pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr) { + graphs_(std::move(graphs)) { PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + // do not use threadpool for each graph execution. + strategy_.num_threads_ = 1UL; for (size_t i = 0; i < places.size(); ++i) { - std::vector scopes = {local_scopes_[i]}; - std::vector places = {places_[i]}; executors_.emplace_back(new details::ThreadedSSAGraphExecutor( - strategy_, scopes, places, std::move(graphs_[i]))); + strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); } + VLOG(1) << "pool size: " << places_.size(); } FeedFetchList ParallelSSAGraphExecutor::Run( diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index c0ba1577f7..37784775f0 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -30,7 +30,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::vector> graphs); + std::vector> &&graphs); ~ParallelSSAGraphExecutor() final = default; const ir::Graph &Graph() const override { return *graphs_[0]; } @@ -39,9 +39,9 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { private: ExecutionStrategy strategy_; std::vector local_scopes_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; std::vector places_; std::vector> graphs_; - std::unique_ptr<::ThreadPool> pool_; std::vector> executors_; }; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ff3d76fb01..186f0cb803 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -54,7 +54,6 @@ class ParallelExecutorPrivate { std::vector local_scopes_; Scope *global_scope_; // not owned std::unique_ptr executor_; - std::vector> executors_; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; @@ -142,6 +141,7 @@ ParallelExecutor::ParallelExecutor( std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + VLOG(1) << "kParallelGraph mode!!"; for (size_t i = 0; i < member_->places_.size(); ++i) { std::unique_ptr graph = build_strategy.Apply( main_program, {member_->places_[i]}, loss_var_name, params, @@ -222,38 +222,17 @@ ParallelExecutor::ParallelExecutor( } if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - /** - for (size_t i = 0; i < member_->places_.size(); ++i) { - std::vector var_infos; - for (auto &node : graphs[i]->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - } - } - - std::vector places = {member_->places_[i]}; - std::vector scopes = {member_->local_scopes_[i]}; - std::unique_ptr p(new - details::ThreadedSSAGraphExecutor( - exec_strategy, scopes, places, std::move(graphs[i]))); - - member_->executors_.push_back(std::move(p)); - - member_->executors_[i].reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, scopes, std::move(var_infos), places, - std::move(member_->executors_[i]))); - }**/ member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graphs[0]))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { member_->executor_.reset(new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, graphs)); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs))); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graphs[0]))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 53de53f43d..23a0222239 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -105,7 +105,7 @@ struct NCCLContextMap { } std::unique_ptr comms(new ncclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. - if (num_trainers == 1 && nccl_id != nullptr) { + if (num_trainers == 1 && nccl_id == nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); From 97de98cd0a1240eacb573e8d117d0e4b928d82b0 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 7 Dec 2018 16:16:18 +0800 Subject: [PATCH 169/719] update bpr_loss op code, test=develop --- paddle/fluid/API.spec | 15 ++- paddle/fluid/operators/bpr_loss_op.cc | 35 ++++--- paddle/fluid/operators/bpr_loss_op.h | 92 +++++++------------ python/paddle/fluid/layers/nn.py | 2 +- .../fluid/tests/unittests/test_bpr_loss_op.py | 2 +- 5 files changed, 65 insertions(+), 81 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e273a852a9..9a90ad4e93 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -32,6 +32,13 @@ paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.c paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) @@ -70,7 +77,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False)) +paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) @@ -176,7 +183,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -188,6 +195,9 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -292,6 +302,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index 3e6445dbc2..41f2969e6c 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -23,19 +23,18 @@ class BprLossOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); - PADDLE_ENFORCE(ctx->HasInput("Label_Pos"), - "Input(Label_Pos) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("LabelPos"), + "Input(LabelPos) should be not null."); PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto label_Pos_dims = ctx->GetInputDim("Label_Pos"); + auto label_Pos_dims = ctx->GetInputDim("LabelPos"); int rank = x_dims.size(); - PADDLE_ENFORCE_EQ( - rank, label_Pos_dims.size(), - "Input(X) and Input(Label_Pos) shall have the same rank."); + PADDLE_ENFORCE_EQ(rank, label_Pos_dims.size(), + "Input(X) and Input(LabelPos) shall have the same rank."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), framework::slice_ddim(label_Pos_dims, 0, rank - 1), - "Input(X) and Input(Label_Pos) shall have the same shape " + "Input(X) and Input(LabelPos) shall have the same shape " "except the last dimension."); auto y_dims = x_dims; @@ -61,25 +60,25 @@ class BprLossGradientOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); - PADDLE_ENFORCE(ctx->HasInput("Label_Pos"), - "Input(Label_Pos) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("LabelPos"), + "Input(LabelPos) should be not null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "Input(Y@GRAD) shoudl be not null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Output(X@GRAD) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto label_pos_dims = ctx->GetInputDim("Label_Pos"); + auto label_pos_dims = ctx->GetInputDim("LabelPos"); auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); int rank = x_dims.size(); PADDLE_ENFORCE_EQ(dy_dims.size(), rank, "Input(Y@Grad) and Input(X) should have the same rank."); PADDLE_ENFORCE_EQ( label_pos_dims.size(), rank, - "Input(Label_Pos) and Input(X) should have the same rank."); + "Input(LabelPos) and Input(X) should have the same rank."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), framework::slice_ddim(label_pos_dims, 0, rank - 1), - "The Input(X) and Input(Label_Pos) should have the same " + "The Input(X) and Input(LabelPos) should have the same " "shape except the last dimension."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), framework::slice_ddim(dy_dims, 0, rank - 1), @@ -88,7 +87,7 @@ class BprLossGradientOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, "The last dimension of Input(Y@Grad) should be 1."); PADDLE_ENFORCE_EQ(label_pos_dims[rank - 1], 1, - " the last dimension of Input(Label_Pos) should be 1."); + " the last dimension of Input(LabelPos) should be 1."); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->ShareLoD("X", framework::GradVarName("X")); } @@ -112,7 +111,7 @@ class BprLossOpMaker : public framework::OpProtoAndCheckerMaker { "size is equal to the number of classes. This input is a " "real number."); AddInput( - "Label_Pos", + "LabelPos", "(Tensor), the tensor which represents the ground truth. It has the " "same shape with 'X' except the last dimension. the last dimension " "size is 1."); @@ -121,14 +120,14 @@ class BprLossOpMaker : public framework::OpProtoAndCheckerMaker { "with 'X' except that the last dimension size is 1. It " "represents the sequence bpr loss."); AddComment(R"DOC( -BprLoss Operator. +Bayesian Personalized Ranking Loss Operator. -This operator belongs to pairwise ranking loss. Label_pos is the desired item. -The loss at a given point in one seesion is defined as: +This operator belongs to pairwise ranking loss. LabelPos is the desired item. +The loss at a given point in one session is defined as: $Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$ Learn more details by reading paper . +neural networks>(https://arxiv.org/abs/1511.06939) )DOC"); } diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index 4103686de7..ea817bb239 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -39,22 +39,22 @@ class BprLossOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); - auto* labels_Pos = ctx.Input("Label_Pos"); + auto* label_pos = ctx.Input("LabelPos"); auto* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); int rank = x->dims().size(); Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); - Tensor labels_Pos_2d = framework::ReshapeToMatrix(*labels_Pos, rank - 1); + Tensor labels_Pos_2d = framework::ReshapeToMatrix(*label_pos, rank - 1); Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1); - const framework::Tensor* prob = &x_2d; + const framework::Tensor* logits = &x_2d; const framework::Tensor* labels_pos = &labels_Pos_2d; framework::Tensor* out = &y_2d; - const int step_size = prob->dims()[0]; - const int class_num = prob->dims()[1]; - const T* prob_data = prob->data(); + const int step_size = logits->dims()[0]; + const int class_num = logits->dims()[1]; + const T* logits_data = logits->data(); T* loss_data = out->data(); const int64_t* label_pos_data = labels_pos->data(); @@ -68,73 +68,47 @@ class BprLossOpKernel : public framework::OpKernel { if (j == lbl_pos) continue; int index_neg = i * class_num + j; sum += TolerableValue()(-std::log( - 1.0f + TolerableValue()( - std::exp(prob_data[index_neg] - prob_data[index_pos])))); + 1.0f + TolerableValue()(std::exp(logits_data[index_neg] - + logits_data[index_pos])))); } loss_data[i] = -sum / (class_num - 1); } } }; -template -class XeGradFunctor { - public: - XeGradFunctor(T* dx, - const T* dy, // NOLINT - const T* x, // NOLINT - const int64_t* label_pos, // NOLINT - size_t num_classes) - : dx_(dx), - dy_(dy), - x_(x), - label_pos_(label_pos), - num_classes_(num_classes) {} - - HOSTDEVICE void operator()(size_t sample_id) { - for (size_t x_offset = sample_id * num_classes_; - x_offset < (sample_id + 1) * num_classes_; ++x_offset) { - dx_[x_offset] = static_cast(0); - } - auto p_index = sample_id * num_classes_ + label_pos_[sample_id]; - for (size_t ni = 0; ni < num_classes_; ni++) { - if (label_pos_[sample_id] == ni) continue; - auto n_index = sample_id * num_classes_ + ni; - auto grad_ = - -dy_[sample_id] / - ((num_classes_ - 1) * - (1.0f + TolerableValue()(std::exp(x_[p_index] - x_[n_index])))); - dx_[p_index] += grad_; - dx_[n_index] -= grad_; - } - } - - private: - T* dx_; - const T* dy_; - const T* x_; - const int64_t* label_pos_; - size_t num_classes_; -}; - template class BprLossGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* dy = ctx.Input(framework::GradVarName("Y")); - auto* label_pos = ctx.Input("Label_Pos"); + auto* label_pos = ctx.Input("LabelPos"); auto* dx = ctx.Output(framework::GradVarName("X")); - T* dx_data = dx->mutable_data(ctx.GetPlace()); - int rank = x->dims().size(); - int64_t class_num = x->dims()[rank - 1]; - XeGradFunctor functor(dx_data, dy->data(), x->data(), - label_pos->data(), - static_cast(class_num)); - platform::ForRange for_range( - ctx.template device_context(), - static_cast(dy->numel())); - for_range(functor); + const int step_size = x->dims()[0]; + const int num_classes_ = x->dims()[1]; + T* dx_ = dx->mutable_data(ctx.GetPlace()); + const T* dy_ = dy->data(); + const T* x_ = x->data(); + const int64_t* label_pos_ = label_pos->data(); + + for (size_t sample_id = 0; sample_id < step_size; sample_id++) { + for (size_t x_offset = sample_id * num_classes_; + x_offset < (sample_id + 1) * num_classes_; x_offset++) { + dx_[x_offset] = static_cast(0); + } + auto p_index = sample_id * num_classes_ + label_pos_[sample_id]; + for (size_t ni = 0; ni < num_classes_; ni++) { + if (label_pos_[sample_id] == ni) continue; + auto n_index = sample_id * num_classes_ + ni; + auto grad_ = + -dy_[sample_id] / + ((num_classes_ - 1) * + (1.0f + TolerableValue()(std::exp(x_[p_index] - x_[n_index])))); + dx_[p_index] += grad_; + dx_[n_index] -= grad_; + } + } } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 06d7e429ae..3ba1883999 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1356,7 +1356,7 @@ def bpr_loss(input, label_pos): helper.append_op( type='bpr_loss', inputs={'X': [input], - 'Label_Pos': [label_pos]}, + 'LabelPos': [label_pos]}, outputs={'Y': [out]}) return out diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py index 2af6461aed..d137f4a6fb 100644 --- a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py @@ -39,7 +39,7 @@ class TestBprLossOp1(OpTest): sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label_pos[i][0]]))) bpr_loss_result.append(-sum / (class_num - 1)) bpr_loss = np.asmatrix([[x] for x in bpr_loss_result], dtype="float64") - self.inputs = {"X": X, "Label_Pos": label_pos} + self.inputs = {"X": X, "LabelPos": label_pos} self.outputs = {"Y": bpr_loss} def test_check_output(self): From a3381dc740e0e52139cfb9fc36b467cbf108ed64 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 7 Dec 2018 16:29:52 +0800 Subject: [PATCH 170/719] update question for api ,test=develop --- python/paddle/fluid/tests/unittests/test_bpr_loss_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py index d137f4a6fb..80916f4a82 100644 --- a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py @@ -25,7 +25,7 @@ class TestBprLossOp1(OpTest): def setUp(self): self.op_type = "bpr_loss" - batch_size = 4 + batch_size = 40 class_num = 5 X = randomize_probability(batch_size, class_num, dtype='float64') label_pos = np.random.randint( From ab084b6e6caf02439cf2e3147876da76ab1bb604 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 7 Dec 2018 16:36:10 +0800 Subject: [PATCH 171/719] Polish code test=develop --- python/paddle/dataset/image.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index 645d54a260..57547f1867 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -34,14 +34,15 @@ from __future__ import print_function import six import numpy as np -# NOTE(minqiyang): this is an ugly fix for the numpy bug reported here +# FIXME(minqiyang): this is an ugly fix for the numpy bug reported here # https://github.com/numpy/numpy/issues/12497 if six.PY3: import subprocess import sys - import_cv2_proc = subprocess.Popen([sys.executable, "-c", "import cv2"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + import_cv2_proc = subprocess.Popen( + [sys.executable, "-c", "import cv2"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) out, err = import_cv2_proc.communicate() retcode = import_cv2_proc.poll() if retcode != 0: From 220db4f334a06bda1b9967740d9fd96806fc461b Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 7 Dec 2018 16:38:08 +0800 Subject: [PATCH 172/719] clean code --- .../fluid/framework/details/build_strategy.cc | 1 - .../details/multi_devices_graph_pass.cc | 3 -- paddle/fluid/framework/parallel_executor.h | 2 - paddle/fluid/framework/scope.cc | 5 +-- paddle/fluid/framework/threadpool.cc | 15 ++----- paddle/fluid/framework/threadpool.h | 2 +- paddle/fluid/framework/threadpool_test.cc | 44 ------------------- .../fluid/operators/reader/blocking_queue.h | 3 -- .../fluid/operators/reader/buffered_reader.cc | 3 -- .../reader/create_double_buffer_reader_op.cc | 13 +----- 10 files changed, 7 insertions(+), 84 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 04c1061536..1e1b945f63 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -118,7 +118,6 @@ std::unique_ptr BuildStrategy::Apply( std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { - VLOG(5) << "run pass: " << pass->Type(); if (pass->Type() == "multi_devices_pass") { pass->Erase("places"); pass->SetNotOwned>("places", &places); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 1bd238357a..c16e3006d7 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -329,7 +329,6 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unordered_map sharded_var_device; for (ir::Node *node : sorted_ops) { - VLOG(5) << "op name: " << node->Op()->Type(); if (boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { @@ -366,11 +365,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( // is true only for the op that scale the final scalar loss. // It also assumes backward op will always follow the forward op in // the block. - VLOG(5) << "this is loss scale op!"; is_forwarding = false; } else { int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); - VLOG(5) << "on device id: " << op_dev_id; if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 319701f1eb..ef09b98b2a 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -20,8 +20,6 @@ limitations under the License. */ #include #include -#include "ThreadPool.h" - #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/executor.h" diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 873f68e42e..0d261dd7cc 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -58,10 +58,7 @@ int64_t GetEagerDeletionThreshold() { (static_cast(1) << 30)); } -Scope::~Scope() { - VLOG(5) << "~Scope()"; - DropKids(); -} +Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { SCOPE_LOCK_GUARD diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 7dc7430c55..d34f826c1a 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -48,18 +48,9 @@ void ThreadPool::Init() { ThreadPool::ThreadPool(int num_threads) : running_(true) { threads_.resize(num_threads); - for (int i = 0; i < num_threads; ++i) { - // for (auto& thread : threads_) { + for (auto& thread : threads_) { // TODO(Yancey1989): binding the thread on the specify CPU number - threads_[i].reset( - new std::thread(std::bind(&ThreadPool::TaskLoop, this, i))); - /** - sched_param sch; - int policy; - pthread_getschedparam(threads_[i]->native_handle(), &policy, &sch); - if (pthread_setschedparam(threads_[i]->native_handle(), SCHED_FIFO, &sch)) { - VLOG(1) << "Failed to setschedparam: " << errno; - }**/ + thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this))); } } @@ -77,7 +68,7 @@ ThreadPool::~ThreadPool() { } } -void ThreadPool::TaskLoop(int i) { +void ThreadPool::TaskLoop() { while (true) { Task task; diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index bd8c3cdee8..5177b7ee02 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -99,7 +99,7 @@ class ThreadPool { // The constructor starts threads to run TaskLoop, which retrieves // and runs tasks from the queue. - void TaskLoop(int i); + void TaskLoop(); // Init is called by GetInstance. static void Init(); diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc index 1257a76e3e..884d61e234 100644 --- a/paddle/fluid/framework/threadpool_test.cc +++ b/paddle/fluid/framework/threadpool_test.cc @@ -59,47 +59,3 @@ TEST(ThreadPool, ConcurrentRun) { } EXPECT_EQ(sum, ((n + 1) * n) / 2); } -static int64_t GetTS() { - struct timeval tp; - gettimeofday(&tp, NULL); - return tp.tv_sec * 1000000 + tp.tv_usec; -} - -void multi_call(std::function call) { - for (int i = 0; i < 500; ++i) { - call(); - } -} - -TEST(ThreadPool, PERFORMANCE) { - auto sum = [] { - int a = 0; - for (int i = 0; i < 1000; ++i) { - a += i; - } - }; - // framework::ThreadPool *pool = new framework::ThreadPool(2); - int64_t start = GetTS(); - for (int i = 0; i < 1000; ++i) { - // int64_t s = GetTS(); - framework::Async(std::move(sum)); - // pool->Run(std::move(sum)); - // VLOG(5) << "push to pool spent : " << GetTS() - s << " (us)."; - } - VLOG(5) << "pool spent: " << GetTS() - start << " (us)."; - start = GetTS(); - for (int i = 0; i < 1000; ++i) { - sum(); - } - VLOG(5) << "sequence call spent: " << GetTS() - start << " (us)."; - std::vector threads; - start = GetTS(); - for (int i = 0; i < 2; ++i) { - std::thread t(multi_call, std::ref(sum)); - threads.push_back(std::move(t)); - } - for (auto& thread : threads) { - thread.join(); - } - VLOG(5) << "two threads spent: " << GetTS() - start << " (us)."; -} diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 10de11bfa5..51b980acb5 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -67,12 +67,9 @@ class BlockingQueue { } bool Receive(T* elem) { - VLOG(1) << "blocking queue::Receive ..."; std::unique_lock lock(mutex_); receive_cv_.wait(lock, [&] { return !queue_.empty() || closed_; }); - VLOG(1) << "queue_.empty()=" << queue_.empty(); if (!queue_.empty()) { - if (elem == nullptr) VLOG(1) << "elem is nullptr"; PADDLE_ENFORCE_NOT_NULL(elem); *elem = queue_.front(); if (LIKELY(!speed_test_mode_)) { diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 2d66000f1f..cfa192f8e1 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -82,13 +82,11 @@ void BufferedReader::StartImpl() { } void BufferedReader::ReadNextImpl(std::vector *out) { - VLOG(1) << "ReadNextImpl start on place: " << place_; if (position_.empty()) { out->clear(); return; } size_t i = position_.front().get(); - VLOG(1) << "position front: " << i; position_.pop(); if (i == -1UL) { @@ -105,7 +103,6 @@ void BufferedReader::ReadNextImpl(std::vector *out) { ReadAsync(prev_pos_); } prev_pos_ = i; - VLOG(1) << "success ReadNextImpl"; } } // namespace reader diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 924c92e0bf..954fec0fbc 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -25,15 +25,9 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - VLOG(1) << "find var in scope: " << &scope; - auto* out_var = scope.FindVar(Output("Out")); - VLOG(1) << "var " << Output("Out") << " -> " << out_var; - auto* out = out_var->GetMutable(); - - // auto* out = scope.Var(Output("Out")) - // ->template GetMutable(); + auto* out = scope.Var(Output("Out")) + ->template GetMutable(); if (out->Get() != nullptr) { - VLOG(1) << Output("Out") << " is not nullptr."; return; } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) @@ -52,11 +46,8 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { sin >> num; place = platform::CUDAPlace(static_cast(num)); } - VLOG(1) << "create buffered reader on " << place; out->Reset(framework::MakeDecoratedReader(underlying_reader, place, 2)); - VLOG(1) << "Reset Buffered Reader in var: " - << scope.FindVar(Input("UnderlyingReader")); } }; From 73edf1376758b753ca7226cc22c442ef2f6c575d Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 7 Dec 2018 16:41:53 +0800 Subject: [PATCH 173/719] update --- paddle/fluid/operators/reader/create_double_buffer_reader_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 954fec0fbc..440b16cf91 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -25,7 +25,7 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - auto* out = scope.Var(Output("Out")) + auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); if (out->Get() != nullptr) { return; From 47740ace289721e61489f6b2b5c196f26250aa3f Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 7 Dec 2018 17:18:45 +0800 Subject: [PATCH 174/719] fix performance --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index ae20338746..6b7bbf9003 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -107,6 +107,7 @@ void AllReduceOpHandle::RunImpl() { PADDLE_ENFORCE(platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, comm, stream)); + if (!nccl_ctxs_->need_group_call_) cudaStreamSynchronize(stream); }); } From 9fa87b23d950b6bb1abe9ad6ca39ae6800293338 Mon Sep 17 00:00:00 2001 From: chengduo Date: Fri, 7 Dec 2018 17:22:26 +0800 Subject: [PATCH 175/719] Fix io doc (#14751) * fix io doc test=develop * refine data_feeder doc test=develop --- python/paddle/fluid/data_feeder.py | 11 +++++++---- python/paddle/fluid/io.py | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 5102a558fd..13d2893fd1 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -258,10 +258,13 @@ class DataFeeder(object): multiple mini-batches. Each mini-batch will be feed on each device. Args: - reader(fun): the input data. - multi_devices(bool): the number of places. Default None. - num_places(int): the number of places. Default None. - drop_last(bool): the number of places. Default None. + reader(function): the reader is the function which can generate data. + multi_devices(bool): whether to use multiple devices or not. + num_places(int): if the multi_devices is True, you can specify the number + of GPU to use, if 'num_places' is None, the function will use all the + GPU of the current machine. Default None. + drop_last(bool): whether to drop the last batch if the + size of the last batch is less than batch_size. Default True. Returns: dict: the result of conversion. diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 0782933c6c..e74a87fc68 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -145,7 +145,7 @@ def save_vars(executor, prog = fluid.default_main_program() fluid.io.save_vars(executor=exe, dirname=path, main_program=prog, - vars=None) + vars=None, predicate = name_has_fc) # All variables in `main_program` whose name includes "fc" will be saved. # And variables are going to be saved separately. @@ -369,7 +369,7 @@ def load_vars(executor, prog = fluid.default_main_program() fluid.io.load_vars(executor=exe, dirname=path, main_program=prog, - vars=None) + vars=None, predicate=name_has_fc) # All variables in `main_program` whose name includes "fc" will be loaded. # And all the variables are supposed to have been saved in differnet files. From c80c693a0f41c6f05f4945225c472d9e743424f0 Mon Sep 17 00:00:00 2001 From: chengduo Date: Fri, 7 Dec 2018 17:22:38 +0800 Subject: [PATCH 176/719] Enable test_gradient_clip in mac (#14765) * enable test_gradient_clip in mac test=develop * refine test_gradient_clip test=develop --- .../paddle/fluid/tests/unittests/CMakeLists.txt | 3 +-- .../fluid/tests/unittests/test_gradient_clip.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 61cfdb80af..a4089ba3ca 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -43,14 +43,13 @@ if(APPLE) list(REMOVE_ITEM TEST_OPS test_desc_clone) list(REMOVE_ITEM TEST_OPS test_program_code) endif(NOT WITH_DISTRIBUTE) - message(WARNING "These tests has been disabled in OSX before being fixed: \n test_gradient_clip \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext") + message(WARNING "These tests has been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext") # this op is not support on mac list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) # TODO: add the unitest back when it fixed list(REMOVE_ITEM TEST_OPS test_detection_map_op) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass) - list(REMOVE_ITEM TEST_OPS test_gradient_clip) endif() if(NOT WITH_MKLML) # this op is not support on openblas diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index e4b3168ba6..e49239da6d 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -20,9 +20,6 @@ import paddle import paddle.fluid.core as core import paddle.fluid as fluid -BATCH_SIZE = 128 -CLIP = 1 - def bow_net(data, label, @@ -64,6 +61,8 @@ class TestGradientClip(unittest.TestCase): return places def check_operators(self, place): + CLIP = 1 + prog = fluid.framework.Program() startup_program = fluid.framework.Program() with fluid.program_guard( @@ -79,13 +78,13 @@ class TestGradientClip(unittest.TestCase): avg_cost = fluid.layers.mean(cost) prog_clip = prog.clone() - avg_cost_clip = prog_clip.block(0).var(avg_cost.name) p_g = fluid.backward.append_backward(loss=avg_cost) p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip) - with fluid.program_guard(main_program=prog_clip): + with fluid.program_guard( + main_program=prog_clip, startup_program=startup_program): fluid.clip.set_gradient_clip( fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP)) p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip) @@ -96,7 +95,7 @@ class TestGradientClip(unittest.TestCase): train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=8192), - batch_size=BATCH_SIZE) + batch_size=128) exe = fluid.Executor(place) feeder = fluid.DataFeeder(feed_list=[image, label], place=place) @@ -112,12 +111,12 @@ class TestGradientClip(unittest.TestCase): feed=feeder.feed(data), fetch_list=grad_clip_list) global_norm = 0 - for v in out[1:]: + for v in out: global_norm += np.sum(np.power(v, 2)) global_norm = np.sqrt(global_norm) global_norm_clip = 0 - for v in out_clip[1:]: + for v in out_clip: global_norm_clip += np.sum(np.power(v, 2)) global_norm_clip = np.sqrt(global_norm_clip) From 527946df490df1ad80152ffdc973178b9ae308f6 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 7 Dec 2018 18:08:29 +0800 Subject: [PATCH 177/719] add scope in prefetch --- .../distributed/parameter_prefetch.cc | 19 +++++++------- paddle/fluid/operators/lookup_table_op.h | 3 ++- paddle/fluid/operators/nce_op.h | 25 ++++++++++++++----- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index cf14538b1c..67b56bd218 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -102,8 +102,9 @@ static void MergeMultipleVarsIntoOneBySection( const std::string& out_name, const std::vector& out_var_names, const std::vector& height_section, const std::vector>& splited_ids, - const framework::ExecutionContext& context, framework::Scope* scope, - platform::DeviceContext* actual_ctx) { + const framework::ExecutionContext& context, + const framework::Scope& actual_scope, framework::Scope* scope, + platform::DeviceContext* actual_ctx, ) { PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), ""); auto cpu_place = platform::CPUPlace(); @@ -114,9 +115,9 @@ static void MergeMultipleVarsIntoOneBySection( id_to_offset[ids_vector[i]].push_back(i); } - auto& id_tensor = scope->FindVar(id_name)->Get(); + auto& id_tensor = actual_scope.FindVar(id_name)->Get(); auto* out_tensor = - scope->FindVar(out_name)->GetMutable(); + actual_scope.FindVar(out_name)->GetMutable(); auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); bool is_on_cpu_place = true; @@ -172,8 +173,9 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context) { - auto& local_scope = context.scope().NewScope(); + const framework::ExecutionContext& context, + const framework::Scope& scope) { + auto& local_scope = scope.NewScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -245,9 +247,8 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, - context, &local_scope, &actual_ctx); - - context.scope().DeleteScope(&local_scope); + context, scope, &local_scope, &actual_ctx); + scope.DeleteScope(&local_scope); } }; // namespace distributed diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 3a73a7637c..a7d0fd4856 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -59,7 +59,8 @@ class LookupTableKernel : public framework::OpKernel { // server #ifdef PADDLE_WITH_DISTRIBUTE operators::distributed::prefetch(id_name, out_name, table_names, epmap, - height_sections, context); + height_sections, context, + context.scope()); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 6567b6534a..9789e30388 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -170,18 +170,31 @@ class NCEKernel : public framework::OpKernel { auto height_sections = context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - local_scope.Var("Ids"); - local_scope.Var("Weight"); + auto *ids = local_scope.Var("Ids"); + auto *x_tensor = ids->GetMutable(); + x_tensor->mutable_data( + framework::make_ddim({static_cast(labels.size()), 1}), + context.GetPlace()); + // copy. + std::memcpy(x_tensor->data(), labels.data(), + labels.size() * sizeof(int64_t)); + + local_scope.Var("Weight@Local") + ->GetMutable() + ->mutable_data(context.GetPlace()); #ifdef PADDLE_WITH_DISTRIBUTE - operators::distributed::prefetch("Ids", "Weight", table_names, epmap, - height_sections, context); + operators::distributed::prefetch("Ids", "Weight@Local", table_names, + epmap, height_sections, context, + &local_scope); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " "parameter prefetch!"); +#endif - auto weight_mat = EigenMatrix::From(*(weight->Get())); + auto weight_mat = EigenMatrix::From( + (local_scope.Var("Weight@Local")->Get())); for (int64_t i = 0; i < sample_labels->numel(); ++i) { std::vector::iterator it = std::find(labels.begin(), labels.end(), sample_labels_data[i]); @@ -196,7 +209,7 @@ class NCEKernel : public framework::OpKernel { } context.scope().DeleteScope(&local_scope); -#endif + } else { auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); From eb8252466b11bdbea7abca6fd4cc5816f1c30830 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 09:15:23 +0000 Subject: [PATCH 178/719] polish code add unittest model containing while_op remove unnecessary codes test=develop --- paddle/fluid/framework/CMakeLists.txt | 4 +- paddle/fluid/framework/details/CMakeLists.txt | 5 +- .../details/eager_deletion_op_handle.cc | 48 +++--- .../details/eager_deletion_op_handle.h | 8 +- .../framework/details/eager_deletion_pass.cc | 18 +- .../fluid/framework/details/op_graph_view.cc | 1 + .../framework/details/reference_count_pass.cc | 156 +++++++++++------- .../details/reference_count_pass_helper.cc | 21 +++ .../details/reference_count_pass_helper.h | 4 +- .../scope_buffered_ssa_graph_executor.cc | 21 +-- .../scope_buffered_ssa_graph_executor.h | 6 - paddle/fluid/framework/executor.cc | 56 ++++--- paddle/fluid/framework/garbage_collector.cc | 89 ++++++++++ paddle/fluid/framework/garbage_collector.h | 153 ++++++----------- paddle/fluid/framework/parallel_executor.cc | 28 ++-- paddle/fluid/framework/scope.cc | 2 +- paddle/fluid/framework/tensor.h | 4 + .../unittests/test_eager_deletion_gru_net.py | 49 ++++++ .../unittests/test_eager_deletion_lstm_net.py | 111 +++++++++++++ 19 files changed, 516 insertions(+), 268 deletions(-) create mode 100644 paddle/fluid/framework/details/reference_count_pass_helper.cc create mode 100644 paddle/fluid/framework/garbage_collector.cc create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index c701a2ad63..f2361c5cea 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -72,6 +72,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) +cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory) + cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) @@ -164,7 +166,7 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper garbage_collector) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8049f5d3f7..a6c8ef408a 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -33,9 +33,10 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) +cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) +cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) -cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 54715fed8d..3b27415e43 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -26,8 +26,8 @@ namespace details { EagerDeletionOpHandle::EagerDeletionOpHandle( ir::Node *node, const Scope *scope, const platform::Place &place, - const std::unordered_set &var_names, - GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts) + const std::unordered_set &var_names, GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts) : OpHandleBase(node), scope_(scope), var_names_(var_names), @@ -35,9 +35,9 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( ref_cnts_(ref_cnts) { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { - dev_ctx_ = static_cast( + dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); - if (dynamic_cast *>(gc_)) { + if (dynamic_cast(gc_)) { platform::CUDADeviceGuard guard( boost::get(place).device); PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); @@ -61,10 +61,11 @@ std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } void EagerDeletionOpHandle::RunImpl() { auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); - std::vector tensors; + std::deque> garbages; for (auto &name : var_names_) { auto it = ref_cnts_->find(name); - if (it == ref_cnts_->end()) { + // Var not found, not reference count has not decreased to 0 + if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) { continue; } @@ -73,43 +74,44 @@ void EagerDeletionOpHandle::RunImpl() { continue; } + VLOG(2) << "Erase variable " << name; + if (var->IsType()) { - if (it->second.fetch_sub(1) == 1) { - tensors.emplace_back(var->GetMutable()); - } + garbages.emplace_back(var->GetMutable()->MoveMemory()); } else if (var->IsType()) { - if (it->second.fetch_sub(1) == 1) { - tensors.emplace_back(var->GetMutable()->mutable_value()); - } + garbages.emplace_back( + var->GetMutable()->mutable_value()->MoveMemory()); } else if (var->IsType()) { - if (it->second.fetch_sub(1) == 1) { - auto *tensor_arr = var->GetMutable(); - for (auto &t : *tensor_arr) { - tensors.emplace_back(&t); - } + auto *tensor_arr = var->GetMutable(); + for (auto &t : *tensor_arr) { + garbages.emplace_back(t.MoveMemory()); } + } else { + PADDLE_THROW("Type %s of %s is not supported eager deletion", + var->Type().name(), name); } } - if (!tensors.empty()) { - ClearTensors(tensors); + if (!garbages.empty()) { + ClearGarbages(&garbages); } } -void EagerDeletionOpHandle::ClearTensors(const std::vector &tensors) { +void EagerDeletionOpHandle::ClearGarbages( + std::deque> *garbages) { #ifdef PADDLE_WITH_CUDA if (event_) { auto compute_stream = dev_ctx_->stream(); auto callback_stream = - static_cast *>(gc_)->stream(); + reinterpret_cast(gc_)->stream(); auto callback_func = [=]() { PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); }; - gc_->Add(tensors, callback_func); + gc_->Add(std::move(*garbages), callback_func); } else { #endif - gc_->Add(tensors); + gc_->Add(std::move(*garbages)); #ifdef PADDLE_WITH_CUDA } #endif diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index d8de59cc4d..64867afad5 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -14,8 +14,8 @@ #pragma once +#include #include -#include #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" @@ -30,7 +30,7 @@ class EagerDeletionOpHandle : public OpHandleBase { EagerDeletionOpHandle(ir::Node *node, const Scope *scope, const platform::Place &place, const std::unordered_set &var_names, - GarbageCollector *gc, + GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts); ~EagerDeletionOpHandle(); @@ -41,11 +41,11 @@ class EagerDeletionOpHandle : public OpHandleBase { void RunImpl() override; private: - void ClearTensors(const std::vector &tensors); + void ClearGarbages(std::deque> *garbages); const Scope *scope_; std::unordered_set var_names_; - GarbageCollector *gc_; // not own + GarbageCollector *gc_; // not own AtomicReferenceCountMap *ref_cnts_; // not own #ifdef PADDLE_WITH_CUDA platform::CUDADeviceContext *dev_ctx_{nullptr}; diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 85991c71e6..4e42d0b497 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -28,17 +28,21 @@ namespace details { std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { - const auto &vars = graph->Get(kGraphVars); - auto &ref_cnts = Get>(kRuntimeReferenceCount); + PADDLE_ENFORCE(ref_cnts.empty(), + "kRuntimeReferenceCount should be initialized here!"); + + const auto &vars = graph->Get(kGraphVars); + ref_cnts.resize(vars.size()); + const auto &last_live_ops = Get>(kLastLiveOpsOfVars); - auto &gcs = Get(kGarbageCollector); + const auto &gcs = Get(kGarbageCollector); const auto &places = Get>(kAllPlaces); - ref_cnts = std::vector(vars.size()); - + // a reverse map of last_live_ops + // i.e., last op --> variable names which can be deleted. std::unordered_map> op_vars_map; @@ -58,8 +62,8 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( auto *eager_deletion_node = graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); auto *eager_deletion_op = new EagerDeletionOpHandle( - eager_deletion_node, op->GetScope(), op->GetPlace(), - std::move(var_names), gcs.at(places[op->GetScopeIdx()]).get(), + eager_deletion_node, op->GetScope(), op->GetPlace(), var_names, + gcs.at(places[op->GetScopeIdx()]).get(), &(ref_cnts[op->GetScopeIdx()])); auto it = std::find_if( diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc index b6b5ad42c4..d3865c2c29 100644 --- a/paddle/fluid/framework/details/op_graph_view.cc +++ b/paddle/fluid/framework/details/op_graph_view.cc @@ -42,6 +42,7 @@ void OpGraphView::Build(const std::vector &ops) { std::unordered_set OpGraphView::AllOps() const { std::unordered_set ret; + ret.reserve(preceding_ops_.size()); for (auto &pair : preceding_ops_) { ret.insert(pair.first); } diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index f2c9dfb524..13a042d8e6 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,15 +29,17 @@ namespace paddle { namespace framework { namespace details { -class OpRelationDetector { - public: +// A functor to shrink/remove operators who depend on other operators in a set +class ShrinkDepsOpFunctor { + private: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; - explicit OpRelationDetector(const std::vector &all_ops) + public: + explicit ShrinkDepsOpFunctor(const std::vector &all_ops) : graph_(all_ops) {} template - OpSet MaxNoDepOps(const OpSet &op_set) const { + OpSet operator()(const OpSet &op_set) const { using KeyType = typename OpSet::key_type; static_assert( std::is_base_of(ops[i])); + ret.emplace(static_cast(ops[i])); } } return ret; @@ -59,7 +61,7 @@ class OpRelationDetector { private: std::vector> GetRelations( - const std::vector ops) const { + const std::vector &ops) const { std::unordered_map op_to_idx; for (size_t i = 0; i < ops.size(); ++i) { PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); @@ -112,6 +114,10 @@ class OpRelationDetector { const OpGraphView graph_; }; +/** + * Find the nearest downstream computation op handle. If the op is a + * computation op, just return itself. + */ static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( OpHandleBase *op, size_t scope_idx) { std::queue q; @@ -134,33 +140,87 @@ static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( return nullptr; } +static std::unordered_set +ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, + const ShrinkDepsOpFunctor &shrink_func, + bool *ok) { + // stage one. Get last op for variable. + std::unordered_set candidates; + { + if (var->PendingOps().empty() && var->GeneratedOp()) { + // No operator depends on this variable. So the last operator is the op + // who generates this variable. + candidates.emplace(var->GeneratedOp()); + } else { + candidates = var->PendingOps(); + } + + // No pending ops or generated op is nullptr + if (candidates.empty()) { + *ok = false; + return {}; + } + } + + // stage two. Try to cast them to computation op. + // return (*ok=false) when failed. + // + // The reason why we cannot make any types of op handle to be the last lived + // op is: + // some op handle may operate on many DeviceContext, however, our garbage + // collector can only wait one DeviceContext for now. So currently, we wait + // the nearest compute op. + std::unordered_set computation_op; + { + for (auto *op : candidates) { + auto *compute_op = + FindNextComputationOpHandleOrReturnItself(op, scope_idx); + if (compute_op == nullptr) { + *ok = false; + return {}; + } + computation_op.emplace(compute_op); + } + } + + // stage three. Try to shrink computation op if they depend on each other. + // Get the smallest set of the most ops. + *ok = true; + return shrink_func(computation_op); +} + +static VarDesc *TryGetLatestVarDesc(const std::vector &vars) { + VarDesc *var_desc = nullptr; + std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + return var_desc; +} + std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { - auto &vars = graph->Get(kGraphVars); auto &ref_cnts = Get>(kGlobalReferenceCount); auto &last_live_ops_of_vars = Get>(kLastLiveOpsOfVars); - last_live_ops_of_vars = std::vector(vars.size()); - ref_cnts = std::vector(vars.size()); + PADDLE_ENFORCE(last_live_ops_of_vars.empty() && ref_cnts.empty(), + "Last Live Ops and Reference Counts of vars should be " + "initialized at here."); - OpRelationDetector detector(ir::FilterByNodeWrapper(*graph)); + const auto &vars = graph->Get(kGraphVars); - for (size_t i = 0; i < vars.size(); ++i) { - for (auto &name_var_pair : vars[i]) { - if (name_var_pair.second.empty()) { - continue; - } + last_live_ops_of_vars.resize(vars.size()); + ref_cnts.resize(vars.size()); - const std::string &var_name = name_var_pair.first; - auto *last_ver_var = name_var_pair.second.back(); + ShrinkDepsOpFunctor shrink_func( + ir::FilterByNodeWrapper(*graph)); - VarDesc *var_desc = nullptr; - std::find_if(name_var_pair.second.rbegin(), name_var_pair.second.rend(), - [&](VarHandle *var_handle) -> bool { - var_desc = var_handle->Node()->Var(); - return var_desc != nullptr; - }); + for (size_t i = 0; i < vars.size(); ++i) { + for (auto &name_var_pair : vars[i]) { + // Whether this variable can be reused or deleted? If not, we do not + // compute reference counts and dependencies. + VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second); if (var_desc == nullptr || var_desc->Persistable()) { continue; @@ -170,50 +230,20 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( if (var_type != proto::VarType::LOD_TENSOR && var_type != proto::VarType::SELECTED_ROWS && var_type != proto::VarType::LOD_TENSOR_ARRAY) { + // Var type cannot be deleted continue; } - std::unordered_set last_live_op; - auto add_last_live_op = [&](OpHandleBase *op) -> bool { - auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); - if (compute_op) { - last_live_op.insert(compute_op); - return true; - } else { - return false; - } - }; - - bool can_delete = false; - auto &pending_ops = last_ver_var->PendingOps(); - if (pending_ops.empty()) { - auto *generated_op = last_ver_var->GeneratedOp(); - if (generated_op && add_last_live_op(generated_op)) { - can_delete = true; - } - } else { - can_delete = true; - for (auto *pending_op : pending_ops) { - if (!add_last_live_op(pending_op)) { - can_delete = false; - break; - } - } - } - - if (can_delete) { - size_t original_size = last_live_op.size(); - last_live_op = detector.MaxNoDepOps(last_live_op); - if (last_live_op.size() != original_size) { - VLOG(10) << "Shrink last living op number of " << var_name << " from " - << original_size << " to " << last_live_op.size(); - } - - PADDLE_ENFORCE(!last_live_op.empty(), - "Last living ops of %s cannot be empty", var_name); + bool ok; + auto result = ExtractComputationOpFromLastLivedVar( + name_var_pair.second.back(), i, shrink_func, &ok); - ref_cnts[i].emplace(var_name, last_live_op.size()); - last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + if (ok) { + auto &var_name = name_var_pair.first; + PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty", + var_name); + ref_cnts[i].emplace(var_name, result.size()); + last_live_ops_of_vars[i].emplace(var_name, std::move(result)); } } } diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.cc b/paddle/fluid/framework/details/reference_count_pass_helper.cc new file mode 100644 index 0000000000..89bd08c2d0 --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" + +namespace paddle { +namespace framework { +namespace details {} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index eb534f9701..1c083dbf00 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -18,10 +18,10 @@ #include #include #include +#include #include #include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/tensor.h" namespace paddle { namespace framework { @@ -35,7 +35,7 @@ using AtomicReferenceCountMap = std::unordered_map>; using GarbageCollectorMap = - std::map>>; + std::map>; const char kGlobalReferenceCount[] = "global_reference_count"; const char kRuntimeReferenceCount[] = "runtime_reference_count"; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index b8775fc329..57f6fc66c5 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -30,20 +30,7 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), var_infos_(std::move(var_infos)), - places_(std::move(places)) { - if (Graph().Has(details::kGarbageCollector)) { - gc_ = &(Graph().Get(details::kGarbageCollector)); - } -} - -void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() { - if (gc_) { - for (auto &gc_pair : *gc_) { - gc_pair.second->Wait(); - gc_pair.second->Reset(); - } - } -} + places_(std::move(places)) {} FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { @@ -83,19 +70,15 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams - for (auto &p : places_) { + for (auto p : places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - WaitAllGarbageCollectors(); for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } - } else { - WaitAllGarbageCollectors(); } - if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 6086a219e0..5e87e0bf50 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -21,11 +21,9 @@ #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" - namespace paddle { namespace framework { namespace details { @@ -50,8 +48,6 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector& fetch_tensors) override; private: - void WaitAllGarbageCollectors(); - size_t drop_scope_counter_{0}; ExecutionStrategy strategy_; @@ -59,8 +55,6 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::vector var_infos_; std::vector places_; - - GarbageCollectorMap* gc_{nullptr}; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 04425a5983..767bbb524f 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/executor.h" +#include #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" @@ -83,31 +84,37 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { } static void DeleteUnusedTensors( - const Scope& scope, const OperatorBase* op, GarbageCollector* gc, + const Scope& scope, const OperatorBase* op, GarbageCollector* gc, std::unordered_map* ref_cnts) { - std::unordered_set erase_tensors; + std::deque> garbages; auto handler = [&](const VariableNameMap& name_map) { for (auto& name_pair : name_map) { for (auto& name : name_pair.second) { auto it = ref_cnts->find(name); if (it == ref_cnts->end()) continue; - if (--(it->second) == 0) { - auto* var = scope.FindVar(name); - if (var != nullptr) { - VLOG(2) << "Erase tensor \'" << name << "\'"; - if (var->IsType()) { - erase_tensors.insert(var->GetMutable()); - } else if (var->IsType()) { - erase_tensors.insert( - var->GetMutable()->mutable_value()); - } else if (var->IsType()) { - auto* lod_tensor_arr = var->GetMutable(); - for (auto& t : *lod_tensor_arr) { - erase_tensors.insert(&t); - } - } + if (--(it->second) != 0) { + continue; + } + auto* var = scope.FindVar(name); + if (var != nullptr) { + continue; + } + + VLOG(2) << "Erase variable " << name; + if (var->IsType()) { + garbages.emplace_back(var->GetMutable()->MoveMemory()); + } else if (var->IsType()) { + garbages.emplace_back( + var->GetMutable()->mutable_value()->MoveMemory()); + } else if (var->IsType()) { + auto* lod_tensor_arr = var->GetMutable(); + for (auto& t : *lod_tensor_arr) { + garbages.emplace_back(t.MoveMemory()); } + } else { + PADDLE_THROW("Type %s of %s is not supported eager deletion", + var->Type().name(), name); } } } @@ -116,8 +123,8 @@ static void DeleteUnusedTensors( handler(op->Inputs()); handler(op->Outputs()); - if (!erase_tensors.empty()) { - gc->Add(erase_tensors); + if (!garbages.empty()) { + gc->Add(std::move(garbages)); } } @@ -411,22 +418,22 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } int64_t max_memory_size = GetEagerDeletionThreshold(); - std::unique_ptr> gc; + std::unique_ptr gc; if (max_memory_size >= 0) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { - gc.reset(new UnsafeFastGPUGarbageCollector( + gc.reset(new UnsafeFastGPUGarbageCollector( boost::get(place_), max_memory_size)); } else { - gc.reset(new DefaultStreamGarbageCollector( + gc.reset(new DefaultStreamGarbageCollector( boost::get(place_), max_memory_size)); } } else if (platform::is_cpu_place(place_)) { #endif - gc.reset(new CPUGarbageCollector( - boost::get(place_), max_memory_size)); + gc.reset(new CPUGarbageCollector(boost::get(place_), + max_memory_size)); #ifdef PADDLE_WITH_CUDA } #endif @@ -442,7 +449,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } platform::DeviceContextPool::Instance().Get(place_)->Wait(); - if (gc) gc->Wait(); if (local_scope != scope) { scope->DeleteScope(local_scope); diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc new file mode 100644 index 0000000000..54d9d0dc01 --- /dev/null +++ b/paddle/fluid/framework/garbage_collector.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif +#include "paddle/fluid/framework/garbage_collector.h" + +namespace paddle { +namespace framework { + +GarbageCollector::GarbageCollector(const platform::Place &place, + size_t max_memory_size) + : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { + garbages_.reset(new GarbageQueue()); + dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); +} + +CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void CPUGarbageCollector::ClearCallback(const std::function &callback) { + callback(); +} + +#ifdef PADDLE_WITH_CUDA +UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( + const platform::CUDAPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void UnsafeFastGPUGarbageCollector::ClearCallback( + const std::function &callback) { + callback(); +} + +DefaultStreamGarbageCollector::DefaultStreamGarbageCollector( + const platform::CUDAPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void DefaultStreamGarbageCollector::Wait() const { + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void DefaultStreamGarbageCollector::ClearCallback( + const std::function &callback) { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); +} + +StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) { + platform::CUDADeviceGuard guard(place.device); + PADDLE_ENFORCE(cudaStreamCreate(&stream_)); + callback_manager_.reset(new platform::StreamCallbackManager(stream_)); +} + +StreamGarbageCollector::~StreamGarbageCollector() { + auto place = boost::get(this->dev_ctx_->GetPlace()); + platform::CUDADeviceGuard guard(place.device); + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); +} + +cudaStream_t StreamGarbageCollector::stream() const { return stream_; } + +void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); } + +void StreamGarbageCollector::ClearCallback( + const std::function &callback) { + callback_manager_->AddCallback(callback); +} +#endif +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 1382e0d461..2768671029 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -14,160 +14,83 @@ #pragma once -#include #include #include #include #include // NOLINT -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_device_guard.h" -#endif #include "paddle/fluid/platform/device_context.h" namespace paddle { namespace framework { -// T should have memory_size() and clear() method -template class GarbageCollector { public: - GarbageCollector(const platform::Place &place, size_t max_memory_size) - : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { - garbages_.reset(new std::deque()); - dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); - } + using GarbageQueue = std::deque>; - virtual ~GarbageCollector() {} + GarbageCollector(const platform::Place &place, size_t max_memory_size); - size_t NumOfGarbages() const { - std::lock_guard guard(mutex_); - return garbages_->size(); - } + virtual ~GarbageCollector() = default; - void Reset() { - std::lock_guard guard(mutex_); - garbages_.reset(new std::deque()); - cur_memory_size_ = 0; - } + virtual void Wait() const {} template - void Add(const Container &objs) { - Add(objs, []() {}); - } + void Add(Container &&objs); template - void Add(const Container &objs, Callback &&callback) { - std::deque *clear_deque = nullptr; - { - std::lock_guard guard(mutex_); - for (auto *obj : objs) { - garbages_->push_back(obj); - cur_memory_size_ += obj->memory_size(); - } - if (cur_memory_size_ >= max_memory_size_) { - cur_memory_size_ = 0; - clear_deque = garbages_.release(); - garbages_.reset(new std::deque()); - } - } - - if (clear_deque != nullptr) { - callback(); - ClearCallback([clear_deque]() { - for (auto *obj : *clear_deque) obj->clear(); - delete clear_deque; - }); - } - } - - virtual void Wait() const {} + void Add(Container &&objs, Callback &&callback); protected: virtual void ClearCallback(const std::function &callback) = 0; platform::DeviceContext *dev_ctx_; - std::unique_ptr> garbages_; + std::unique_ptr garbages_; mutable std::mutex mutex_; const size_t max_memory_size_; - size_t cur_memory_size_ = 0; + size_t cur_memory_size_{0}; }; -template -class CPUGarbageCollector : public GarbageCollector { +class CPUGarbageCollector : public GarbageCollector { public: - CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size); protected: - void ClearCallback(const std::function &callback) override { - callback(); - } + void ClearCallback(const std::function &callback) override; }; #ifdef PADDLE_WITH_CUDA -template -class UnsafeFastGPUGarbageCollector : public GarbageCollector { +class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + size_t max_memory_size); protected: - void ClearCallback(const std::function &callback) override { - callback(); - } + void ClearCallback(const std::function &callback) override; }; -template -class DefaultStreamGarbageCollector : public GarbageCollector { +class DefaultStreamGarbageCollector : public GarbageCollector { public: DefaultStreamGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + size_t max_memory_size); - cudaStream_t stream() const { - return static_cast(this->dev_ctx_) - ->stream(); - } - - void Wait() const override { - static_cast(this->dev_ctx_) - ->WaitStreamCallback(); - } + void Wait() const override; protected: - void ClearCallback(const std::function &callback) override { - static_cast(this->dev_ctx_) - ->AddStreamCallback(callback); - } + void ClearCallback(const std::function &callback) override; }; -template -class StreamGarbageCollector : public GarbageCollector { +class StreamGarbageCollector : public GarbageCollector { public: StreamGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) { - platform::CUDADeviceGuard guard(place.device); - PADDLE_ENFORCE(cudaStreamCreate(&stream_)); - callback_manager_.reset(new platform::StreamCallbackManager(stream_)); - } + size_t max_memory_size); - ~StreamGarbageCollector() { - auto place = boost::get(this->dev_ctx_->GetPlace()); - platform::CUDADeviceGuard guard(place.device); - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - PADDLE_ENFORCE(cudaStreamDestroy(stream_)); - } + ~StreamGarbageCollector(); - void Wait() const override { callback_manager_->Wait(); } + void Wait() const override; - cudaStream_t stream() const { return stream_; } + cudaStream_t stream() const; protected: - void ClearCallback(const std::function &callback) override { - callback_manager_->AddCallback(callback); - } + void ClearCallback(const std::function &callback) override; private: cudaStream_t stream_; @@ -175,5 +98,33 @@ class StreamGarbageCollector : public GarbageCollector { }; #endif +template +void GarbageCollector::Add(Container &&objs) { + Add(std::forward(objs), []() {}); +} + +template +void GarbageCollector::Add(Container &&objs, Callback &&callback) { + GarbageQueue *garbage_queue = nullptr; + { + std::lock_guard guard(mutex_); + for (auto &obj : objs) { + if (!obj) continue; + cur_memory_size_ += obj->size(); + garbages_->push_back(std::move(obj)); + } + if (cur_memory_size_ >= max_memory_size_) { + cur_memory_size_ = 0; + garbage_queue = garbages_.release(); + garbages_.reset(new GarbageQueue()); + } + } + + if (garbage_queue) { + callback(); + ClearCallback([garbage_queue]() { delete garbage_queue; }); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e51b1f1f73..7458b69af8 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,29 +97,31 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( if (gcs_.count(place) > 0) { continue; } - GarbageCollector *gc = nullptr; + std::unique_ptr gc; #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { if (IsFastEagerDeletionModeEnabled()) { - gc = new UnsafeFastGPUGarbageCollector( - boost::get(place), max_memory_size); + gc.reset(new UnsafeFastGPUGarbageCollector( + boost::get(place), max_memory_size)); } else { - gc = new StreamGarbageCollector( - boost::get(place), max_memory_size); + gc.reset(new StreamGarbageCollector( + boost::get(place), max_memory_size)); } VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; - } else if (platform::is_cpu_place(place)) { + } else { #endif - gc = new CPUGarbageCollector( - boost::get(place), max_memory_size); - VLOG(10) << "Created GarbageCollector at " << place; + if (platform::is_cpu_place(place)) { + gc.reset(new CPUGarbageCollector(boost::get(place), + max_memory_size)); + VLOG(10) << "Created GarbageCollector at " << place; + } else { + PADDLE_THROW("Unsupported place for garbage collection"); + } #ifdef PADDLE_WITH_CUDA } #endif - if (gc) { - gcs_[place] = std::unique_ptr>(gc); - } + gcs_.emplace(place, std::move(gc)); } if (!gcs_.empty()) { @@ -144,8 +146,6 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; - - graph->SetNotOwned(details::kGarbageCollector, &gcs_); } return graph; diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index cb3b6cdc3e..6fa5e99f9f 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -38,7 +38,7 @@ DEFINE_double( "Memory size threshold (GB) when the garbage collector clear tensors." "Disabled when this value is less than 0"); -DEFINE_bool(fast_eager_deletion_mode, true, +DEFINE_bool(fast_eager_deletion_mode, false, "Fast eager deletion mode. If enabled, memory would release " "immediately without waiting GPU kernel ends."); diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 71e8badd4b..9f7027f5ae 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -158,6 +158,10 @@ class Tensor { const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } + std::shared_ptr MoveMemory() { + return std::move(holder_); + } + private: /*! holds the memory block if allocated. */ std::shared_ptr holder_; diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py new file mode 100644 index 0000000000..1ec174544c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py @@ -0,0 +1,49 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from test_eager_deletion_lstm_net import TestBase +import paddle.fluid as fluid + + +def gru_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=400.0): + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3) + gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False) + gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max') + gru_max_tanh = fluid.layers.tanh(gru_max) + fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + return avg_cost + + +class GRUTest(TestBase): + def setUp(self): + self.net = gru_net + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py new file mode 100644 index 0000000000..431765bff2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py @@ -0,0 +1,111 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0' +os.environ['CPU_NUM'] = '2' + +import six +import unittest + +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid + + +def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): + if use_cuda and not core.is_compiled_with_cuda(): + print('Skip use_cuda=True because Paddle is not compiled with cuda') + return + + word_dict = paddle.dataset.imdb.word_dict() + train_reader = paddle.batch( + paddle.dataset.imdb.train(word_dict), batch_size=batch_size) + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + cost = network(data, label, len(word_dict)) + optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) + optimizer.minimize(cost) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + reader = feeder.decorate_reader( + train_reader, multi_devices=use_parallel_executor) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if use_parallel_executor: + train_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=cost.name) + fetch_list = [cost.name] + else: + train_exe = exe + fetch_list = [cost] + + for pass_id in six.moves.xrange(pass_num): + batch_id = 0 + for data in reader(): + train_exe.run(feed=data, + fetch_list=fetch_list if batch_id % 4 == 0 else []) + batch_id += 1 + if batch_id > 16: + break + + +def lstm_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=30.0): + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4) + lstm_h, c = fluid.layers.dynamic_lstm( + input=fc0, size=hid_dim * 4, is_reverse=False) + lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') + lstm_max_tanh = fluid.layers.tanh(lstm_max) + fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + return avg_cost + + +class TestBase(unittest.TestCase): + def setUp(self): + self.net = lstm_net + + def test_network(self): + for use_cuda in [True, False]: + for use_parallel_executor in [False, True]: + print('network: {}, use_cuda: {}, use_parallel_executor: {}'. + format(self.net.__name__, use_cuda, + use_parallel_executor)) + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(core.Scope()): + train(self.net, use_cuda, use_parallel_executor) + + +if __name__ == "__main__": + unittest.main() From f1fb64b17fb0290e7e1f110069de19b0ea0d0474 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 7 Dec 2018 21:52:27 +0800 Subject: [PATCH 179/719] Add reduce sparse tensor feature. (#14757) --- paddle/fluid/framework/details/CMakeLists.txt | 16 +- .../fluid/framework/details/build_strategy.cc | 19 ++- .../fluid/framework/details/build_strategy.h | 2 + .../framework/details/reduce_and_gather.h | 2 +- .../framework/details/reduce_op_handle.cc | 144 +++++++++++++++++- .../framework/details/reduce_op_handle.h | 39 +++++ .../operators/distributed/CMakeLists.txt | 14 +- .../distributed/collective_client.cc | 59 +++++++ .../operators/distributed/collective_client.h | 93 +++++++++++ .../distributed/collective_server.cc | 74 +++++++++ .../operators/distributed/collective_server.h | 110 +++++++++++++ .../distributed/collective_server_test.cc | 115 ++++++++++++++ .../operators/distributed/grpc_client.cc | 59 ++++++- .../fluid/operators/distributed/grpc_client.h | 24 ++- .../operators/distributed/grpc_server.cc | 102 ++++++++++++- .../operators/distributed/grpc_service.h | 8 +- .../operators/distributed/request_handler.h | 2 + .../fluid/operators/distributed/rpc_client.h | 12 +- .../fluid/operators/distributed/rpc_server.cc | 90 +++++++++++ .../fluid/operators/distributed/rpc_server.h | 31 ++++ .../operators/distributed/send_recv.proto.in | 3 + paddle/fluid/operators/math/softmax_impl.h | 1 + paddle/fluid/pybind/pybind.cc | 12 ++ python/paddle/fluid/framework.py | 1 + python/paddle/fluid/parallel_executor.py | 8 + .../fluid/transpiler/distribute_transpiler.py | 1 + 26 files changed, 1013 insertions(+), 28 deletions(-) create mode 100644 paddle/fluid/operators/distributed/collective_client.cc create mode 100644 paddle/fluid/operators/distributed/collective_client.h create mode 100644 paddle/fluid/operators/distributed/collective_server.cc create mode 100644 paddle/fluid/operators/distributed/collective_server.h create mode 100644 paddle/fluid/operators/distributed/collective_server_test.cc diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 93288936fe..2f76cb714f 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -15,14 +15,26 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro if(WITH_GPU) nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda variable_visitor) - nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda) + if(WITH_DISTRIBUTE) + nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim dynload_cuda selected_rows_functor sendrecvop_grpc) + else() + nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim dynload_cuda selected_rows_functor) + endif() nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) else() cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory variable_visitor) - cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim) + if(WITH_DISTRIBUTE) + cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim selected_rows_functor sendrecvop_grpc) + else() + cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim selected_rows_functor) + endif() cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) endif() diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 1e1b945f63..d8526b3f24 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -58,6 +58,17 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } } + CollectiveContext *context = CollectiveContext::GetInstance(); + context->endpoints_ = strategy_.trainers_endpoints_; + context->trainer_id_ = strategy_.trainer_id_; + PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0"); + if (strategy_.trainer_id_ > 0) { + PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) < + strategy_.trainers_endpoints_.size(), + "trainer_id_ < endpoints_ size"); + } + VLOG(1) << "CollectiveContext:" << context->String(); + // Convert graph to run on multi-devices. auto multi_devices_pass = AppendPass("multi_devices_pass"); multi_devices_pass->SetNotOwned("strategy", @@ -135,16 +146,16 @@ std::unique_ptr BuildStrategy::Apply( pass->SetNotOwned("nccl_ctxs", nctx); #endif } else if (pass->Type() == "sequential_execution_pass") { - VLOG(1) << "set enable_sequential_execution:" - << enable_sequential_execution_; + LOG(INFO) << "set enable_sequential_execution:" + << enable_sequential_execution_; pass->Erase(kAllOpDescs); pass->Set>( kAllOpDescs, new std::vector(main_program.Block(0).AllOps())); } else if (pass->Type() == "all_reduce_deps_pass") { - VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) - << ", num_trainers:" << num_trainers_; + LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) + << ", num_trainers:" << num_trainers_; pass->Erase(kAllOpDescs); pass->Set>( diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 9f0a259128..c97be16957 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -74,6 +74,8 @@ struct BuildStrategy { bool fuse_broadcast_op_{false}; int num_trainers_{1}; + int trainer_id_{0}; + std::vector trainers_endpoints_; bool remove_unnecessary_lock_{false}; // NOTE: diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index bd6153c0c7..2e5256fbd4 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -53,7 +53,7 @@ struct ReduceLoDTensor { } }; -inline void GatherSelectedRows( +inline void GatherLocalSelectedRows( const std::vector &src_selecte_rows_, const std::vector &in_places, const std::map &dev_ctxes, diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index c9f1107aea..cb864848b9 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -16,6 +16,12 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/collective_client.h" +#include "paddle/fluid/operators/distributed/collective_server.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#endif +#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/profiler.h" DEFINE_bool( @@ -26,6 +32,112 @@ namespace paddle { namespace framework { namespace details { +std::once_flag CollectiveContext::init_flag_; +std::unique_ptr CollectiveContext::context_; + +static inline std::string GetRemoteVarName(const std::string &var_name, + int trainer_id) { + return string::Sprintf("%s_merged_tmp@trainer_%d", var_name, trainer_id); +} + +void ReduceOpHandle::Wait( + const std::map &dev_ctxes) { + // TODO(gongwb): use event wait? + for (auto &dev_ctx : dev_ctxes) { + dev_ctx.second->Wait(); + } +} + +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE +template +void ReduceOpHandle::GatherSelectedRows( + const std::vector &src_selected_rows, + const std::vector &in_places, + const std::map &dev_ctxes, + VarHandle *out_var_handle, const platform::Place &out_place, + SelectedRows *dst_selected_rows) { + const CollectiveContext &collective_context = + *CollectiveContext::GetInstance(); + + // 1. gather local selected rows, merge them + std::string gathered_var_name = out_var_handle->name_ + "_gathered_tmp"; + auto scope = local_scopes_.at(out_var_handle->scope_idx_); + auto gathered_var_mid = scope->Var(gathered_var_name); + auto gathered_select_rows = + gathered_var_mid->GetMutable(); + GatherLocalSelectedRows(src_selected_rows, in_places, dev_ctxes, out_place, + gathered_select_rows); + // FIXME(gongwb): remove this Wait. + Wait(dev_ctxes); + + // merge them + auto merged_dev_ctx = dynamic_cast(dev_ctxes.at(out_place)); + std::string merged_var_name = + GetRemoteVarName(out_var_handle->name_, collective_context.trainer_id_); + auto merged_select_rows = + scope->Var(merged_var_name)->GetMutable(); + operators::math::scatter::MergeAdd merge_func; + merge_func(*merged_dev_ctx, *gathered_select_rows, merged_select_rows); + + // 2. start collective server if it doesn't exist + operators::distributed::CollectiveServer *server = + operators::distributed::CollectiveServer::GetInstance( + collective_context.endpoints_[collective_context.trainer_id_], + collective_context.endpoints_.size() - 1); + + auto rpc_server = server->GetRPCServer(); + rpc_server->RegisterVar(merged_var_name, + operators::distributed::kRequestGetMonomerVariable, + scope, merged_dev_ctx); + + // 3. gather them from all remote nodes. + std::vector remote; + operators::distributed::CollectiveClient *client = + operators::distributed::CollectiveClient::GetInstance(); + + std::vector vars; + for (unsigned int i = 0; i < collective_context.endpoints_.size(); i++) { + if (i == (unsigned)collective_context.trainer_id_) continue; + + operators::distributed::RemoteVar var; + var.trainer_id_ = i; + var.var_name_ = GetRemoteVarName(out_var_handle->name_, i); + var.ep_ = collective_context.endpoints_[i]; + + vars.push_back(var); + VLOG(4) << "gather from:" << var.String(); + } + + // erase gathered vars + merged_dev_ctx->Wait(); + scope->EraseVars(std::vector{gathered_var_name}); + + PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope)); + PADDLE_ENFORCE(remote.size() == vars.size()); + + // 4. merged local selected rows. + std::vector all; + all.resize(collective_context.endpoints_.size()); + for (auto v : vars) { + all[v.trainer_id_] = + scope->FindVar(v.var_name_)->GetMutable(); + } + all[collective_context.trainer_id_] = merged_select_rows; + + merge_func(*merged_dev_ctx, all, dst_selected_rows); + + rpc_server->WaitVarBarrier(merged_var_name); + rpc_server->ClearVar(merged_var_name); + + // 5. clear mid vars + std::vector tmp_vars{merged_var_name}; + for (auto r : vars) { + tmp_vars.push_back(r.var_name_); + } + scope->EraseVars(tmp_vars); +} +#endif + void ReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); @@ -90,8 +202,36 @@ void ReduceOpHandle::RunImpl() { this->RunAndRecordEvent([&] { std::vector in_selected_rows = GetInputValues(in_var_handles, var_scopes); - GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p, - out_var->GetMutable()); + + const CollectiveContext &collective_context = + *CollectiveContext::GetInstance(); + VLOG(10) << "GatherSelectedRows CollectiveContext:" + << collective_context.String(); + + // TODO(gongwb): add cpu support + if (collective_context.endpoints_.size() <= 1 || + is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) { + GatherLocalSelectedRows(in_selected_rows, in_places, dev_ctxes_, + t_out_p, + out_var->GetMutable()); + return; + } + +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE + if (framework::IsType(in_selected_rows[0]->value().type())) { + GatherSelectedRows( + in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, + out_var->GetMutable()); + } else if (framework::IsType( + in_selected_rows[0]->value().type())) { + GatherSelectedRows( + in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, + out_var->GetMutable()); + } else { + PADDLE_ENFORCE(false, + "only support double or float when gahter SelectedRows"); + } +#endif }); } else { std::vector lod_tensors = diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 846839029c..5491f00f45 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -30,6 +30,32 @@ namespace paddle { namespace framework { namespace details { +struct CollectiveContext { + std::vector endpoints_; + int trainer_id_{0}; + + std::string String() const { + std::stringstream ss; + ss << "endpoints_:"; + for (auto e : endpoints_) { + ss << e << ","; + } + + ss << "trainer_id_:" << trainer_id_; + + return ss.str(); + } + + static CollectiveContext *GetInstance() { + std::call_once(init_flag_, + [&]() { context_.reset(new CollectiveContext()); }); + return context_.get(); + } + + private: + static std::once_flag init_flag_; + static std::unique_ptr context_; +}; struct ReduceOpHandle : public OpHandleBase { std::vector local_scopes_; @@ -64,6 +90,19 @@ struct ReduceOpHandle : public OpHandleBase { protected: void RunImpl() override; +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE + template + void GatherSelectedRows( + const std::vector &src_selecte_rows_, + const std::vector &in_places, + const std::map &dev_ctxes, + VarHandle *out_var_handle, const platform::Place &out_place, + SelectedRows *dst_selecte_rows); +#endif + + void Wait( + const std::map &dev_ctxes); + template std::vector GetInputValues( const std::vector &in_var_handles, diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 36979de68f..101dbe9c89 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -13,16 +13,26 @@ set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor if(WITH_GRPC) grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc PROTO send_recv.proto - DEPS lod_tensor selected_rows memory) + DEPS lod_tensor selected_rows_functor memory) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(grpc_serde_test SRCS grpc_serde_test.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) + cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) + + if(WITH_GPU) + cc_test(collective_server_test SRCS collective_server_test.cc + DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + selected_rows_functor scope math_function SERIAL) + endif() + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory) else() set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc new file mode 100644 index 0000000000..6d3f534311 --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_client.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include // NOLINT +#include +#include "gflags/gflags.h" + +#include "paddle/fluid/operators/distributed/collective_client.h" + +DECLARE_int32(rpc_deadline); + +namespace paddle { +namespace operators { +namespace distributed { +std::once_flag CollectiveClient::init_flag_; +std::unique_ptr CollectiveClient::client_(nullptr); + +bool CollectiveClient::Gather(const std::vector& remote_vars, + std::vector* dst, + const platform::DeviceContext& ctx, + framework::Scope* scope, int64_t time_out) { + for (auto r : remote_vars) { + VLOG(50) << "begin gather from ep:" << r.String(); + scope->Var(r.var_name_)->GetMutable(); + VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable( + r.ep_, ctx, *scope, r.var_name_, time_out); + } + + rpc_client_->Wait(); + + for (auto r : remote_vars) { + auto select_rows = + scope->FindVar(r.var_name_)->GetMutable(); + dst->push_back(select_rows); + + VLOG(4) << "gather from ep:" << r.String() + << ", select_rows:" << GetSelectedRowsInfo(*select_rows); + + rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_); + } + + rpc_client_->Wait(); + return true; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h new file mode 100644 index 0000000000..53b03c531a --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_client.h @@ -0,0 +1,93 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT +#include +#include +#include "gflags/gflags.h" + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +DECLARE_int32(rpc_deadline); + +namespace paddle { +namespace operators { +namespace distributed { + +inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) { + std::stringstream ss; + ss << ", height:" << slr.height() << ", rows:["; + for (unsigned int i = 0; i < slr.rows().size(); i++) { + if (i != slr.rows().size() - 1) { + ss << slr.rows()[i] << ","; + } else { + ss << slr.rows()[i]; + } + } + ss << "], dims:" << slr.value().dims(); + return ss.str(); +} + +struct RemoteVar { + std::string ep_; + std::string var_name_; + int trainer_id_{0}; + + std::string String() { + std::stringstream ss; + ss << "ep:" << ep_ << ", var_name:" << var_name_ + << ", trainer_id:" << trainer_id_; + + return ss.str(); + } +}; + +class CollectiveClient { + public: + CollectiveClient() { + rpc_client_.reset(new RPCCLIENT_T()); + rpc_client_->InitImpl(); + } + virtual ~CollectiveClient() {} + + // note this function will retain the rank order. + bool Gather(const std::vector& remote_vars, + std::vector* dst, + const platform::DeviceContext& ctx, framework::Scope* scope, + int64_t time_out = FLAGS_rpc_deadline); + + static CollectiveClient* GetInstance() { + std::call_once(init_flag_, [&]() { + if (client_.get() == nullptr) { + client_.reset(new CollectiveClient()); + } + }); + return client_.get(); + } + + private: + std::unique_ptr rpc_client_; + + static std::once_flag init_flag_; + static std::unique_ptr client_; +}; +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc new file mode 100644 index 0000000000..c95652400c --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_server.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include // for removing the port file +#include +#include +#include +#include // NOLINT +#include + +#include "paddle/fluid/operators/distributed/collective_server.h" + +DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get"); + +namespace paddle { +namespace operators { +namespace distributed { + +std::once_flag CollectiveServer::init_flag_; +std::shared_ptr CollectiveServer::collective_server_(nullptr); + +CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) { + VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in; + rpc_server_.reset(new RPCSERVER_T(end_point, fan_in)); +} + +void CollectiveServer::Stop() { + rpc_server_->ShutDown(); + server_thread_->join(); + loop_thread_->join(); +} + +void CollectiveServer::StartServer() { + get_monomer_handler_.reset(new GetMonomerHandler()); + get_monomer_handler_->SetRPCServer(rpc_server_.get()); + + get_barrier_handler_.reset(new GetMonomerBarrierHandler()); + get_barrier_handler_->SetRPCServer(rpc_server_.get()); + + rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable, + get_monomer_handler_.get(), + FLAGS_collective_get_thread_num); + rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier, + get_barrier_handler_.get(), 1); + + server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); })); + rpc_server_->WaitServerReady(); + + loop_thread_.reset(new std::thread([&]() { + while (true) { + if (rpc_server_->IsExit()) { + LOG(WARNING) << "get exit!rpc_processor break!"; + break; + } + sleep(1); + } + VLOG(1) << "CollectiveServer loop_thread end"; + })); +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h new file mode 100644 index 0000000000..a23dc18b4d --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_server.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "gflags/gflags.h" + +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class CollectiveServer; + +class GetMonomerHandler final : public RequestHandler { + public: + GetMonomerHandler() : RequestHandler(true) {} + virtual ~GetMonomerHandler() {} + bool Handle(const std::string& var_name, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override { + VLOG(50) << "GetMonomerHandler recv " << var_name; + + *outvar = scope->FindVar(var_name); + PADDLE_ENFORCE(outvar != nullptr, "%s not found", var_name); + + return true; + } +}; + +class GetMonomerBarrierHandler final : public RequestHandler { + public: + GetMonomerBarrierHandler() : RequestHandler(true) {} + virtual ~GetMonomerBarrierHandler() {} + bool Handle(const std::string& var_name, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override { + VLOG(50) << "GetMonomerHandler recv " << var_name; + + rpc_server_->IncreaseVarBarrier(var_name); + + return true; + } +}; + +class CollectiveServer final { + public: + explicit CollectiveServer(const std::string& end_point, int fan_in); + + virtual ~CollectiveServer() {} + + void StartServer(); + + static CollectiveServer* GetInstance(const std::string& end_point, + int fan_in) { + std::call_once(init_flag_, [&]() { + if (collective_server_.get() == nullptr) { + collective_server_.reset(new CollectiveServer(end_point, fan_in)); + collective_server_->StartServer(); + } + }); + + return collective_server_.get(); + } + + std::shared_ptr GetRPCServer() { return rpc_server_; } + + void Stop(); + + private: + std::unique_ptr get_monomer_handler_; + std::unique_ptr get_barrier_handler_; + + std::shared_ptr rpc_server_; + std::shared_ptr server_thread_; + std::shared_ptr loop_thread_; + + bool ready_{false}; + + static std::once_flag init_flag_; + static std::shared_ptr collective_server_; +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc new file mode 100644 index 0000000000..0a9c69e393 --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_server_test.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/collective_client.h" +#include "paddle/fluid/operators/distributed/collective_server.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace distributed = paddle::operators::distributed; + +std::unique_ptr StartServer( + const std::string& ep, int fan_in, framework::Scope* scope, + platform::DeviceContext* dev_ctx) { + distributed::CollectiveServer* server = + distributed::CollectiveServer::GetInstance(ep, fan_in); + + auto rpc_server = server->GetRPCServer(); + rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable, + scope, dev_ctx); + + std::cout << "StartServer return" << std::endl; + return std::unique_ptr(server); +} + +std::unique_ptr GenerateVars(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + framework::Scope* scope = new framework::Scope(); + framework::Variable* var = scope->Var("var1"); + auto* slr = var->GetMutable(); + slr->set_height(1000); + + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + + tensor->Resize(framework::make_ddim({3, 5})); + tensor->mutable_data(place); + + paddle::operators::math::set_constant(ctx, tensor, 32.7); + for (int i = 0; i < 3; ++i) rows->push_back(i); + + std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr); + + return std::unique_ptr(scope); +} + +void Gather(const std::vector& vars, + platform::DeviceContext* dev_ctx) { + distributed::CollectiveClient* client = + distributed::CollectiveClient::GetInstance(); + + framework::Scope* scope = new framework::Scope(); + framework::Variable* var = scope->Var("var1"); + var->GetMutable(); + + std::vector dst; + client->Gather(vars, &dst, *dev_ctx, scope); + std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); +} + +TEST(PREFETCH, GPU) { + platform::CUDAPlace place; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + std::string ep = "127.0.0.1:7164"; + auto scope = GenerateVars(place); + + auto* v1 = scope->FindVar("var1"); + std::cout << "var1:" << v1 << std::endl; + + auto server = StartServer(ep, 2, scope.get(), &ctx); + auto rpc_server = server->GetRPCServer(); + + distributed::RemoteVar var; + var.ep_ = ep; + var.var_name_ = "var1"; + var.trainer_id_ = 0; + + std::vector vars{var}; + Gather(vars, &ctx); + Gather(vars, &ctx); + + std::cout << "begin WaitVarBarrier" << std::endl; + rpc_server->WaitVarBarrier("var1"); + rpc_server->ClearRegisteredVars(); + server->Stop(); + + scope.release(); + server.release(); +} diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index d7f3ea86af..857214aa21 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -28,11 +28,11 @@ namespace paddle { namespace operators { namespace distributed { -void GRPCClient::InitImpl() { InitEventLoop(); } - -void GRPCClient::InitEventLoop() { +void GRPCClient::InitImpl() { // start the client process thread // TODO(wuyi): can make this in a threadpool + PADDLE_ENFORCE(client_thread_ == nullptr, + "please not re init proceed thread"); client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this))); } @@ -106,6 +106,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, void ProcGetResponse(const VarHandle& var_h, const ::grpc::ByteBuffer& ret_msg) { + VLOG(100) << "ProcGetResponse"; framework::Variable* outvar = nullptr; // get response's trainer_id is not used int trainer_id; @@ -126,6 +127,24 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, + "/sendrecv.SendRecvService/GetVariable", time_out); +} + +VarHandlePtr GRPCClient::AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, + "/sendrecv.SendRecvService/GetMonomerVariable", time_out); +} + +VarHandlePtr GRPCClient::_AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& rpc_path, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; @@ -136,7 +155,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, s, method, p_ctx, h, this] { + framework::AsyncIO([var_name_val, s, method, p_ctx, h, rpc_path, this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); @@ -151,8 +170,8 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, platform::RecordRPCEvent record_event(method, p_ctx); - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_); + auto call = + s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); call->StartCall(); call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); @@ -268,6 +287,34 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, return h; } +VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, + const std::string& var_name, + int64_t time_out) { + const auto ch = GetChannel(ep); + BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); + const std::string method = "SendMonomerFetchBarrierRPC"; + VarHandlePtr h( + new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); + s->Prepare(h, time_out); + + VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; + + sendrecv::VariableMessage req; + req.set_varname(var_name); + + platform::RecordRPCEvent record_event(method, nullptr); + + auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + + return h; +} + VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index a31a465645..01bf46cc31 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -189,6 +189,11 @@ class GRPCClient : public RPCClient { const std::string& var_name, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, @@ -200,8 +205,12 @@ class GRPCClient : public RPCClient { VarHandlePtr AsyncSendBatchBarrier( const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) override; + + VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncCheckpointNotify( const std::string& ep, const std::string& dir, @@ -214,21 +223,22 @@ class GRPCClient : public RPCClient { void SendComplete() override; - protected: void InitImpl() override; private: - // InitEventLoop should only be called by Init() - void InitEventLoop(); - void Proceed(); std::shared_ptr GetChannel(const std::string& ep); + VarHandlePtr _AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, const std::string& rpc, + int64_t time_out); private: grpc::CompletionQueue cq_; std::unordered_map> channels_; - std::unique_ptr client_thread_; + std::unique_ptr client_thread_{nullptr}; // mutex for Wait client sync std::mutex sync_mutex_; diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index d9200c98b2..c3974138f4 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -158,6 +158,98 @@ class RequestGet final : public RequestBase { ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; }; +class RequestGetMonomerVariable final : public RequestBase { + public: + explicit RequestGetMonomerVariable(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, + int req_id, RPCServer* rpc_server) + : RequestBase(service, cq, request_handler, req_id), + responder_(&ctx_), + rpc_server_(rpc_server) { + auto method_id = + static_cast(distributed::GrpcMethod::kGetMonomerVariable); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGetMonomerVariable() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. + std::string varname = request_.varname(); + + rpc_server_->WaitVarCond(varname); + MonomerHandle h = rpc_server_->GetMonomer(varname); + + auto scope = h.scope_; + auto invar = scope->FindVar(varname); + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, + request_.trainer_id()); + + if (outvar) { + SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_); + } + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + ::grpc::ByteBuffer reply_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; + RPCServer* rpc_server_{nullptr}; +}; + +class RequestGetMonomerBarrier final : public RequestBase { + public: + explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id, + RPCServer* rpc_server) + : RequestBase(service, cq, request_handler, req_id), + responder_(&ctx_), + rpc_server_(rpc_server) { + auto method_id = + static_cast(distributed::GrpcMethod::kGetMonomerBarrier); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGetMonomerBarrier() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. + std::string varname = request_.varname(); + VLOG(4) << "RequestGetMonomerBarrier " << varname; + + rpc_server_->WaitVarCond(varname); + MonomerHandle h = rpc_server_->GetMonomer(varname); + + framework::Scope* scope = nullptr; + framework::Variable* invar = nullptr; + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, + request_.trainer_id()); + + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VoidMessage reply_; + ServerAsyncResponseWriter responder_; + RPCServer* rpc_server_{nullptr}; +}; + class RequestPrefetch final : public RequestBase { public: explicit RequestPrefetch(GrpcService::AsyncService* service, @@ -249,7 +341,7 @@ class RequestCheckpointNotify final : public RequestBase { }; void AsyncGRPCServer::WaitServerReady() { - VLOG(4) << "AsyncGRPCServer is wait server ready"; + VLOG(4) << "AsyncGRPCServer is waiting server ready"; std::unique_lock lock(this->mutex_ready_); condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); VLOG(4) << "AsyncGRPCServer WaitSeverReady"; @@ -368,6 +460,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, b = new RequestSend(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestGet) { b = new RequestGet(&service_, cq.get(), handler, req_id); + } else if (rpc_name == kRequestGetMonomerVariable) { + b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id, + this); + } else if (rpc_name == kRequestGetMonomerBarrier) { + b = new RequestGetMonomerBarrier(&service_, cq.get(), handler, req_id, + this); } else if (rpc_name == kRequestPrefetch) { b = new RequestPrefetch(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestCheckpoint) { @@ -378,7 +476,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, reqs[req_id] = b; - VLOG(4) << "Create RequestSend status:" << b->Status(); + VLOG(4) << "TryToRegisterNewOne status:" << b->Status(); } void AsyncGRPCServer::HandleRequest( diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h index 9ae9a31a00..537429b5fe 100644 --- a/paddle/fluid/operators/distributed/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc_service.h @@ -81,10 +81,12 @@ enum class GrpcMethod { kGetVariable, kPrefetchVariable, kCheckpointNotify, + kGetMonomerVariable, + kGetMonomerBarrier, }; static const int kGrpcNumMethods = - static_cast(GrpcMethod::kCheckpointNotify) + 1; + static_cast(GrpcMethod::kGetMonomerBarrier) + 1; inline const char* GrpcMethodName(GrpcMethod id) { switch (id) { @@ -92,6 +94,10 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/SendVariable"; case GrpcMethod::kGetVariable: return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kGetMonomerVariable: + return "/sendrecv.SendRecvService/GetMonomerVariable"; + case GrpcMethod::kGetMonomerBarrier: + return "/sendrecv.SendRecvService/GetMonomerBarrier"; case GrpcMethod::kPrefetchVariable: return "/sendrecv.SendRecvService/PrefetchVariable"; case GrpcMethod::kCheckpointNotify: diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 5272afd428..62b24f150b 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -37,6 +37,8 @@ namespace distributed { constexpr char kRequestSend[] = "RequestSend"; constexpr char kRequestGet[] = "RequestGet"; +constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable"; +constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; constexpr char kRequestPrefetch[] = "RequestPrefetch"; constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 4cd3abb5a6..b668d86978 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -45,6 +45,11 @@ class RPCClient { const std::string& var_name, int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncPrefetchVar( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& in_var_name, @@ -57,6 +62,10 @@ class RPCClient { virtual VarHandlePtr AsyncSendFetchBarrier( const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncCheckpointNotify( const std::string& ep, const std::string& dir, int64_t time_out = FLAGS_rpc_deadline) = 0; @@ -87,8 +96,9 @@ class RPCClient { } } - protected: virtual void InitImpl() {} + + protected: // each trainer have exact one trainer id, it should be static static int trainer_id_; diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 3e30ed4ac8..122619d41b 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -132,6 +132,96 @@ void RPCServer::WaitCond(const std::string& rpc_name) { lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); } +void RPCServer::RegisterVar(const std::string& var_name, + const std::string& rpc_name, + framework::Scope* scope, + platform::DeviceContext* dev_ctx) { + MonomerHandle h; + h.var_name_ = var_name; + h.rpc_name_ = rpc_name; + h.scope_ = scope; + h.dev_ctx_ = dev_ctx; + + { + std::unique_lock lock(mutex_); + if (var_map_.find(var_name) != var_map_.end()) { + PADDLE_ENFORCE(false, "%s alreay in var_map", var_name); + } + var_map_[var_name] = h; + } + + rpc_cond_.notify_all(); + VLOG(4) << "RegisterVar context:" << h.String(); +} + +void RPCServer::IncreaseVarBarrier(const std::string& var_name) { + int b = 0; + MonomerHandle h; + { + std::unique_lock lock(mutex_); + b = ++var_map_[var_name].barrier_; + h = var_map_[var_name]; + } + + if (b >= client_num_) { + barrier_cond_.notify_all(); + } + + VLOG(4) << "IncreaseVarBarrier context:" << h.String(); +} + +void RPCServer::WaitVarBarrier(const std::string& var_name) { + VLOG(4) << "WaitBarrier var_name:" << var_name; + + std::unique_lock lock(mutex_); + barrier_cond_.wait(lock, [&]() { + return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) || + exit_flag_.load()); + }); + + VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String(); +} + +void RPCServer::SetVarCond(const std::string& var_name) { + VLOG(4) << "SetVarCond var_name:" << var_name; + { + std::unique_lock lock(mutex_); + if (var_map_.find(var_name) != var_map_.end()) { + rpc_cond_.notify_all(); + } + } +} + +void RPCServer::WaitVarCond(const std::string& var_name) { + VLOG(4) << "WaitVarCond var_name:" << var_name; + + std::unique_lock lock(mutex_); + rpc_cond_.wait(lock, [=] { + return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); + }); + + VLOG(4) << "WaitVarCond var_name:" << var_name << " end"; +} + +MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { + MonomerHandle h; + { + std::unique_lock lock(mutex_); + h = var_map_[var_name]; + } + + return h; +} + +void RPCServer::ClearRegisteredVars() { + std::unique_lock lock(mutex_); + var_map_.clear(); +} + +void RPCServer::ClearVar(const std::string& var_name) { + std::unique_lock lock(mutex_); + var_map_.erase(var_name); +} } // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index c78c5007a7..45d1d3479c 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -21,12 +21,30 @@ #include #include +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { namespace distributed { +struct MonomerHandle { + std::string var_name_; + std::string rpc_name_; + framework::Scope* scope_{nullptr}; + platform::DeviceContext* dev_ctx_{nullptr}; + int64_t barrier_{0}; + + std::string String() { + std::stringstream ss; + ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_ + << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_ + << ", barrier_:" << barrier_; + return ss.str(); + } +}; + class RPCServer { public: explicit RPCServer(const std::string& address, int client_num) @@ -67,6 +85,16 @@ class RPCServer { void WaitCond(const std::string& rpc_name); void IncreaseBatchBarrier(const std::string rpc_name); + void RegisterVar(const std::string& var_name, const std::string& rpc_name, + framework::Scope* scope, platform::DeviceContext* dev_ctx); + void IncreaseVarBarrier(const std::string& var_name); + void WaitVarBarrier(const std::string& var_name); + void SetVarCond(const std::string& var_name); + void WaitVarCond(const std::string& var_name); + void ClearRegisteredVars(); + void ClearVar(const std::string& var_name); + MonomerHandle GetMonomer(const std::string& var_name); + void Complete(); void ResetBarrierCounter(); @@ -95,6 +123,9 @@ class RPCServer { std::unordered_map rpc_call_map_; std::unordered_map rpc_thread_num_; friend class RequestHandler; + + // TODO(gongwb): use more cond to notify or wait; + std::unordered_map var_map_; }; }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in index 7b7d069f17..2637619f30 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -28,6 +28,9 @@ service SendRecvService { rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} + + rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {} + rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 31ed519666..9e99e44822 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ea07372a28..dca0c01ab2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -925,6 +925,18 @@ All parameter, weight, gradient are variables in Paddle. [](BuildStrategy &self, int num_trainers) { self.num_trainers_ = num_trainers; }) + .def_property( + "trainers_endpoints", + [](const BuildStrategy &self) { return self.trainers_endpoints_; }, + [](BuildStrategy &self, + const std::vector &trainers_endpoints) { + self.trainers_endpoints_ = trainers_endpoints; + }) + .def_property("trainer_id", + [](const BuildStrategy &self) { return self.trainer_id_; }, + [](BuildStrategy &self, int trainer_id) { + self.trainer_id_ = trainer_id; + }) .def_property( "fuse_elewise_add_act_ops", [](const BuildStrategy &self) { diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 9e6345f148..1511eea68c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1483,6 +1483,7 @@ class Program(object): self._is_chief = False self._slice_vars_and_attrs = [] self._endpoints = [] + self._trainers_endpoints = [] self._distributed_lookup_table = None @property diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index dc27a8eabb..c54c3963a1 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -135,9 +135,17 @@ class ParallelExecutor(object): build_strategy = BuildStrategy() build_strategy.num_trainers = num_trainers + build_strategy.trainer_id = trainer_id main = main_program main = main if main else framework.default_main_program() + + trainers_endpoints = main._trainers_endpoints + if num_trainers > 1 and trainers_endpoints: + assert num_trainers == len( + trainers_endpoints), "num_trainers == len(end_points)" + build_strategy.trainers_endpoints = trainers_endpoints + if scope == None: scope = executor.global_scope() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 3b898af706..d21ec42dcc 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -305,6 +305,7 @@ class DistributeTranspiler(object): if self.config.mode == "nccl2": assert (isinstance(trainers, str)) + self.origin_program._trainers_endpoints = trainers.split(",") self._transpile_nccl2( trainer_id, trainers, From c049fa7cf7a449e26dcd9b044f291ce57a9bd0f2 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 7 Dec 2018 22:24:44 +0800 Subject: [PATCH 180/719] Revert "Revert "Revert "Imperative""" --- paddle/fluid/CMakeLists.txt | 1 - paddle/fluid/framework/feed_fetch_method.cc | 9 - paddle/fluid/framework/feed_fetch_method.h | 2 - paddle/fluid/framework/ir/graph.cc | 5 +- paddle/fluid/imperative/CMakeLists.txt | 3 - paddle/fluid/imperative/engine.cc | 53 ----- paddle/fluid/imperative/engine.h | 39 ---- paddle/fluid/imperative/layer.cc | 221 ------------------ paddle/fluid/imperative/layer.h | 102 -------- paddle/fluid/imperative/tracer.cc | 19 -- paddle/fluid/imperative/tracer.h | 128 ---------- paddle/fluid/pybind/CMakeLists.txt | 5 +- paddle/fluid/pybind/imperative.cc | 36 --- paddle/fluid/pybind/imperative.h | 53 ----- paddle/fluid/pybind/pybind.cc | 39 ---- python/paddle/fluid/__init__.py | 2 - python/paddle/fluid/framework.py | 54 +---- python/paddle/fluid/imperative/__init__.py | 25 -- python/paddle/fluid/imperative/base.py | 56 ----- python/paddle/fluid/imperative/layers.py | 44 ---- python/paddle/fluid/layer_helper.py | 23 +- python/paddle/fluid/layers/nn.py | 3 +- .../fluid/tests/unittests/test_imperative.py | 52 ----- python/setup.py.in | 1 - tools/print_signatures.py | 4 - 25 files changed, 19 insertions(+), 960 deletions(-) delete mode 100644 paddle/fluid/imperative/CMakeLists.txt delete mode 100644 paddle/fluid/imperative/engine.cc delete mode 100644 paddle/fluid/imperative/engine.h delete mode 100644 paddle/fluid/imperative/layer.cc delete mode 100644 paddle/fluid/imperative/layer.h delete mode 100644 paddle/fluid/imperative/tracer.cc delete mode 100644 paddle/fluid/imperative/tracer.h delete mode 100644 paddle/fluid/pybind/imperative.cc delete mode 100644 paddle/fluid/pybind/imperative.h delete mode 100644 python/paddle/fluid/imperative/__init__.py delete mode 100644 python/paddle/fluid/imperative/base.py delete mode 100644 python/paddle/fluid/imperative/layers.py delete mode 100644 python/paddle/fluid/tests/unittests/test_imperative.py diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 595454e90b..6b526f0103 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -1,7 +1,6 @@ add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) -add_subdirectory(imperative) add_subdirectory(operators) add_subdirectory(string) add_subdirectory(recordio) diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 6338be75a4..3e9353f5cf 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -16,9 +16,7 @@ limitations under the License. */ #include #include #include "glog/logging.h" -#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { @@ -55,12 +53,5 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, return tensor; } -LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) { - Variable* var = scope.FindVar(var_name); - PADDLE_ENFORCE(var, "%s no in scope", var_name); - PADDLE_ENFORCE(var->IsType(), "Only support lod tensor now."); - return *var->GetMutable(); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index 031f8e01aa..7f504bfd23 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -27,7 +27,5 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, size_t index); -LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name); - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8679118fe2..fc91564bba 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -38,8 +38,9 @@ void CheckProgram(const ProgramDesc &program) { switch (role_id) { case _INT(OpRole::kForward): if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) << "Cannot add backward operator before forward operator " - << op->Type(); + LOG(ERROR) + << "Cannot add backward operator before forward operator %s." + << op->Type(); } break; case _INT(OpRole::kBackward): diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt deleted file mode 100644 index 373d292b44..0000000000 --- a/paddle/fluid/imperative/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -cc_library(layer SRCS layer.cc DEPS proto_desc operator) -cc_library(tracer SRCS tracer.cc DEPS proto_desc) -cc_library(engine SRCS engine.cc) diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc deleted file mode 100644 index de7ab0e591..0000000000 --- a/paddle/fluid/imperative/engine.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/imperative/engine.h" - -#include // NOLINT -#include - -#include "glog/logging.h" - -namespace paddle { -namespace imperative { - -static std::once_flag init_engine; -static Engine* engine; - -class DummyEngine : public Engine { - public: - void Enqueue(Runnable* runnable) override { - queued_runnables_.push_back(runnable); - } - - size_t Size() const override { return queued_runnables_.size(); } - - void Sync() override { - for (Runnable* l : queued_runnables_) { - LOG(INFO) << "running " << reinterpret_cast(l); - } - queued_runnables_.clear(); - } - - private: - std::vector queued_runnables_; -}; - -Engine* GetEngine() { - std::call_once(init_engine, []() { engine = new DummyEngine(); }); - return engine; -} - -} // namespace imperative -} // namespace paddle diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h deleted file mode 100644 index a1dfa5bda3..0000000000 --- a/paddle/fluid/imperative/engine.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -namespace paddle { -namespace imperative { - -struct Runnable {}; - -class Engine { - public: - virtual ~Engine() {} - - virtual void Enqueue(Runnable* runnable) = 0; - - virtual size_t Size() const = 0; - - virtual void Sync() = 0; -}; - -Engine* GetEngine(); - -} // namespace imperative -} // namespace paddle diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc deleted file mode 100644 index 6125037680..0000000000 --- a/paddle/fluid/imperative/layer.cc +++ /dev/null @@ -1,221 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/imperative/layer.h" -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/string/printf.h" - -namespace paddle { -namespace imperative { - -using framework::Variable; - -void AddTo(Variable* src, Variable* dst) { - framework::LoDTensor* dst_tensor = dst->GetMutable(); - framework::LoDTensor* src_tensor = src->GetMutable(); - PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", - dst_tensor->numel(), src_tensor->numel()); - float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); - const float* src_data = src_tensor->data(); - for (size_t i = 0; i < src_tensor->numel(); ++i) { - dst_data[i] += src_data[i]; - } -} - -class Autograd { - public: - explicit Autograd(framework::Scope* scope) : scope_(scope) {} - - void RunBackward(VarBase* var) { - PADDLE_ENFORCE(var->pre_op_->op_desc_); - // TODO(panyx0718): Only create for vars that "require_grad" - (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_; - - std::deque ready; - ready.push_back(var->pre_op_); - - std::map dep_counts = ComputeDepCounts(var->pre_op_); - - while (!ready.empty()) { - OpBase* ready_op = ready.front(); - ready.pop_front(); - std::vector input_grads = ready_op->ApplyGrad(scope_); - - for (size_t i = 0; i < input_grads.size(); ++i) { - if (!input_grads[i]) continue; - OpBase* pre_op = ready_op->pre_ops_->at(i); - if (!pre_op) continue; - - dep_counts[pre_op] -= 1; - PADDLE_ENFORCE(dep_counts[pre_op] >= 0); - bool pre_op_ready = dep_counts[pre_op] == 0; - if (pre_op_ready) { - ready.push_back(pre_op); - } - } - } - } - - private: - std::map ComputeDepCounts(OpBase* op) { - std::map ret; - - std::deque queue; - queue.push_back(op); - std::unordered_set visited; - visited.insert(op); - while (!queue.empty()) { - OpBase* candidate = queue.front(); - queue.pop_front(); - for (OpBase* pre_op : *(candidate->pre_ops_)) { - if (!pre_op) continue; - if (visited.find(pre_op) == visited.end()) { - visited.insert(pre_op); - queue.push_back(pre_op); - } - ret[pre_op] += 1; - } - } - - return ret; - } - - framework::Scope* scope_; -}; - -framework::Variable* CreateVariable(const std::string& name, - const framework::DDim& dim, float val, - framework::Scope* scope, - bool random_name = true) { - std::string varname = name; - if (random_name) { - std::mt19937 rng; - rng.seed(std::random_device()()); - std::uniform_int_distribution dist6( - 1, std::numeric_limits::max()); - int id = dist6(rng); - varname = string::Sprintf("%s@%d", varname, id); - } - - VLOG(3) << "creating var " << varname; - framework::Variable* var = scope->Var(varname); - framework::LoDTensor* tensor = var->GetMutable(); - - float* data = tensor->mutable_data(dim, platform::CPUPlace()); - std::fill(data, data + tensor->numel(), val); - return var; -} - -framework::LoDTensor& VarBase::Grad() { - VLOG(3) << "get var grad " << var_desc_->Name(); - return *grads_->GetMutable(); -} - -void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { - VLOG(3) << "apply var grad " << var_desc_->Name() << " " - << grad->Get().data()[0]; - if (!grads_) { - grads_ = - CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()), - var_->Get().dims(), 0.0, scope); - } - AddTo(grad, grads_); - VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " " - << grads_->Get().data()[0]; -} - -std::vector OpBase::ApplyGrad(framework::Scope* scope) { - VLOG(3) << "op grad " << grad_op_desc_->Type(); - - for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { - if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { - // grad op inputs can be forward inputs, so not in grad_to_var. - continue; - } - VLOG(3) << "op grad in var " << grad_invar; - block_->FindRecursiveOrCreateVar(grad_invar); - framework::Variable* var = scope->Var(grad_invar); - const std::string& invar = grad_to_var_->at(grad_invar); - for (VarBase* varbase : *output_vars_) { - // Use the accumulated grads_ by sharing the input with grads_. - if (varbase->var_desc_->Name() == invar) { - var->GetMutable()->ShareDataWith( - varbase->grads_->Get()); - break; - } - } - } - - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - VLOG(3) << "grad outvar " << outvar; - block_->FindRecursiveOrCreateVar(outvar); - framework::Variable* var = scope->Var(outvar); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block_->FindVar(outvar); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } - } - } - grad_op_desc_->InferShape(*block_); - grad_op_desc_->InferVarType(block_); - std::unique_ptr opbase = - framework::OpRegistry::CreateOp(*grad_op_desc_); - - opbase->Run(*scope, platform::CPUPlace()); - - // `ret` matches exactly with `input_vars_` of forward op. - std::vector ret; - for (size_t i = 0; i < input_vars_->size(); ++i) { - bool found = false; - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - Variable* var = scope->FindVar(outvar); - VarBase* origin_var = (*input_vars_)[i]; - std::string orig_var = grad_to_var_->at(outvar); - PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); - VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; - origin_var->ApplyGrad(scope, var); - found = true; - ret.push_back(var); - // TODO(panyx0718): There might be another outvar with the same name. - // In that case, it doesn't matter the first one or the second one is - // used. - break; - } - if (!found) { - ret.push_back(nullptr); - } - } - return ret; -} - -void VarBase::RunBackward(framework::Scope* scope) { - grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()), - var_->Get().dims(), 1.0, scope, - false); - if (!pre_op_) return; - Autograd(scope).RunBackward(this); -} - -} // namespace imperative -} // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h deleted file mode 100644 index 85a71ca83d..0000000000 --- a/paddle/fluid/imperative/layer.h +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_desc.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace imperative { - -class OpBase; - -class VarBase { - public: - VarBase() - : pre_op_(nullptr), - pre_op_out_idx_(-1), - var_desc_(nullptr), - var_(nullptr), - grads_(nullptr) {} - - virtual ~VarBase() {} - - void ApplyGrad(framework::Scope* scope, framework::Variable* grad); - - void RunBackward(framework::Scope* scope); - - framework::LoDTensor& Grad(); - - OpBase* pre_op_; - int pre_op_out_idx_; - - framework::VarDesc* var_desc_; - framework::Variable* var_; - framework::Variable* grads_; -}; - -class OpBase { - public: - OpBase() - : input_vars_(new std::vector()), - output_vars_(new std::vector()), - pre_ops_(new std::vector()), - pre_ops_out_idx_(new std::vector()), - op_desc_(nullptr), - grad_op_desc_(nullptr) {} - - virtual ~OpBase() { - delete input_vars_; - delete output_vars_; - - delete pre_ops_; - delete pre_ops_out_idx_; - - if (grad_op_desc_) delete grad_op_desc_; - if (grad_to_var_) delete grad_to_var_; - } - - std::vector ApplyGrad(framework::Scope* scope); - - std::vector* input_vars_; - std::vector* output_vars_; - std::vector* pre_ops_; - std::vector* pre_ops_out_idx_; - framework::OpDesc* op_desc_; - - framework::OpDesc* grad_op_desc_; - std::unordered_map* grad_to_var_; - framework::BlockDesc* block_; -}; - -class Layer { - public: - virtual ~Layer() {} - - virtual std::vector Forward(const std::vector& inputs) { - std::vector vars; - return vars; - } - - virtual void Backward() { LOG(ERROR) << "To support customize"; } -}; - -} // namespace imperative -} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc deleted file mode 100644 index f64f9e72c4..0000000000 --- a/paddle/fluid/imperative/tracer.cc +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/imperative/tracer.h" - -namespace paddle { -namespace imperative {} // namespace imperative -} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h deleted file mode 100644 index 433d07c0e5..0000000000 --- a/paddle/fluid/imperative/tracer.h +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/imperative/engine.h" -#include "paddle/fluid/imperative/layer.h" - -namespace paddle { -namespace imperative { - -void CreateGradOp(const framework::OpDesc& op_desc, - const std::unordered_set& no_grad_set, - const std::vector& grad_sub_block, - framework::OpDesc** grad_op_desc, - std::unordered_map* grad_to_var) { - std::vector> grad_op_descs = - framework::OpInfoMap::Instance() - .Get(op_desc.Type()) - .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); - PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now."); - // TODO(panyx0718): Leak? - *grad_op_desc = grad_op_descs[0].release(); -} - -class Tracer { - public: - explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { - root_scope_ = new framework::Scope(); - scopes_[root_block_] = root_scope_; - } - - virtual ~Tracer() { delete root_scope_; } - - void Trace(OpBase* op, const std::vector& inputs, - const std::vector& outputs, - framework::BlockDesc* block) { - framework::Scope* scope = GetScope(block); - framework::OpDesc* op_desc = op->op_desc_; - VLOG(3) << "tracer tracing " << op_desc->Type(); - op_desc->InferShape(*block); - op_desc->InferVarType(block); - std::unique_ptr op_base = - framework::OpRegistry::CreateOp(*op_desc); - - *op->input_vars_ = inputs; - for (VarBase* input : inputs) { - const std::string vname = input->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - input->var_ = var; - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } - } - if (input->pre_op_) { - op->pre_ops_->push_back(input->pre_op_); - op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_); - } else { - op->pre_ops_->push_back(nullptr); - } - } - - *op->output_vars_ = outputs; - for (size_t i = 0; i < outputs.size(); ++i) { - const std::string vname = outputs[i]->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } - } - outputs[i]->var_ = var; - outputs[i]->pre_op_ = op; - outputs[i]->pre_op_out_idx_ = i; - } - op_base->Run(*scope, platform::CPUPlace()); - framework::OpDesc* grad_op_desc; - auto grad_to_var = new std::unordered_map(); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); - op->grad_op_desc_ = grad_op_desc; - op->grad_to_var_ = grad_to_var; - op->block_ = block; - } - - framework::Scope* GetScope(framework::BlockDesc* block) { - if (scopes_.find(block) != scopes_.end()) { - return scopes_.at(block); - } - framework::BlockDesc* parent_block = block->ParentBlock(); - PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); - framework::Scope* scope = &scopes_[parent_block]->NewScope(); - scopes_[block] = scope; - return scope; - } - - private: - std::map scopes_; - framework::BlockDesc* root_block_; - framework::Scope* root_scope_; -}; - -} // namespace imperative -} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b8954cb126..d602613fc8 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,7 +1,6 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc) - +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc deleted file mode 100644 index 34e9c897d9..0000000000 --- a/paddle/fluid/pybind/imperative.cc +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/pybind/imperative.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/imperative/tracer.h" - -namespace paddle { -namespace pybind { - -// Bind Methods -void BindTracer(pybind11::module *m) { - pybind11::class_(*m, "Tracer", "") - .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block) { - new (&self) imperative::Tracer(root_block); - }) - .def("trace", &imperative::Tracer::Trace) - .def("get_scope", &imperative::Tracer::GetScope, - pybind11::return_value_policy::reference); -} - -} // namespace pybind -} // namespace paddle diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h deleted file mode 100644 index 7a9d3a01ea..0000000000 --- a/paddle/fluid/pybind/imperative.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include -#include "paddle/fluid/imperative/layer.h" -#include "pybind11/pybind11.h" -#include "pybind11/stl.h" - -namespace paddle { -namespace pybind { - -class PyLayer : public imperative::Layer { - public: - using imperative::Layer::Layer; // Inherit constructors - - std::vector Forward( - const std::vector& inputs) override { - PYBIND11_OVERLOAD(std::vector, Layer, Forward, - inputs); // NOLINT - } - - void Backward() override { - PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT - } -}; - -class PyOpBase : public imperative::OpBase { - public: - using imperative::OpBase::OpBase; // Inherit constructors -}; - -class PyVarBase : public imperative::VarBase { - public: - using imperative::VarBase::VarBase; // Inherit constructors -}; - -void BindTracer(pybind11::module* m); - -} // namespace pybind -} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index dca0c01ab2..58ef3da0b2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -34,7 +34,6 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" -#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" @@ -46,7 +45,6 @@ limitations under the License. */ #include "paddle/fluid/pybind/async_executor_py.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/exception.h" -#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/recordio.h" @@ -102,42 +100,6 @@ PYBIND11_MODULE(core, m) { BindException(&m); - py::class_(m, "VarBase", R"DOC()DOC") - .def(py::init<>()) - .def("_run_backward", - [](imperative::VarBase &self, framework::Scope *scope) { - self.RunBackward(scope); - }) - .def("_grad", &imperative::VarBase::Grad) - .def_property( - "desc", - [](const imperative::VarBase &self) { return self.var_desc_; }, - [](imperative::VarBase &self, framework::VarDesc *var_desc) { - self.var_desc_ = var_desc; - }, - py::return_value_policy::reference); - - py::class_(m, "OpBase", R"DOC()DOC") - .def(py::init<>()) - .def_property( - "desc", [](const imperative::OpBase &self) { return self.op_desc_; }, - [](imperative::OpBase &self, framework::OpDesc *op_desc) { - if (op_desc) { - self.op_desc_ = op_desc; - } - }, - py::return_value_policy::reference); - - py::class_ layer(m, "Layer"); - layer.def(py::init<>()) - .def("forward", - [](imperative::Layer &self, - const std::vector &inputs) { - return self.Forward(inputs); - }) - .def("backward", &imperative::Layer::Backward); - BindTracer(&m); - py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) @@ -639,7 +601,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); - m.def("get_variable_tensor", framework::GetVariableTensor); m.def("_is_program_version_supported", IsProgramVersionSupported); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 52417a1eaf..2a53519188 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -34,7 +34,6 @@ from . import io from . import evaluator from . import initializer from . import layers -from . import imperative from . import contrib from . import nets from . import optimizer @@ -68,7 +67,6 @@ __all__ = framework.__all__ + executor.__all__ + \ 'initializer', 'layers', 'contrib', - 'imperative', 'transpiler', 'nets', 'optimizer', diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 1511eea68c..a40826168d 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -18,7 +18,6 @@ import collections import contextlib import re import six -import sys import numpy as np @@ -50,16 +49,6 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() -_imperative_tracer_ = None - - -def _in_imperative_mode(): - return _imperative_tracer_ is not None - - -def _imperative_tracer(): - return _imperative_tracer_ - class NameScope(object): def __init__(self, name="", parent=None): @@ -213,7 +202,7 @@ def _debug_string_(proto, throw_on_error=True): return proto.__str__() -class Variable(core.VarBase): +class Variable(object): """ In Fluid, every input and output of an operator is a variable. In most cases, variables are used for holding different kinds of data or training @@ -277,7 +266,6 @@ class Variable(core.VarBase): stop_gradient=False, is_data=False, **kwargs): - core.VarBase.__init__(self) self.block = block self.error_clip = error_clip @@ -358,18 +346,6 @@ class Variable(core.VarBase): self.stop_gradient = stop_gradient self.is_data = is_data - def _numpy(self): - scope = _imperative_tracer().get_scope(self.block.desc) - tensor = core.get_variable_tensor(scope, self.desc.name()) - return np.array(tensor) - - def _backward(self): - scope = _imperative_tracer().get_scope(self.block.desc) - self._run_backward(scope) - - def _gradient(self): - return np.array(self._grad()) - def __str__(self): return self.to_string(True) @@ -516,7 +492,7 @@ class OpProtoHolder(object): } -class Operator(core.OpBase): +class Operator(object): """ In Fluid, all the operation are represented by Operator, and Operator is regarded as a build in an instruction of a Block. Users can use the @@ -572,7 +548,6 @@ class Operator(core.OpBase): inputs=None, outputs=None, attrs=None): - core.OpBase.__init__(self) self.block = block self.desc = desc # note: not add self.attrs here: @@ -612,7 +587,6 @@ class Operator(core.OpBase): return True return False - self.inputs = [] if inputs is not None: for in_proto in proto.inputs: found = find_name(inputs, in_proto.name) @@ -639,13 +613,6 @@ class Operator(core.OpBase): else: self.desc.set_input(in_proto.name, []) - for inp in inputs.values(): - if isinstance(inp, Variable): - self.inputs.append(inp) - elif isinstance(inp, list) or isinstance(inp, tuple): - self.inputs.extend(inp[:]) - - self.outputs = [] if outputs is not None: given = set() need = set() @@ -674,12 +641,6 @@ class Operator(core.OpBase): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) - for out in outputs.values(): - if isinstance(out, Variable): - self.outputs.append(out) - elif isinstance(out, list) or isinstance(out, tuple): - self.outputs.extend(out[:]) - if op_attrs is not None: if not isinstance(op_attrs, dict): raise TypeError("'attrs' should be a dict.") @@ -1245,8 +1206,6 @@ class Block(object): """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) - if _in_imperative_mode(): - _imperative_tracer().trace(op, op.inputs, op.outputs, self.desc) self.ops.append(op) return op @@ -2251,12 +2210,3 @@ def _get_var(name, program=None): assert isinstance(program, Program) return program.global_block().var(name) - - -@contextlib.contextmanager -def _imperative_guard(tracer): - global _imperative_tracer_ - tmp_trace = _imperative_tracer_ - _imperative_tracer_ = tracer - yield - _imperative_tracer_ = tmp_trace diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py deleted file mode 100644 index 922308b6b1..0000000000 --- a/python/paddle/fluid/imperative/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -from . import base -from .base import * - -from . import layers -from .layers import * - -__all__ = [] -__all__ += layers.__all__ -__all__ += base.__all__ diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py deleted file mode 100644 index 15d38ddb56..0000000000 --- a/python/paddle/fluid/imperative/base.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import contextlib -import numpy as np - -from paddle.fluid import core -from paddle.fluid import framework - -__all__ = ['enabled', 'guard', 'to_variable'] - - -def enabled(): - return framework._in_imperative_mode() - - -@contextlib.contextmanager -def guard(): - train = framework.Program() - startup = framework.Program() - tracer = core.Tracer(train.current_block().desc) - with framework.program_guard(train, startup): - with framework.unique_name.guard(): - with framework._imperative_guard(tracer): - yield - - -def to_variable(value, block=None): - if isinstance(value, np.ndarray): - if not block: - block = framework.default_main_program().current_block() - py_var = framework.Variable( - block, - type=core.VarDesc.VarType.LOD_TENSOR, - name=None, - shape=value.shape, - dtype=value.dtype) - scope = framework._imperative_tracer().get_scope(block.desc) - var = scope.var(py_var.name) - tensor = var.get_tensor() - tensor.set(value, core.CPUPlace()) - return py_var - elif isinstance(value, framework.Variable): - return value - else: - raise ValueError("Unsupported type %s" % type(value)) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py deleted file mode 100644 index 1a28f7f4ae..0000000000 --- a/python/paddle/fluid/imperative/layers.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import contextlib -import sys -import numpy as np - -from paddle.fluid import core -from paddle.fluid import framework -from paddle.fluid.imperative import base - -__all__ = ['PyLayer'] - - -class PyLayer(core.Layer): - def __init__(self): - pass - - def __call__(self, inputs): - # TODO(panyx0718): Support declarative mode as well. - assert base.enabled() - if not isinstance(inputs, list) and not isinstance(inputs, tuple): - inputs = [inputs] - - var_inputs = [] - for x in inputs: - py_var = base.to_variable(x) - var_inputs.append(py_var) - outputs = self.forward(var_inputs) - return outputs - - def forward(self, inputs): - return [] diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 74b4a977db..dc317de9ab 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -17,13 +17,10 @@ from __future__ import print_function import copy import itertools import six -import sys -import numpy as np from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from . import unique_name from paddle.fluid.initializer import Constant, Xavier -from paddle.fluid.imperative import base from .param_attr import ParamAttr, WeightNormParamAttr from . import core from six.moves import zip @@ -49,21 +46,23 @@ class LayerHelper(object): def startup_program(self): return default_startup_program() - def to_variable(self, x): - return base.to_variable(x, self.main_program.current_block()) - def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) def multiple_input(self, input_param_name='input'): inputs = self.kwargs.get(input_param_name, []) - ret = [] - if isinstance(inputs, list) or isinstance(inputs, tuple): - for inp in inputs: - ret.append(self.to_variable(inp)) + type_error = TypeError( + "Input of {0} layer should be Variable or sequence of Variable". + format(self.layer_type)) + if isinstance(inputs, Variable): + inputs = [inputs] + elif not isinstance(inputs, list) and not isinstance(inputs, tuple): + raise type_error else: - ret.append(self.to_variable(inputs)) - return ret + for each in inputs: + if not isinstance(each, Variable): + raise type_error + return inputs def input(self, input_param_name='input'): inputs = self.multiple_input(input_param_name) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index fac7538a6a..4833212d31 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6623,8 +6623,7 @@ def relu(x, name=None): helper = LayerHelper('relu', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out}) + helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out}) return out diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py deleted file mode 100644 index b5b6305155..0000000000 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import sys -import numpy as np - -import paddle.fluid as fluid -from paddle.fluid import core - - -class MyLayer(fluid.imperative.PyLayer): - def __init__(self): - super(MyLayer, self).__init__() - - def forward(self, inputs): - x = fluid.layers.relu(inputs[0]) - self._x_for_debug = x - return [fluid.layers.elementwise_mul(x, x)] - - -class TestImperative(unittest.TestCase): - def test_layer(self): - with fluid.imperative.guard(): - cl = core.Layer() - cl.forward([]) - l = fluid.imperative.PyLayer() - l.forward([]) - - def test_layer_in_out(self): - with fluid.imperative.guard(): - l = MyLayer() - x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] - self.assertIsNotNone(x) - sys.stderr.write("%s output: %s\n" % (x, x._numpy())) - x._backward() - sys.stderr.write("grad %s\n" % l._x_for_debug._gradient()) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 0eb69cdb5c..5aee26b638 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -101,7 +101,6 @@ packages=['paddle', 'paddle.dataset', 'paddle.reader', 'paddle.fluid', - 'paddle.fluid.imperative', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', diff --git a/tools/print_signatures.py b/tools/print_signatures.py index 7e61dde0a4..5c5266f904 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -27,8 +27,6 @@ import pydoc member_dict = collections.OrderedDict() -experimental_namespace = {"paddle.fluid.imperative"} - def visit_member(parent_name, member): cur_name = ".".join([parent_name, member.__name__]) @@ -53,8 +51,6 @@ def visit_member(parent_name, member): def visit_all_module(mod): - if (mod.__name__ in experimental_namespace): - return for member_name in ( name for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod)) From 2c6159a151d573ca697e2dfd591720cc854b4b9b Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 13:59:36 +0000 Subject: [PATCH 181/719] fix unittest fix cmake test=develop --- paddle/fluid/framework/CMakeLists.txt | 4 +- .../test_eager_deletion_dynamic_rnn_base.py | 86 +++++++++++++++++++ .../unittests/test_eager_deletion_gru_net.py | 2 +- .../unittests/test_eager_deletion_lstm_net.py | 67 +-------------- 4 files changed, 92 insertions(+), 67 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index f2361c5cea..b236eef3ce 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -171,9 +171,9 @@ if(WITH_DISTRIBUTE) set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() if(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper garbage_collector) else(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper garbage_collector) endif(NOT WIN32) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py new file mode 100644 index 0000000000..e91cfe0b45 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py @@ -0,0 +1,86 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0' +os.environ['CPU_NUM'] = '2' + +import six +import unittest + +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid + + +def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): + if use_cuda and not core.is_compiled_with_cuda(): + print('Skip use_cuda=True because Paddle is not compiled with cuda') + return + + word_dict = paddle.dataset.imdb.word_dict() + train_reader = paddle.batch( + paddle.dataset.imdb.train(word_dict), batch_size=batch_size) + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + cost = network(data, label, len(word_dict)) + optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) + optimizer.minimize(cost) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + reader = feeder.decorate_reader( + train_reader, multi_devices=use_parallel_executor) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if use_parallel_executor: + train_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=cost.name) + fetch_list = [cost.name] + else: + train_exe = exe + fetch_list = [cost] + + for pass_id in six.moves.xrange(pass_num): + batch_id = 0 + for data in reader(): + train_exe.run(feed=data, + fetch_list=fetch_list if batch_id % 4 == 0 else []) + batch_id += 1 + if batch_id > 16: + break + + +class TestBase(unittest.TestCase): + def setUp(self): + self.net = None + + def test_network(self): + if self.net is None: + return + + for use_cuda in [True, False]: + for use_parallel_executor in [False, True]: + print('network: {}, use_cuda: {}, use_parallel_executor: {}'. + format(self.net.__name__, use_cuda, + use_parallel_executor)) + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(core.Scope()): + train(self.net, use_cuda, use_parallel_executor) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py index 1ec174544c..5ed3d9fdf3 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py @@ -13,7 +13,7 @@ # limitations under the License. import unittest -from test_eager_deletion_lstm_net import TestBase +from test_eager_deletion_dynamic_rnn_base import TestBase import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py index 431765bff2..8462c06aa5 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py @@ -12,60 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0' -os.environ['CPU_NUM'] = '2' - -import six -import unittest - -import paddle -import paddle.fluid.core as core +from test_eager_deletion_dynamic_rnn_base import TestBase import paddle.fluid as fluid - - -def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): - if use_cuda and not core.is_compiled_with_cuda(): - print('Skip use_cuda=True because Paddle is not compiled with cuda') - return - - word_dict = paddle.dataset.imdb.word_dict() - train_reader = paddle.batch( - paddle.dataset.imdb.train(word_dict), batch_size=batch_size) - - data = fluid.layers.data( - name="words", shape=[1], dtype="int64", lod_level=1) - - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - - cost = network(data, label, len(word_dict)) - optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) - optimizer.minimize(cost) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - feeder = fluid.DataFeeder(feed_list=[data, label], place=place) - reader = feeder.decorate_reader( - train_reader, multi_devices=use_parallel_executor) - - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - if use_parallel_executor: - train_exe = fluid.ParallelExecutor( - use_cuda=use_cuda, loss_name=cost.name) - fetch_list = [cost.name] - else: - train_exe = exe - fetch_list = [cost] - - for pass_id in six.moves.xrange(pass_num): - batch_id = 0 - for data in reader(): - train_exe.run(feed=data, - fetch_list=fetch_list if batch_id % 4 == 0 else []) - batch_id += 1 - if batch_id > 16: - break +import unittest def lstm_net(data, @@ -92,20 +41,10 @@ def lstm_net(data, return avg_cost -class TestBase(unittest.TestCase): +class LSTMTest(TestBase): def setUp(self): self.net = lstm_net - def test_network(self): - for use_cuda in [True, False]: - for use_parallel_executor in [False, True]: - print('network: {}, use_cuda: {}, use_parallel_executor: {}'. - format(self.net.__name__, use_cuda, - use_parallel_executor)) - with fluid.program_guard(fluid.Program(), fluid.Program()): - with fluid.scope_guard(core.Scope()): - train(self.net, use_cuda, use_parallel_executor) - if __name__ == "__main__": unittest.main() From bfde5e10ce7a8ee1d95978033015eb6a5103d52c Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Fri, 7 Dec 2018 14:01:11 -0800 Subject: [PATCH 182/719] Move ngraph compile control to cmake test=develop --- paddle/fluid/framework/CMakeLists.txt | 24 ++++++++++++++--------- paddle/fluid/framework/executor.cc | 5 ++++- paddle/fluid/framework/ngraph_bridge.cc | 2 -- paddle/fluid/framework/ngraph_bridge.h | 3 --- paddle/fluid/framework/ngraph_operator.cc | 2 -- paddle/fluid/framework/ngraph_operator.h | 3 --- 6 files changed, 19 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index e4c471d86b..ce429fefa7 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -129,11 +129,13 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -if(NOT WIN32) -cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) -cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler) -endif(NOT WIN32) +if(WITH_NGRAPH) + if(NOT WIN32) + cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) + cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog + shape_inference data_transform lod_tensor profiler ngraph) + endif(NOT WIN32) +endif(WITH_NGRAPH) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) @@ -169,11 +171,15 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() - if(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) - else(NOT WIN32) + if(WITH_NGRAPH) + if(NOT WIN32) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper) + else(NOT WIN32) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) + endif(NOT WIN32) + else(WITH_NGRAPH) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) - endif(NOT WIN32) + endif(WITH_NGRAPH) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 73cec21e20..09f08e1f20 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/ngraph_operator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/transfer_scope_cache.h" @@ -26,6 +25,10 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_NGRAPH +#include "paddle/fluid/framework/ngraph_operator.h" +#endif + DECLARE_bool(benchmark); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); DEFINE_bool(use_ngraph, false, "Use NGRAPH to run"); diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index e22c290377..907c95bddd 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_NGRAPH #include #include #include @@ -118,4 +117,3 @@ void NgraphBridge::BuildNgNode(const std::shared_ptr& op) { } // namespace framework } // namespace paddle -#endif diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h index 9ed6b95109..5ad7b8daeb 100644 --- a/paddle/fluid/framework/ngraph_bridge.h +++ b/paddle/fluid/framework/ngraph_bridge.h @@ -14,8 +14,6 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_NGRAPH - #include #include #include @@ -53,4 +51,3 @@ class NgraphBridge { } // namespace framework } // namespace paddle -#endif diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 3fea753f06..e3cc6ee994 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_NGRAPH #include #include @@ -548,4 +547,3 @@ void NgraphOperator::Run(const Scope& scope, } // NgraphOperator::RunImpl } // namespace framework } // namespace paddle -#endif diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h index 3ca023e111..85d015bd46 100644 --- a/paddle/fluid/framework/ngraph_operator.h +++ b/paddle/fluid/framework/ngraph_operator.h @@ -14,8 +14,6 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_NGRAPH - #include #include #include @@ -64,4 +62,3 @@ class FusedOperator : public OperatorBase { }; } // namespace framework } // namespace paddle -#endif From 22ac2133e4d302d9d843c9d1c2b2d554d83c1e2e Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Fri, 7 Dec 2018 14:22:15 -0800 Subject: [PATCH 183/719] Rename class test=develop --- paddle/fluid/framework/executor.cc | 8 ++-- paddle/fluid/framework/ngraph_operator.cc | 57 +++++++++++------------ paddle/fluid/framework/ngraph_operator.h | 6 +-- 3 files changed, 35 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 09f08e1f20..e97cf44c75 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -91,11 +91,11 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, static void EnableFusedOp(ExecutorPrepareContext* ctx) { #ifdef PADDLE_WITH_NGRAPH VLOG(3) << "use_ngraph=True"; - auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_); + auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_); for (auto& interval : intervals) { - auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_, - interval.at(0), interval.at(1)); - *interval[0] = std::unique_ptr(fused_op); + auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0), + interval.at(1)); + *interval[0] = std::unique_ptr(ng_op); } for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { ctx->ops_.erase(it->at(0) + 1, it->at(1)); diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index e3cc6ee994..253de4c611 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -57,16 +57,16 @@ typedef enum { /* nGraph support state on ops */ } op_state; // perform graph build through bridge and execute computation -class NgraphOperator { +class NgraphEngine { public: - explicit NgraphOperator(const Scope& scope, const platform::Place& place, - const std::vector>& ops, - const std::unordered_map< - std::string, ngraph::element::Type>& var_type_map, - const std::unordered_set& persist, - const std::unordered_set& fetches, - const std::unordered_set& post_op_inputs, - op_state ng_op_state) + explicit NgraphEngine(const Scope& scope, const platform::Place& place, + const std::vector>& ops, + const std::unordered_map< + std::string, ngraph::element::Type>& var_type_map, + const std::unordered_set& persist, + const std::unordered_set& fetches, + const std::unordered_set& post_op_inputs, + op_state ng_op_state) : scope_(scope), place_(place), fused_ops_(ops), @@ -131,7 +131,7 @@ class NgraphOperator { }; std::vector>::iterator>> -FusedOperator::FusedOpIntervals( +NgraphOperator::NgraphOpIntervals( std::vector>* ops) { std::vector>::iterator>> intervals; @@ -184,7 +184,7 @@ FusedOperator::FusedOpIntervals( return intervals; } -FusedOperator::FusedOperator( +NgraphOperator::NgraphOperator( const ProgramDesc& prog, size_t block_id, std::vector>::iterator start, std::vector>::iterator end, @@ -214,7 +214,7 @@ FusedOperator::FusedOperator( Process(); } -void FusedOperator::Process() { +void NgraphOperator::Process() { auto& bdesc = pdesc_.Block(block_); for (auto& var : bdesc.AllVars()) { if (!(var->GetType() == proto::VarType::SELECTED_ROWS || @@ -250,8 +250,8 @@ void FusedOperator::Process() { } } -void FusedOperator::RunImpl(const Scope& scope, - const platform::Place& place) const { +void NgraphOperator::RunImpl(const Scope& scope, + const platform::Place& place) const { op_state ng_op_state = PARTIAL_TEST; auto& bdesc = pdesc_.Block(block_); for (auto* op : bdesc.AllOps()) { @@ -265,19 +265,19 @@ void FusedOperator::RunImpl(const Scope& scope, ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; } - NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_, - persistables_, fetches_, post_op_inputs_, - ng_op_state); - ngraph_op.Run(scope, place); + NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_, + persistables_, fetches_, post_op_inputs_, + ng_op_state); + ngraph_engine.Run(scope, place); } std::unordered_map> - NgraphOperator::func_cache_ = {}; + NgraphEngine::func_cache_ = {}; -std::shared_ptr NgraphOperator::backend_ = +std::shared_ptr NgraphEngine::backend_ = ngraph::runtime::Backend::create("CPU"); -void NgraphOperator::GetNgInputShape(std::shared_ptr op) { +void NgraphEngine::GetNgInputShape(std::shared_ptr op) { op->RuntimeInferShape(scope_, place_); for (auto& var_name_item : op->Inputs()) { for (auto& var_name : var_name_item.second) { @@ -300,7 +300,7 @@ void NgraphOperator::GetNgInputShape(std::shared_ptr op) { } } -void NgraphOperator::BuildNgNodes() { +void NgraphEngine::BuildNgNodes() { for (auto& var_name : var_out_) { if (var_node_map_->find(var_name) == var_node_map_->end()) { auto* var = scope_.FindVar(var_name); @@ -322,7 +322,7 @@ void NgraphOperator::BuildNgNodes() { } } -void NgraphOperator::BuildNgIO() { +void NgraphEngine::BuildNgIO() { std::unordered_set inputs; std::unordered_set outputs; @@ -394,7 +394,7 @@ void NgraphOperator::BuildNgIO() { } } -void NgraphOperator::BuildNgFunction() { +void NgraphEngine::BuildNgFunction() { BuildNgNodes(); ngraph_function_ = nullptr; ngraph::NodeVector func_outputs; @@ -415,7 +415,7 @@ void NgraphOperator::BuildNgFunction() { std::make_shared(func_outputs, func_inputs); } -std::shared_ptr NgraphOperator::GetCacheKey() { +std::shared_ptr NgraphEngine::GetCacheKey() { auto cache_key = std::make_shared(""); *cache_key += std::to_string(fused_ops_.size()); for (auto& op : fused_ops_) { @@ -443,7 +443,7 @@ std::shared_ptr NgraphOperator::GetCacheKey() { return cache_key; } -void NgraphOperator::GetNgFunction() { +void NgraphEngine::GetNgFunction() { bool cache_on = true; if (cache_on) { std::string cache_key_val = *GetCacheKey(); @@ -458,8 +458,7 @@ void NgraphOperator::GetNgFunction() { } } -void NgraphOperator::Run(const Scope& scope, - const platform::Place& place) const { +void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { std::vector> t_in; std::vector> t_out; @@ -544,6 +543,6 @@ void NgraphOperator::Run(const Scope& scope, } backend_->call(ngraph_function_, t_out, t_in); -} // NgraphOperator::RunImpl +} // NgraphEngine::RunImpl } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h index 85d015bd46..ede80f44be 100644 --- a/paddle/fluid/framework/ngraph_operator.h +++ b/paddle/fluid/framework/ngraph_operator.h @@ -32,14 +32,14 @@ limitations under the License. */ namespace paddle { namespace framework { -class FusedOperator : public OperatorBase { +class NgraphOperator : public OperatorBase { public: static std::vector< std::vector>::iterator>> - FusedOpIntervals( + NgraphOpIntervals( std::vector>* ops); - explicit FusedOperator( + explicit NgraphOperator( const ProgramDesc& prog, size_t block_id, std::vector>::iterator start, std::vector>::iterator end, From fddbd87c0a68861dc28ed691a2044c86a08af6fa Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Fri, 7 Dec 2018 14:25:04 -0800 Subject: [PATCH 184/719] Rename argument test=develop --- paddle/fluid/framework/ngraph_bridge.cc | 27 +++++++++++++------------ 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index 907c95bddd..a5acfd7044 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -26,14 +26,15 @@ namespace paddle { namespace framework { static std::shared_ptr GetNode( - const std::shared_ptr& op, const std::string prm, + const std::shared_ptr& op, const std::string name, const VariableNameMap& var_map, std::shared_ptr< std::unordered_map>> ngb_node_map) { - auto& var_names = var_map.at(prm); + auto& var_names = var_map.at(name); PADDLE_ENFORCE_EQ(var_names.size(), 1, - "op %s prm %s expects one associated var", op->Type(), prm); + "op %s name %s expects one associated var", op->Type(), + name); if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) { return (*ngb_node_map)[var_names[0]]; } else { @@ -42,42 +43,42 @@ static std::shared_ptr GetNode( } static std::shared_ptr GetInputNode( - const std::shared_ptr& op, const std::string prm, + const std::shared_ptr& op, const std::string name, std::shared_ptr< std::unordered_map>> ngb_node_map) { - return GetNode(op, prm, op->Inputs(), ngb_node_map); + return GetNode(op, name, op->Inputs(), ngb_node_map); } static std::shared_ptr GetOutputNode( - const std::shared_ptr& op, const std::string prm, + const std::shared_ptr& op, const std::string name, std::shared_ptr< std::unordered_map>> ngb_node_map) { - return GetNode(op, prm, op->Outputs(), ngb_node_map); + return GetNode(op, name, op->Outputs(), ngb_node_map); } static void SetOutputNode( - const std::shared_ptr& op, const std::string prm, + const std::shared_ptr& op, const std::string name, std::shared_ptr node, std::shared_ptr< std::unordered_map>> ngb_node_map) { - auto& var_names = op->Outputs().at(prm); + auto& var_names = op->Outputs().at(name); if (var_names.size() == 1) { (*ngb_node_map)[var_names[0]] = node; } else if (var_names.size() == 0) { (*ngb_node_map)[""] = node; } else { - PADDLE_THROW("prm %s has more than 1 var_names.", prm); + PADDLE_THROW("name %s has more than 1 var_names.", name); } } static bool HasOutput(const std::shared_ptr& op, - const std::string prm) { + const std::string name) { auto& outputs = op->Outputs(); - if (outputs.find(prm) == outputs.end()) return false; - return outputs.at(prm).size() > 0; + if (outputs.find(name) == outputs.end()) return false; + return outputs.at(name).size() > 0; } template From 943ad4781f5c96573e9615668c1cc73dcb378356 Mon Sep 17 00:00:00 2001 From: bingyanghuang <33643817+bingyanghuang@users.noreply.github.com> Date: Sat, 8 Dec 2018 10:54:33 +0800 Subject: [PATCH 185/719] One possible solution to add flexibility for mkldnn placement pass (#14768) * Choose to turn on use_mkldnn attribute v1 * Fix mkldnn_op empty bug * format change test=develop * fix ci test=develop * fix ci test and add test in dam test=develop * add example to dam compare test test=develop * review changes test=develop --- paddle/fluid/framework/ir/mkldnn_placement_pass.cc | 14 +++++++++++--- paddle/fluid/inference/analysis/argument.h | 4 ++++ paddle/fluid/inference/analysis/ir_pass_manager.cc | 5 +++++ paddle/fluid/inference/api/analysis_config.cc | 8 ++++++++ paddle/fluid/inference/api/analysis_predictor.cc | 4 ++++ .../fluid/inference/api/paddle_analysis_config.h | 5 +++++ .../inference/tests/api/analyzer_dam_tester.cc | 4 ++++ 7 files changed, 41 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc index 1cf1315d3d..9a9314161b 100644 --- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/mkldnn_placement_pass.h" +#include namespace paddle { namespace framework { @@ -21,9 +22,16 @@ namespace ir { std::unique_ptr MKLDNNPlacementPass::ApplyImpl( std::unique_ptr graph) const { VLOG(3) << "Aplies MKL-DNN placement strategy."; + const auto& op_types_list = + Get>("mkldnn_enabled_op_types"); for (const Node* n : graph->Nodes()) { if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) { - n->Op()->SetAttr("use_mkldnn", true); + if (op_types_list.empty()) { + n->Op()->SetAttr("use_mkldnn", true); + } else if (std::find(op_types_list.begin(), op_types_list.end(), + n->Name()) != op_types_list.end()) { + n->Op()->SetAttr("use_mkldnn", true); + } } } return graph; @@ -33,5 +41,5 @@ std::unique_ptr MKLDNNPlacementPass::ApplyImpl( } // namespace framework } // namespace paddle -REGISTER_PASS(mkldnn_placement_pass, - paddle::framework::ir::MKLDNNPlacementPass); +REGISTER_PASS(mkldnn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass) + .RequirePassAttr("mkldnn_enabled_op_types"); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 53cc7039f2..83d411eecf 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -116,6 +116,10 @@ struct Argument { DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses, std::vector); + // Pass a set of op types to enable its mkldnn kernel + DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, + std::unordered_set); + DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index fce5e1cac9..51bca8039d 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -63,6 +63,11 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); pass_num++; } + if (pass_name == "mkldnn_placement_pass") { + pass->Set("mkldnn_enabled_op_types", + new std::unordered_set( + argument->mkldnn_enabled_op_types())); + } if (pass_name == "tensorrt_subgraph_pass") { PADDLE_ENFORCE(argument->tensorrt_node_teller_valid()); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 384d1dc27d..dcefdd92f5 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -49,6 +49,10 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; + // For mkldnn + use_mkldnn_ = other.use_mkldnn_; + mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; + use_feed_fetch_ops = other.use_feed_fetch_ops; use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; @@ -77,6 +81,10 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; + // For mkldnn + use_mkldnn_ = other.use_mkldnn_; + mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; + use_feed_fetch_ops = other.use_feed_fetch_ops; use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 84f7eca057..be51e7fc1f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -327,6 +327,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); } + if (config_.use_mkldnn_) { + argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); + } + auto passes = config_.pass_builder()->AllPasses(); if (!config_.enable_ir_optim) passes.clear(); argument_.SetIrAnalysisPasses(passes); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index a08e3d027e..f05b9832da 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -16,6 +16,7 @@ #include #include #include +#include #include // Here we include some header files with relative paths, for that in deploy, @@ -53,6 +54,9 @@ struct AnalysisConfig : public NativeConfig { void EnableMKLDNN(); bool use_mkldnn() const { return use_mkldnn_; } + void SetMKLDNNOp(std::unordered_set op_list) { + mkldnn_enabled_op_types_ = op_list; + } // Specify the memory buffer of program and parameter void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size, @@ -64,6 +68,7 @@ struct AnalysisConfig : public NativeConfig { protected: bool use_tensorrt_{false}; bool use_mkldnn_{false}; + std::unordered_set mkldnn_enabled_op_types_; int tensorrt_workspace_size_; int tensorrt_max_batchsize_; std::unique_ptr pass_builder_; diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index e8abcfce05..227e2ff458 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -194,6 +194,8 @@ void profile(bool use_mkldnn = false) { if (use_mkldnn) { cfg.EnableMKLDNN(); + std::unordered_set op_list = {"conv3d"}; + cfg.SetMKLDNNOp(op_list); } std::vector outputs; @@ -236,6 +238,8 @@ void compare(bool use_mkldnn = false) { SetConfig(&cfg); if (use_mkldnn) { cfg.EnableMKLDNN(); + std::unordered_set op_list = {"conv3d"}; + cfg.SetMKLDNNOp(op_list); } std::vector> input_slots_all; From bb2e7f0bbed1cfcf47b5b8e90bc9e35b46c13b50 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Sat, 8 Dec 2018 12:31:33 +0800 Subject: [PATCH 186/719] add scope in prefetch --- paddle/fluid/operators/distributed/parameter_prefetch.cc | 8 ++++---- paddle/fluid/operators/distributed/parameter_prefetch.h | 3 ++- paddle/fluid/operators/nce_op.h | 9 +++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 67b56bd218..f6a2d5bbe5 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -104,7 +104,7 @@ static void MergeMultipleVarsIntoOneBySection( const std::vector>& splited_ids, const framework::ExecutionContext& context, const framework::Scope& actual_scope, framework::Scope* scope, - platform::DeviceContext* actual_ctx, ) { + platform::DeviceContext* actual_ctx) { PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), ""); auto cpu_place = platform::CPUPlace(); @@ -175,7 +175,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope) { - auto& local_scope = scope.NewScope(); + auto& local_scope = context.scope().NewScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -192,7 +192,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, out_var_names.push_back(out_name + "@" + epmap[i]); } - auto& id_tensor = local_scope.FindVar(id_name)->Get(); + auto& id_tensor = scope.FindVar(id_name)->Get(); std::vector ids_vector; if (platform::is_cpu_place(id_tensor.place())) { auto* id_data = id_tensor.data(); @@ -248,7 +248,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, context, scope, &local_scope, &actual_ctx); - scope.DeleteScope(&local_scope); + context.scope().DeleteScope(&local_scope); } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 53b0fbfb51..53482c4c40 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -27,7 +27,8 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context); + const framework::ExecutionContext& context, + const framework::Scope& scope); }; // namespace distributed }; // namespace operators diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 9789e30388..2e51c67401 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -180,7 +180,7 @@ class NCEKernel : public framework::OpKernel { labels.size() * sizeof(int64_t)); local_scope.Var("Weight@Local") - ->GetMutable() + ->GetMutable() ->mutable_data(context.GetPlace()); #ifdef PADDLE_WITH_DISTRIBUTE @@ -194,7 +194,7 @@ class NCEKernel : public framework::OpKernel { #endif auto weight_mat = EigenMatrix::From( - (local_scope.Var("Weight@Local")->Get())); + (local_scope.Var("Weight@Local")->Get())); for (int64_t i = 0; i < sample_labels->numel(); ++i) { std::vector::iterator it = std::find(labels.begin(), labels.end(), sample_labels_data[i]); @@ -208,8 +208,9 @@ class NCEKernel : public framework::OpKernel { sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); } - context.scope().DeleteScope(&local_scope); - + if (context.scope().HasKid(&local_scope)) { + context.scope().DeleteScope(&local_scope); + } } else { auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); From 748549b2e31c510d462ebe1a5421e139269c1a3c Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 9 Dec 2018 12:31:54 +0800 Subject: [PATCH 187/719] Revert "Merge pull request #14798 from PaddlePaddle/revert-14786-revert-14782-revert-14398-imperative" This reverts commit b1d3a1c8b41fdb4cfcb58ec2d4fb938b09dac057, reversing changes made to f1fb64b17fb0290e7e1f110069de19b0ea0d0474. --- paddle/fluid/CMakeLists.txt | 1 + paddle/fluid/framework/feed_fetch_method.cc | 9 + paddle/fluid/framework/feed_fetch_method.h | 2 + paddle/fluid/framework/ir/graph.cc | 5 +- paddle/fluid/imperative/CMakeLists.txt | 3 + paddle/fluid/imperative/engine.cc | 53 +++++ paddle/fluid/imperative/engine.h | 39 ++++ paddle/fluid/imperative/layer.cc | 221 ++++++++++++++++++ paddle/fluid/imperative/layer.h | 102 ++++++++ paddle/fluid/imperative/tracer.cc | 19 ++ paddle/fluid/imperative/tracer.h | 128 ++++++++++ paddle/fluid/pybind/CMakeLists.txt | 5 +- paddle/fluid/pybind/imperative.cc | 36 +++ paddle/fluid/pybind/imperative.h | 53 +++++ paddle/fluid/pybind/pybind.cc | 39 ++++ python/paddle/fluid/__init__.py | 2 + python/paddle/fluid/framework.py | 54 ++++- python/paddle/fluid/imperative/__init__.py | 25 ++ python/paddle/fluid/imperative/base.py | 56 +++++ python/paddle/fluid/imperative/layers.py | 44 ++++ python/paddle/fluid/layer_helper.py | 23 +- python/paddle/fluid/layers/nn.py | 3 +- .../fluid/tests/unittests/test_imperative.py | 52 +++++ python/setup.py.in | 1 + tools/print_signatures.py | 4 + 25 files changed, 960 insertions(+), 19 deletions(-) create mode 100644 paddle/fluid/imperative/CMakeLists.txt create mode 100644 paddle/fluid/imperative/engine.cc create mode 100644 paddle/fluid/imperative/engine.h create mode 100644 paddle/fluid/imperative/layer.cc create mode 100644 paddle/fluid/imperative/layer.h create mode 100644 paddle/fluid/imperative/tracer.cc create mode 100644 paddle/fluid/imperative/tracer.h create mode 100644 paddle/fluid/pybind/imperative.cc create mode 100644 paddle/fluid/pybind/imperative.h create mode 100644 python/paddle/fluid/imperative/__init__.py create mode 100644 python/paddle/fluid/imperative/base.py create mode 100644 python/paddle/fluid/imperative/layers.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative.py diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 6b526f0103..595454e90b 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) +add_subdirectory(imperative) add_subdirectory(operators) add_subdirectory(string) add_subdirectory(recordio) diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 3e9353f5cf..6338be75a4 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -16,7 +16,9 @@ limitations under the License. */ #include #include #include "glog/logging.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { @@ -53,5 +55,12 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, return tensor; } +LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) { + Variable* var = scope.FindVar(var_name); + PADDLE_ENFORCE(var, "%s no in scope", var_name); + PADDLE_ENFORCE(var->IsType(), "Only support lod tensor now."); + return *var->GetMutable(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index 7f504bfd23..031f8e01aa 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, size_t index); +LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index fc91564bba..8679118fe2 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -38,9 +38,8 @@ void CheckProgram(const ProgramDesc &program) { switch (role_id) { case _INT(OpRole::kForward): if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) - << "Cannot add backward operator before forward operator %s." - << op->Type(); + LOG(ERROR) << "Cannot add backward operator before forward operator " + << op->Type(); } break; case _INT(OpRole::kBackward): diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt new file mode 100644 index 0000000000..373d292b44 --- /dev/null +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -0,0 +1,3 @@ +cc_library(layer SRCS layer.cc DEPS proto_desc operator) +cc_library(tracer SRCS tracer.cc DEPS proto_desc) +cc_library(engine SRCS engine.cc) diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc new file mode 100644 index 0000000000..de7ab0e591 --- /dev/null +++ b/paddle/fluid/imperative/engine.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/engine.h" + +#include // NOLINT +#include + +#include "glog/logging.h" + +namespace paddle { +namespace imperative { + +static std::once_flag init_engine; +static Engine* engine; + +class DummyEngine : public Engine { + public: + void Enqueue(Runnable* runnable) override { + queued_runnables_.push_back(runnable); + } + + size_t Size() const override { return queued_runnables_.size(); } + + void Sync() override { + for (Runnable* l : queued_runnables_) { + LOG(INFO) << "running " << reinterpret_cast(l); + } + queued_runnables_.clear(); + } + + private: + std::vector queued_runnables_; +}; + +Engine* GetEngine() { + std::call_once(init_engine, []() { engine = new DummyEngine(); }); + return engine; +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h new file mode 100644 index 0000000000..a1dfa5bda3 --- /dev/null +++ b/paddle/fluid/imperative/engine.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace paddle { +namespace imperative { + +struct Runnable {}; + +class Engine { + public: + virtual ~Engine() {} + + virtual void Enqueue(Runnable* runnable) = 0; + + virtual size_t Size() const = 0; + + virtual void Sync() = 0; +}; + +Engine* GetEngine(); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc new file mode 100644 index 0000000000..6125037680 --- /dev/null +++ b/paddle/fluid/imperative/layer.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/layer.h" +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace imperative { + +using framework::Variable; + +void AddTo(Variable* src, Variable* dst) { + framework::LoDTensor* dst_tensor = dst->GetMutable(); + framework::LoDTensor* src_tensor = src->GetMutable(); + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", + dst_tensor->numel(), src_tensor->numel()); + float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); + const float* src_data = src_tensor->data(); + for (size_t i = 0; i < src_tensor->numel(); ++i) { + dst_data[i] += src_data[i]; + } +} + +class Autograd { + public: + explicit Autograd(framework::Scope* scope) : scope_(scope) {} + + void RunBackward(VarBase* var) { + PADDLE_ENFORCE(var->pre_op_->op_desc_); + // TODO(panyx0718): Only create for vars that "require_grad" + (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_; + + std::deque ready; + ready.push_back(var->pre_op_); + + std::map dep_counts = ComputeDepCounts(var->pre_op_); + + while (!ready.empty()) { + OpBase* ready_op = ready.front(); + ready.pop_front(); + std::vector input_grads = ready_op->ApplyGrad(scope_); + + for (size_t i = 0; i < input_grads.size(); ++i) { + if (!input_grads[i]) continue; + OpBase* pre_op = ready_op->pre_ops_->at(i); + if (!pre_op) continue; + + dep_counts[pre_op] -= 1; + PADDLE_ENFORCE(dep_counts[pre_op] >= 0); + bool pre_op_ready = dep_counts[pre_op] == 0; + if (pre_op_ready) { + ready.push_back(pre_op); + } + } + } + } + + private: + std::map ComputeDepCounts(OpBase* op) { + std::map ret; + + std::deque queue; + queue.push_back(op); + std::unordered_set visited; + visited.insert(op); + while (!queue.empty()) { + OpBase* candidate = queue.front(); + queue.pop_front(); + for (OpBase* pre_op : *(candidate->pre_ops_)) { + if (!pre_op) continue; + if (visited.find(pre_op) == visited.end()) { + visited.insert(pre_op); + queue.push_back(pre_op); + } + ret[pre_op] += 1; + } + } + + return ret; + } + + framework::Scope* scope_; +}; + +framework::Variable* CreateVariable(const std::string& name, + const framework::DDim& dim, float val, + framework::Scope* scope, + bool random_name = true) { + std::string varname = name; + if (random_name) { + std::mt19937 rng; + rng.seed(std::random_device()()); + std::uniform_int_distribution dist6( + 1, std::numeric_limits::max()); + int id = dist6(rng); + varname = string::Sprintf("%s@%d", varname, id); + } + + VLOG(3) << "creating var " << varname; + framework::Variable* var = scope->Var(varname); + framework::LoDTensor* tensor = var->GetMutable(); + + float* data = tensor->mutable_data(dim, platform::CPUPlace()); + std::fill(data, data + tensor->numel(), val); + return var; +} + +framework::LoDTensor& VarBase::Grad() { + VLOG(3) << "get var grad " << var_desc_->Name(); + return *grads_->GetMutable(); +} + +void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { + VLOG(3) << "apply var grad " << var_desc_->Name() << " " + << grad->Get().data()[0]; + if (!grads_) { + grads_ = + CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()), + var_->Get().dims(), 0.0, scope); + } + AddTo(grad, grads_); + VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " " + << grads_->Get().data()[0]; +} + +std::vector OpBase::ApplyGrad(framework::Scope* scope) { + VLOG(3) << "op grad " << grad_op_desc_->Type(); + + for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { + if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { + // grad op inputs can be forward inputs, so not in grad_to_var. + continue; + } + VLOG(3) << "op grad in var " << grad_invar; + block_->FindRecursiveOrCreateVar(grad_invar); + framework::Variable* var = scope->Var(grad_invar); + const std::string& invar = grad_to_var_->at(grad_invar); + for (VarBase* varbase : *output_vars_) { + // Use the accumulated grads_ by sharing the input with grads_. + if (varbase->var_desc_->Name() == invar) { + var->GetMutable()->ShareDataWith( + varbase->grads_->Get()); + break; + } + } + } + + for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { + VLOG(3) << "grad outvar " << outvar; + block_->FindRecursiveOrCreateVar(outvar); + framework::Variable* var = scope->Var(outvar); + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block_->FindVar(outvar); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + } + grad_op_desc_->InferShape(*block_); + grad_op_desc_->InferVarType(block_); + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc_); + + opbase->Run(*scope, platform::CPUPlace()); + + // `ret` matches exactly with `input_vars_` of forward op. + std::vector ret; + for (size_t i = 0; i < input_vars_->size(); ++i) { + bool found = false; + for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { + Variable* var = scope->FindVar(outvar); + VarBase* origin_var = (*input_vars_)[i]; + std::string orig_var = grad_to_var_->at(outvar); + PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); + VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; + origin_var->ApplyGrad(scope, var); + found = true; + ret.push_back(var); + // TODO(panyx0718): There might be another outvar with the same name. + // In that case, it doesn't matter the first one or the second one is + // used. + break; + } + if (!found) { + ret.push_back(nullptr); + } + } + return ret; +} + +void VarBase::RunBackward(framework::Scope* scope) { + grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()), + var_->Get().dims(), 1.0, scope, + false); + if (!pre_op_) return; + Autograd(scope).RunBackward(this); +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h new file mode 100644 index 0000000000..85a71ca83d --- /dev/null +++ b/paddle/fluid/imperative/layer.h @@ -0,0 +1,102 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace imperative { + +class OpBase; + +class VarBase { + public: + VarBase() + : pre_op_(nullptr), + pre_op_out_idx_(-1), + var_desc_(nullptr), + var_(nullptr), + grads_(nullptr) {} + + virtual ~VarBase() {} + + void ApplyGrad(framework::Scope* scope, framework::Variable* grad); + + void RunBackward(framework::Scope* scope); + + framework::LoDTensor& Grad(); + + OpBase* pre_op_; + int pre_op_out_idx_; + + framework::VarDesc* var_desc_; + framework::Variable* var_; + framework::Variable* grads_; +}; + +class OpBase { + public: + OpBase() + : input_vars_(new std::vector()), + output_vars_(new std::vector()), + pre_ops_(new std::vector()), + pre_ops_out_idx_(new std::vector()), + op_desc_(nullptr), + grad_op_desc_(nullptr) {} + + virtual ~OpBase() { + delete input_vars_; + delete output_vars_; + + delete pre_ops_; + delete pre_ops_out_idx_; + + if (grad_op_desc_) delete grad_op_desc_; + if (grad_to_var_) delete grad_to_var_; + } + + std::vector ApplyGrad(framework::Scope* scope); + + std::vector* input_vars_; + std::vector* output_vars_; + std::vector* pre_ops_; + std::vector* pre_ops_out_idx_; + framework::OpDesc* op_desc_; + + framework::OpDesc* grad_op_desc_; + std::unordered_map* grad_to_var_; + framework::BlockDesc* block_; +}; + +class Layer { + public: + virtual ~Layer() {} + + virtual std::vector Forward(const std::vector& inputs) { + std::vector vars; + return vars; + } + + virtual void Backward() { LOG(ERROR) << "To support customize"; } +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc new file mode 100644 index 0000000000..f64f9e72c4 --- /dev/null +++ b/paddle/fluid/imperative/tracer.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/tracer.h" + +namespace paddle { +namespace imperative {} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h new file mode 100644 index 0000000000..433d07c0e5 --- /dev/null +++ b/paddle/fluid/imperative/tracer.h @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/imperative/engine.h" +#include "paddle/fluid/imperative/layer.h" + +namespace paddle { +namespace imperative { + +void CreateGradOp(const framework::OpDesc& op_desc, + const std::unordered_set& no_grad_set, + const std::vector& grad_sub_block, + framework::OpDesc** grad_op_desc, + std::unordered_map* grad_to_var) { + std::vector> grad_op_descs = + framework::OpInfoMap::Instance() + .Get(op_desc.Type()) + .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); + PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now."); + // TODO(panyx0718): Leak? + *grad_op_desc = grad_op_descs[0].release(); +} + +class Tracer { + public: + explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { + root_scope_ = new framework::Scope(); + scopes_[root_block_] = root_scope_; + } + + virtual ~Tracer() { delete root_scope_; } + + void Trace(OpBase* op, const std::vector& inputs, + const std::vector& outputs, + framework::BlockDesc* block) { + framework::Scope* scope = GetScope(block); + framework::OpDesc* op_desc = op->op_desc_; + VLOG(3) << "tracer tracing " << op_desc->Type(); + op_desc->InferShape(*block); + op_desc->InferVarType(block); + std::unique_ptr op_base = + framework::OpRegistry::CreateOp(*op_desc); + + *op->input_vars_ = inputs; + for (VarBase* input : inputs) { + const std::string vname = input->var_desc_->Name(); + framework::Variable* var = scope->Var(vname); + input->var_ = var; + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block->FindVar(vname); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + if (input->pre_op_) { + op->pre_ops_->push_back(input->pre_op_); + op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_); + } else { + op->pre_ops_->push_back(nullptr); + } + } + + *op->output_vars_ = outputs; + for (size_t i = 0; i < outputs.size(); ++i) { + const std::string vname = outputs[i]->var_desc_->Name(); + framework::Variable* var = scope->Var(vname); + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block->FindVar(vname); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + outputs[i]->var_ = var; + outputs[i]->pre_op_ = op; + outputs[i]->pre_op_out_idx_ = i; + } + op_base->Run(*scope, platform::CPUPlace()); + framework::OpDesc* grad_op_desc; + auto grad_to_var = new std::unordered_map(); + CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); + op->grad_op_desc_ = grad_op_desc; + op->grad_to_var_ = grad_to_var; + op->block_ = block; + } + + framework::Scope* GetScope(framework::BlockDesc* block) { + if (scopes_.find(block) != scopes_.end()) { + return scopes_.at(block); + } + framework::BlockDesc* parent_block = block->ParentBlock(); + PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); + framework::Scope* scope = &scopes_[parent_block]->NewScope(); + scopes_[block] = scope; + return scope; + } + + private: + std::map scopes_; + framework::BlockDesc* root_block_; + framework::Scope* root_scope_; +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index d602613fc8..b8954cb126 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,7 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler) -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc) + if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc new file mode 100644 index 0000000000..34e9c897d9 --- /dev/null +++ b/paddle/fluid/pybind/imperative.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/imperative/tracer.h" + +namespace paddle { +namespace pybind { + +// Bind Methods +void BindTracer(pybind11::module *m) { + pybind11::class_(*m, "Tracer", "") + .def("__init__", + [](imperative::Tracer &self, framework::BlockDesc *root_block) { + new (&self) imperative::Tracer(root_block); + }) + .def("trace", &imperative::Tracer::Trace) + .def("get_scope", &imperative::Tracer::GetScope, + pybind11::return_value_policy::reference); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h new file mode 100644 index 0000000000..7a9d3a01ea --- /dev/null +++ b/paddle/fluid/pybind/imperative.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include +#include "paddle/fluid/imperative/layer.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +class PyLayer : public imperative::Layer { + public: + using imperative::Layer::Layer; // Inherit constructors + + std::vector Forward( + const std::vector& inputs) override { + PYBIND11_OVERLOAD(std::vector, Layer, Forward, + inputs); // NOLINT + } + + void Backward() override { + PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT + } +}; + +class PyOpBase : public imperative::OpBase { + public: + using imperative::OpBase::OpBase; // Inherit constructors +}; + +class PyVarBase : public imperative::VarBase { + public: + using imperative::VarBase::VarBase; // Inherit constructors +}; + +void BindTracer(pybind11::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 58ef3da0b2..dca0c01ab2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" @@ -45,6 +46,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/async_executor_py.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/recordio.h" @@ -100,6 +102,42 @@ PYBIND11_MODULE(core, m) { BindException(&m); + py::class_(m, "VarBase", R"DOC()DOC") + .def(py::init<>()) + .def("_run_backward", + [](imperative::VarBase &self, framework::Scope *scope) { + self.RunBackward(scope); + }) + .def("_grad", &imperative::VarBase::Grad) + .def_property( + "desc", + [](const imperative::VarBase &self) { return self.var_desc_; }, + [](imperative::VarBase &self, framework::VarDesc *var_desc) { + self.var_desc_ = var_desc; + }, + py::return_value_policy::reference); + + py::class_(m, "OpBase", R"DOC()DOC") + .def(py::init<>()) + .def_property( + "desc", [](const imperative::OpBase &self) { return self.op_desc_; }, + [](imperative::OpBase &self, framework::OpDesc *op_desc) { + if (op_desc) { + self.op_desc_ = op_desc; + } + }, + py::return_value_policy::reference); + + py::class_ layer(m, "Layer"); + layer.def(py::init<>()) + .def("forward", + [](imperative::Layer &self, + const std::vector &inputs) { + return self.Forward(inputs); + }) + .def("backward", &imperative::Layer::Backward); + BindTracer(&m); + py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) @@ -601,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); + m.def("get_variable_tensor", framework::GetVariableTensor); m.def("_is_program_version_supported", IsProgramVersionSupported); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2a53519188..52417a1eaf 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -34,6 +34,7 @@ from . import io from . import evaluator from . import initializer from . import layers +from . import imperative from . import contrib from . import nets from . import optimizer @@ -67,6 +68,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'initializer', 'layers', 'contrib', + 'imperative', 'transpiler', 'nets', 'optimizer', diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index a40826168d..1511eea68c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -18,6 +18,7 @@ import collections import contextlib import re import six +import sys import numpy as np @@ -49,6 +50,16 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() +_imperative_tracer_ = None + + +def _in_imperative_mode(): + return _imperative_tracer_ is not None + + +def _imperative_tracer(): + return _imperative_tracer_ + class NameScope(object): def __init__(self, name="", parent=None): @@ -202,7 +213,7 @@ def _debug_string_(proto, throw_on_error=True): return proto.__str__() -class Variable(object): +class Variable(core.VarBase): """ In Fluid, every input and output of an operator is a variable. In most cases, variables are used for holding different kinds of data or training @@ -266,6 +277,7 @@ class Variable(object): stop_gradient=False, is_data=False, **kwargs): + core.VarBase.__init__(self) self.block = block self.error_clip = error_clip @@ -346,6 +358,18 @@ class Variable(object): self.stop_gradient = stop_gradient self.is_data = is_data + def _numpy(self): + scope = _imperative_tracer().get_scope(self.block.desc) + tensor = core.get_variable_tensor(scope, self.desc.name()) + return np.array(tensor) + + def _backward(self): + scope = _imperative_tracer().get_scope(self.block.desc) + self._run_backward(scope) + + def _gradient(self): + return np.array(self._grad()) + def __str__(self): return self.to_string(True) @@ -492,7 +516,7 @@ class OpProtoHolder(object): } -class Operator(object): +class Operator(core.OpBase): """ In Fluid, all the operation are represented by Operator, and Operator is regarded as a build in an instruction of a Block. Users can use the @@ -548,6 +572,7 @@ class Operator(object): inputs=None, outputs=None, attrs=None): + core.OpBase.__init__(self) self.block = block self.desc = desc # note: not add self.attrs here: @@ -587,6 +612,7 @@ class Operator(object): return True return False + self.inputs = [] if inputs is not None: for in_proto in proto.inputs: found = find_name(inputs, in_proto.name) @@ -613,6 +639,13 @@ class Operator(object): else: self.desc.set_input(in_proto.name, []) + for inp in inputs.values(): + if isinstance(inp, Variable): + self.inputs.append(inp) + elif isinstance(inp, list) or isinstance(inp, tuple): + self.inputs.extend(inp[:]) + + self.outputs = [] if outputs is not None: given = set() need = set() @@ -641,6 +674,12 @@ class Operator(object): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) + for out in outputs.values(): + if isinstance(out, Variable): + self.outputs.append(out) + elif isinstance(out, list) or isinstance(out, tuple): + self.outputs.extend(out[:]) + if op_attrs is not None: if not isinstance(op_attrs, dict): raise TypeError("'attrs' should be a dict.") @@ -1206,6 +1245,8 @@ class Block(object): """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) + if _in_imperative_mode(): + _imperative_tracer().trace(op, op.inputs, op.outputs, self.desc) self.ops.append(op) return op @@ -2210,3 +2251,12 @@ def _get_var(name, program=None): assert isinstance(program, Program) return program.global_block().var(name) + + +@contextlib.contextmanager +def _imperative_guard(tracer): + global _imperative_tracer_ + tmp_trace = _imperative_tracer_ + _imperative_tracer_ = tracer + yield + _imperative_tracer_ = tmp_trace diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py new file mode 100644 index 0000000000..922308b6b1 --- /dev/null +++ b/python/paddle/fluid/imperative/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import base +from .base import * + +from . import layers +from .layers import * + +__all__ = [] +__all__ += layers.__all__ +__all__ += base.__all__ diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py new file mode 100644 index 0000000000..15d38ddb56 --- /dev/null +++ b/python/paddle/fluid/imperative/base.py @@ -0,0 +1,56 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import contextlib +import numpy as np + +from paddle.fluid import core +from paddle.fluid import framework + +__all__ = ['enabled', 'guard', 'to_variable'] + + +def enabled(): + return framework._in_imperative_mode() + + +@contextlib.contextmanager +def guard(): + train = framework.Program() + startup = framework.Program() + tracer = core.Tracer(train.current_block().desc) + with framework.program_guard(train, startup): + with framework.unique_name.guard(): + with framework._imperative_guard(tracer): + yield + + +def to_variable(value, block=None): + if isinstance(value, np.ndarray): + if not block: + block = framework.default_main_program().current_block() + py_var = framework.Variable( + block, + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=value.shape, + dtype=value.dtype) + scope = framework._imperative_tracer().get_scope(block.desc) + var = scope.var(py_var.name) + tensor = var.get_tensor() + tensor.set(value, core.CPUPlace()) + return py_var + elif isinstance(value, framework.Variable): + return value + else: + raise ValueError("Unsupported type %s" % type(value)) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py new file mode 100644 index 0000000000..1a28f7f4ae --- /dev/null +++ b/python/paddle/fluid/imperative/layers.py @@ -0,0 +1,44 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import sys +import numpy as np + +from paddle.fluid import core +from paddle.fluid import framework +from paddle.fluid.imperative import base + +__all__ = ['PyLayer'] + + +class PyLayer(core.Layer): + def __init__(self): + pass + + def __call__(self, inputs): + # TODO(panyx0718): Support declarative mode as well. + assert base.enabled() + if not isinstance(inputs, list) and not isinstance(inputs, tuple): + inputs = [inputs] + + var_inputs = [] + for x in inputs: + py_var = base.to_variable(x) + var_inputs.append(py_var) + outputs = self.forward(var_inputs) + return outputs + + def forward(self, inputs): + return [] diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index dc317de9ab..74b4a977db 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -17,10 +17,13 @@ from __future__ import print_function import copy import itertools import six +import sys +import numpy as np from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from . import unique_name from paddle.fluid.initializer import Constant, Xavier +from paddle.fluid.imperative import base from .param_attr import ParamAttr, WeightNormParamAttr from . import core from six.moves import zip @@ -46,23 +49,21 @@ class LayerHelper(object): def startup_program(self): return default_startup_program() + def to_variable(self, x): + return base.to_variable(x, self.main_program.current_block()) + def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) def multiple_input(self, input_param_name='input'): inputs = self.kwargs.get(input_param_name, []) - type_error = TypeError( - "Input of {0} layer should be Variable or sequence of Variable". - format(self.layer_type)) - if isinstance(inputs, Variable): - inputs = [inputs] - elif not isinstance(inputs, list) and not isinstance(inputs, tuple): - raise type_error + ret = [] + if isinstance(inputs, list) or isinstance(inputs, tuple): + for inp in inputs: + ret.append(self.to_variable(inp)) else: - for each in inputs: - if not isinstance(each, Variable): - raise type_error - return inputs + ret.append(self.to_variable(inputs)) + return ret def input(self, input_param_name='input'): inputs = self.multiple_input(input_param_name) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4833212d31..fac7538a6a 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6623,7 +6623,8 @@ def relu(x, name=None): helper = LayerHelper('relu', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out}) + helper.append_op( + type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out}) return out diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py new file mode 100644 index 0000000000..b5b6305155 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -0,0 +1,52 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import sys +import numpy as np + +import paddle.fluid as fluid +from paddle.fluid import core + + +class MyLayer(fluid.imperative.PyLayer): + def __init__(self): + super(MyLayer, self).__init__() + + def forward(self, inputs): + x = fluid.layers.relu(inputs[0]) + self._x_for_debug = x + return [fluid.layers.elementwise_mul(x, x)] + + +class TestImperative(unittest.TestCase): + def test_layer(self): + with fluid.imperative.guard(): + cl = core.Layer() + cl.forward([]) + l = fluid.imperative.PyLayer() + l.forward([]) + + def test_layer_in_out(self): + with fluid.imperative.guard(): + l = MyLayer() + x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] + self.assertIsNotNone(x) + sys.stderr.write("%s output: %s\n" % (x, x._numpy())) + x._backward() + sys.stderr.write("grad %s\n" % l._x_for_debug._gradient()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 5aee26b638..0eb69cdb5c 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -101,6 +101,7 @@ packages=['paddle', 'paddle.dataset', 'paddle.reader', 'paddle.fluid', + 'paddle.fluid.imperative', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', diff --git a/tools/print_signatures.py b/tools/print_signatures.py index 5c5266f904..7e61dde0a4 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -27,6 +27,8 @@ import pydoc member_dict = collections.OrderedDict() +experimental_namespace = {"paddle.fluid.imperative"} + def visit_member(parent_name, member): cur_name = ".".join([parent_name, member.__name__]) @@ -51,6 +53,8 @@ def visit_member(parent_name, member): def visit_all_module(mod): + if (mod.__name__ in experimental_namespace): + return for member_name in ( name for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod)) From 68c2025844d8a33fa229d60cad431baf86ee1d91 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sun, 9 Dec 2018 12:35:36 +0800 Subject: [PATCH 188/719] fix nn.py&API.spec, test=develop --- paddle/fluid/API.spec | 14 ++++++++++++++ python/paddle/fluid/layers/nn.py | 15 +++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 9a90ad4e93..a61b93af35 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -423,3 +423,17 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) +paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) +paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) +paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None) +paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None) +paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None) +paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)) +paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')) +paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')) +paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)) +paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 5499a0ba83..9233fe130e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1350,6 +1350,21 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): def bpr_loss(input, label_pos): + """ + Bayesian Personalized Ranking Loss Operator. + + This operator belongs to pairwise ranking loss. LabelPos is the desired item. + The loss at a given point in one session is defined as: + $Y[i] = -\frac{1}{N_{i}-1} * \sum_{0\le j(https://arxiv.org/abs/1511.06939) + + Examples: + .. code-block:: python + + cost = fluid.layers.bpr_loss(input=predict, label_pos=label) + """ helper = LayerHelper('bpr_loss', **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) From f6dc09e98b85065594eb5faa9752c13133d73ff7 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 9 Dec 2018 13:28:14 +0800 Subject: [PATCH 189/719] void hurting declarative performance test=develop --- python/paddle/fluid/framework.py | 47 ++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 1511eea68c..4bf0a456b5 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -213,7 +213,7 @@ def _debug_string_(proto, throw_on_error=True): return proto.__str__() -class Variable(core.VarBase): +class Variable(object): """ In Fluid, every input and output of an operator is a variable. In most cases, variables are used for holding different kinds of data or training @@ -277,7 +277,6 @@ class Variable(core.VarBase): stop_gradient=False, is_data=False, **kwargs): - core.VarBase.__init__(self) self.block = block self.error_clip = error_clip @@ -357,6 +356,9 @@ class Variable(core.VarBase): self.op = None self.stop_gradient = stop_gradient self.is_data = is_data + if _in_imperative_mode(): + self._ivar = core.VarBase() + self._ivar.desc = self.desc def _numpy(self): scope = _imperative_tracer().get_scope(self.block.desc) @@ -365,10 +367,10 @@ class Variable(core.VarBase): def _backward(self): scope = _imperative_tracer().get_scope(self.block.desc) - self._run_backward(scope) + self._ivar._run_backward(scope) def _gradient(self): - return np.array(self._grad()) + return np.array(self._ivar._grad()) def __str__(self): return self.to_string(True) @@ -516,7 +518,7 @@ class OpProtoHolder(object): } -class Operator(core.OpBase): +class Operator(object): """ In Fluid, all the operation are represented by Operator, and Operator is regarded as a build in an instruction of a Block. Users can use the @@ -572,7 +574,6 @@ class Operator(core.OpBase): inputs=None, outputs=None, attrs=None): - core.OpBase.__init__(self) self.block = block self.desc = desc # note: not add self.attrs here: @@ -612,7 +613,6 @@ class Operator(core.OpBase): return True return False - self.inputs = [] if inputs is not None: for in_proto in proto.inputs: found = find_name(inputs, in_proto.name) @@ -639,13 +639,6 @@ class Operator(core.OpBase): else: self.desc.set_input(in_proto.name, []) - for inp in inputs.values(): - if isinstance(inp, Variable): - self.inputs.append(inp) - elif isinstance(inp, list) or isinstance(inp, tuple): - self.inputs.extend(inp[:]) - - self.outputs = [] if outputs is not None: given = set() need = set() @@ -674,12 +667,6 @@ class Operator(core.OpBase): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) - for out in outputs.values(): - if isinstance(out, Variable): - self.outputs.append(out) - elif isinstance(out, list) or isinstance(out, tuple): - self.outputs.extend(out[:]) - if op_attrs is not None: if not isinstance(op_attrs, dict): raise TypeError("'attrs' should be a dict.") @@ -694,6 +681,23 @@ class Operator(core.OpBase): if self._has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) + if _in_imperative_mode(): + self.iop = core.OpBase() + self.iop.desc = self.desc + self.inputs = [] + if inputs is not None: + for inp in inputs.values(): + if isinstance(inp, Variable): + self.inputs.append(inp) + elif isinstance(inp, list) or isinstance(inp, tuple): + self.inputs.extend(inp[:]) + self.outputs = [] + if outputs is not None: + for out in outputs.values(): + if isinstance(out, Variable): + self.outputs.append(out) + elif isinstance(out, list) or isinstance(out, tuple): + self.outputs.extend(out[:]) def _has_kernel(self, op_type): return op_type not in self.OP_WITHOUT_KERNEL_SET @@ -1246,7 +1250,8 @@ class Block(object): op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) if _in_imperative_mode(): - _imperative_tracer().trace(op, op.inputs, op.outputs, self.desc) + _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], + [v._ivar for v in op.outputs], self.desc) self.ops.append(op) return op From ea95f9c335bc8336a840190c2cdea543e87e4f71 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sun, 9 Dec 2018 21:40:04 +0800 Subject: [PATCH 190/719] fix style bug, test=develop --- paddle/fluid/operators/bpr_loss_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index 41f2969e6c..075b1b2c76 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -28,12 +28,12 @@ class BprLossOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto label_Pos_dims = ctx->GetInputDim("LabelPos"); + auto label_pos_dims = ctx->GetInputDim("LabelPos"); int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, label_Pos_dims.size(), + PADDLE_ENFORCE_EQ(rank, label_pos_dims.size(), "Input(X) and Input(LabelPos) shall have the same rank."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), - framework::slice_ddim(label_Pos_dims, 0, rank - 1), + framework::slice_ddim(label_pos_dims, 0, rank - 1), "Input(X) and Input(LabelPos) shall have the same shape " "except the last dimension."); From a672b291e5bbee4604e74ce2c46991b4709fed1d Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sun, 9 Dec 2018 22:08:44 +0800 Subject: [PATCH 191/719] fix code style, test=develop --- paddle/fluid/operators/bpr_loss_op.h | 40 +++++++++++++++------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index ea817bb239..ab68165942 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -22,7 +22,9 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; - +/*Todo: + *Find a way to adapt TolerableValue, using blas or eigen. + */ template struct TolerableValue { HOSTDEVICE T operator()(const T& x) const { @@ -86,27 +88,27 @@ class BprLossGradientOpKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); const int step_size = x->dims()[0]; - const int num_classes_ = x->dims()[1]; - T* dx_ = dx->mutable_data(ctx.GetPlace()); - const T* dy_ = dy->data(); - const T* x_ = x->data(); - const int64_t* label_pos_ = label_pos->data(); + const int num_classes = x->dims()[1]; + T* dx_data = dx->mutable_data(ctx.GetPlace()); + const T* dy_data = dy->data(); + const T* x_data = x->data(); + const int64_t* label_pos_data = label_pos->data(); for (size_t sample_id = 0; sample_id < step_size; sample_id++) { - for (size_t x_offset = sample_id * num_classes_; - x_offset < (sample_id + 1) * num_classes_; x_offset++) { - dx_[x_offset] = static_cast(0); + for (size_t x_offset = sample_id * num_classes; + x_offset < (sample_id + 1) * num_classes; x_offset++) { + dx_data[x_offset] = static_cast(0); } - auto p_index = sample_id * num_classes_ + label_pos_[sample_id]; - for (size_t ni = 0; ni < num_classes_; ni++) { - if (label_pos_[sample_id] == ni) continue; - auto n_index = sample_id * num_classes_ + ni; - auto grad_ = - -dy_[sample_id] / - ((num_classes_ - 1) * - (1.0f + TolerableValue()(std::exp(x_[p_index] - x_[n_index])))); - dx_[p_index] += grad_; - dx_[n_index] -= grad_; + auto p_index = sample_id * num_classes + label_pos_data[sample_id]; + for (size_t ni = 0; ni < num_classes; ni++) { + if (label_pos_data[sample_id] == ni) continue; + auto n_index = sample_id * num_classes + ni; + auto grad_ = -dy_data[sample_id] / + ((num_classes - 1) * + (1.0f + TolerableValue()(std::exp(x_data[p_index] - + x_data[n_index])))); + dx_data[p_index] += grad_; + dx_data[n_index] -= grad_; } } } From f0c0bf328d8141fd041d5d69521d0995e58b2625 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 10 Dec 2018 10:47:59 +0800 Subject: [PATCH 192/719] Add gperftools supports for PE --- CMakeLists.txt | 8 ++- cmake/FindGperftools.cmake | 63 +++++++++++++++++++++ cmake/generic.cmake | 16 ++++++ paddle/fluid/framework/parallel_executor.cc | 30 +++++++++- 4 files changed, 114 insertions(+), 3 deletions(-) create mode 100644 cmake/FindGperftools.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index efa68c9ba2..1594e798a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,7 +54,7 @@ option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF) option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF) -option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler" OFF) +option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) @@ -254,6 +254,12 @@ elseif() set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE) endif() +if (WITH_PROFILER) + find_package(Gperftools REQUIRED) + include_directories(${GPERFTOOLS_INCLUDE_DIR}) + add_definitions(-DWITH_GPERFTOOLS) +endif() + include(generic) # simplify cmake module include(package) # set paddle packages include(ccache) # set ccache for compilation diff --git a/cmake/FindGperftools.cmake b/cmake/FindGperftools.cmake new file mode 100644 index 0000000000..928f573a4f --- /dev/null +++ b/cmake/FindGperftools.cmake @@ -0,0 +1,63 @@ +# Tries to find Gperftools. +# +# Usage of this module as follows: +# +# find_package(Gperftools) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# Gperftools_ROOT_DIR Set this variable to the root installation of +# Gperftools if the module has problems finding +# the proper installation path. +# +# Variables defined by this module: +# +# GPERFTOOLS_FOUND System has Gperftools libs/headers +# GPERFTOOLS_LIBRARIES The Gperftools libraries (tcmalloc & profiler) +# GPERFTOOLS_INCLUDE_DIR The location of Gperftools headers + +find_library(GPERFTOOLS_TCMALLOC + NAMES tcmalloc + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_library(GPERFTOOLS_PROFILER + NAMES profiler + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER + NAMES tcmalloc_and_profiler + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_path(GPERFTOOLS_INCLUDE_DIR + NAMES gperftools/heap-profiler.h + HINTS ${Gperftools_ROOT_DIR}/include) + +set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + Gperftools + DEFAULT_MSG + GPERFTOOLS_LIBRARIES + GPERFTOOLS_INCLUDE_DIR) + +mark_as_advanced( + Gperftools_ROOT_DIR + GPERFTOOLS_TCMALLOC + GPERFTOOLS_PROFILER + GPERFTOOLS_TCMALLOC_AND_PROFILER + GPERFTOOLS_LIBRARIES + GPERFTOOLS_INCLUDE_DIR) + +# create IMPORTED targets +if (Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc) + add_library(gperftools::tcmalloc UNKNOWN IMPORTED) + set_target_properties(gperftools::tcmalloc PROPERTIES + IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC} + INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") + add_library(gperftools::profiler UNKNOWN IMPORTED) + set_target_properties(gperftools::profiler PROPERTIES + IMPORTED_LOCATION ${GPERFTOOLS_PROFILER} + INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") +endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 312fbaa0b3..a8b9dcfcf5 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -110,6 +110,14 @@ function(find_fluid_modules TARGET_NAME) endif() endfunction(find_fluid_modules) + +function(common_link TARGET_NAME) + if (WITH_PROFILER) + target_link_libraries(${TARGET_NAME} gperftools::profiler) + endif() +endfunction() + + # find all third_party modules is used for paddle static library # for reduce the dependency when building the inference libs. set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY) @@ -274,6 +282,7 @@ function(cc_library TARGET_NAME) endif() target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) + common_link(${TARGET_NAME}) endif() # cpplint code style @@ -340,6 +349,7 @@ function(cc_binary TARGET_NAME) if(cc_binary_DEPS) target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS}) add_dependencies(${TARGET_NAME} ${cc_binary_DEPS}) + common_link(${TARGET_NAME}) endif() endfunction(cc_binary) @@ -362,6 +372,7 @@ function(cc_test TARGET_NAME) target_link_libraries(${TARGET_NAME} ${win32_deps}) endif(WIN32) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + common_link(${TARGET_NAME}) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) @@ -420,6 +431,7 @@ function(nv_binary TARGET_NAME) if(nv_binary_DEPS) target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) + common_link(${TARGET_NAME}) endif() endif() endfunction(nv_binary) @@ -433,6 +445,7 @@ function(nv_test TARGET_NAME) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) if (nv_test_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) @@ -499,6 +512,7 @@ function(hip_binary TARGET_NAME) if(hip_binary_DEPS) target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS}) add_dependencies(${TARGET_NAME} ${hip_binary_DEPS}) + common_link(${TARGET_NAME}) endif() endif() endfunction(hip_binary) @@ -518,6 +532,7 @@ function(hip_test TARGET_NAME) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) + common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) endif() endfunction(hip_test) @@ -560,6 +575,7 @@ function(go_library TARGET_NAME) endif() if(go_library_DEPS) add_dependencies(${TARGET_NAME} ${go_library_DEPS}) + common_link(${TARGET_NAME}) endif(go_library_DEPS) # The "source file" of the library is `${dummyfile}` which never diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b98408ee77..9355bb572b 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -30,13 +30,33 @@ limitations under the License. */ #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" +#ifdef WITH_GPERFTOOLS +#include "google/gperftools.h" +#endif +DEFINE_string(PEProfileFName, "", + "Profiler filename for PE, which generated by gperftools." + "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); + namespace paddle { namespace framework { - +static std::once_flag gProfileOnce; +static bool gProfileStarted = false; class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) - : places_(places) {} + : places_(places) { + if (!FLAGS_PEProfileFName.empty()) { + std::call_once(gProfileOnce, [] { +#ifdef WITH_GPERFTOOLS + ProfilerStart(FLAGS_PEProfileFName.c_str()); + gProfileStarted = true; +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_PEProfileFName will be ignored"; +#endif + }); + } + } ~ParallelExecutorPrivate() { if (own_local_scope_) { @@ -270,6 +290,12 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { +#ifdef WITH_GPERFTOOLS + if (gProfileStarted) { + ProfilerFlush(); + } +#endif + platform::RecordBlock b(0); #ifdef PADDLE_WITH_CUDA if (!gcs_.empty()) { From 6bc0efb489411bb1b3206db0cbb03951811fa988 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Mon, 10 Dec 2018 10:53:14 +0800 Subject: [PATCH 193/719] refine interface --- python/paddle/fluid/async_executor.py | 42 +++++++++++++------- python/paddle/fluid/distributed/downpour.py | 12 ++++-- python/paddle/fluid/distributed/helper.py | 30 ++++++++------ python/paddle/fluid/distributed/node.py | 44 +++++++++++++++++---- python/paddle/fluid/distributed/ps_pb2.py | 6 +-- 5 files changed, 93 insertions(+), 41 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index f667ff2424..3451d1edb5 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -24,6 +24,7 @@ from paddle.fluid.proto import data_feed_pb2 from google.protobuf import text_format from . import io from .data_feed_desc import DataFeedDesc +from .distributed import ps_instance __all__ = ['AsyncExecutor'] @@ -85,6 +86,7 @@ class AsyncExecutor(object): scope = global_scope() self.executor = core.AsyncExecutor(scope, p) + self.instance = ps_instance.PaddlePSInstance("init_param", 1, 2) def run(self, program, data_feed, filelist, thread_num, fetch, debug=False): """ @@ -149,27 +151,39 @@ class AsyncExecutor(object): self.executor.run_from_files(program_desc, data_feed.desc(), filelist, thread_num, fetch_var_names, debug) + self.instance.barrier_all() def config_distributed_nodes(self, dist_opt): + # get total rank # get rank index # get iplists # get hadoop info - return - - - def init_server(self, filename, index): - self.executor.init_server(filename, index) - - def init_worker(self, filename, ips, nodes_cnt, index): - self.executor.init_worker(filename, ips, nodes_cnt, index) + pass + + def get_instance(self): + return self.instance + + def init_server(self, dist_desc): + self.executor.init_server(dist_desc, self.instance._rankid) + ip = self.executor.start_server() + self.instance.set_ip(ip) + self.instance.barrier_all() #wait all server start + ips = self.instance.gather_ips() + self.executor.gather_servers(ips, self.instance.get_node_cnt()) + self.instance.barrier_all() #wait all worker start + self.instance.barrier_all() #wait init model + self.instance.barrier_all() #wait worker do all things + + def init_worker(self, dist_desc): + self.instance.barrier_all() #wait all server start + ips = self.instance.gather_ips() + self.executor.init_worker(dist_desc, ips, self.instance.get_node_cnt(), self.instance._rankid) + self.instance.barrier_all() #wait all worker start + if self.instance.is_first_worker(): + self.executor.init_model() + self.instance.barrier_all() #wait init model - def start_server(self): - return self.executor.start_server() - - def gather_servers(self, ips, nodes_cnt): - self.executor.gather_servers(ips, nodes_cnt) - def init_model(self): self.executor.init_model() diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 3d940b62b0..654fa6fab6 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -46,14 +46,20 @@ class DownpourSGD(object): sparse_table_index = 0 # currently merge all dense parameters into one dense table dense_table_index = 1 + params = [] + grads = [] + for i in params_grads: + params.append(i[0]) + for i in params_grads: + grads.append(i[1]) server.add_sparse_table(sparse_table_index, self.learning_rate_, prefetch_slots, prefetch_slots_emb) server.add_dense_table(dense_table_index, self.learning_rate_, - params_grads[0], params_grads[1]) + params, grads) worker.add_sparse_table(sparse_table_index, self.learning_rate_, prefetch_slots, prefetch_slots_emb) worker.add_dense_table(dense_table_index, self.learning_rate_, - params_grads[0], params_grads[1]) + params, grads) ps_param = pslib.PSParameter() ps_param.server_param.CopyFrom(server.get_desc()) ps_param.trainer_param.CopyFrom(worker.get_desc()) @@ -61,4 +67,4 @@ class DownpourSGD(object): # currently only support lookup_table worker_skipped_ops = ["lookup_table", "lookup_table_grad"] ps_param_str = text_format.MessageToString(ps_param) - return [ps_param_str, worker_skipped_ops] + return [ps_param, worker_skipped_ops] diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 12e2f7f197..4cc5eb2a92 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -1,4 +1,5 @@ from mpi4py import MPI +import ps_pb2 as pslib class FileSystem(object): def __init__(self, fs_type="afs", @@ -7,20 +8,23 @@ class FileSystem(object): passwd=None, hadoop_bin="", afs_conf=None): - assert user not None - assert passwd not None - assert hadoop_bin not None - fs_client = pslib.FsClientParameter() - if fs_type == "afs": - fs_client.fs_type = pslib.FsApiType.AFS - else: - fs_client.fs_type = pslib.FsApiType.HDFS - fs_client.uri = uri - fs_client.user = user - fs_client.passwd = passwd - fs_client.buffer_size = 0 - fs_client.afs_conf = afs_conf if not afs_conf else "" + assert user != None + assert passwd != None + assert hadoop_bin != None + self.fs_client = pslib.FsClientParameter() + #if fs_type == "afs": + # fs_client.fs_type = pslib.FsApiType.AFS + #else: + # fs_client.fs_type = pslib.FsApiType.HDFS + self.fs_client.uri = uri + self.fs_client.user = user + self.fs_client.passwd = passwd + #self.fs_client.buffer_size = 0 + self.fs_client.hadoop_bin = hadoop_bin + #self.fs_client.afs_conf = afs_conf if not afs_conf else "" + def get_desc(self): + return self.fs_client class MPIHelper(object): def __init__(self): diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index b96a15a32f..c245dc4db8 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -13,24 +13,52 @@ class Worker(object): class DownpourServer(Server): def __init__(self): self.server_ = pslib.ServerParameter() + self.server_.downpour_server_param.service_param.start_server_port = 0 + self.server_.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer" + self.server_.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient" + self.server_.downpour_server_param.service_param.service_class = "DownpourPsService" + self.server_.downpour_server_param.service_param.start_server_port = 0 + self.server_.downpour_server_param.service_param.server_thread_num = 12 def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_var): table = self.server_.downpour_server_param.downpour_table_param.add() table.table_id = table_id + table.table_class = "DownpourSparseTable" table.type = pslib.PS_SPARSE_TABLE table.accessor.accessor_class = "DownpourFeatureValueAccessor" - table.accessor.dense_sgd_param.adam.learning_rate = learning_rate - table.accessor.fea_dim = abs(reduce(lambda x, y: x * y, - slot_value_var[0].shape, 1)) + table.accessor.sparse_sgd_param.learning_rate = learning_rate + table.accessor.sparse_sgd_param.initial_g2sum = 3 + table.accessor.sparse_sgd_param.initial_range = 1e-4 + table.accessor.sparse_sgd_param.weight_bounds.extend([-10, 10]) + + table.accessor.embedx_dim = 8 + table.accessor.embedx_threshold = 5 + table.accessor.fea_dim = 11 + #table.accessor.fea_dim = abs(reduce(lambda x, y: x * y, + # slot_value_var[0].shape, 1)) + table.accessor.downpour_accessor_param.nonclk_coeff = 0.1 + table.accessor.downpour_accessor_param.click_coeff = 2 + table.accessor.downpour_accessor_param.base_threshold = 0.2 + table.accessor.downpour_accessor_param.delta_threshold = 0.15 + table.accessor.downpour_accessor_param.delta_keep_days = 31 + table.accessor.downpour_accessor_param.show_click_decay_rate = 0.999 + table.accessor.downpour_accessor_param.delete_threshold = 0.8 def add_dense_table(self, table_id, learning_rate, param_var, grad_var): table = self.server_.downpour_server_param.downpour_table_param.add() table.table_id = table_id + table.table_class = "DownpourDenseTable" table.type = pslib.PS_DENSE_TABLE table.accessor.accessor_class = "DownpourDenseValueAccessor" - table.accessor.sparse_sgd_param.learning_rate = learning_rate + table.accessor.dense_sgd_param.name = "adam" + table.accessor.dense_sgd_param.adam.learning_rate = learning_rate + table.accessor.dense_sgd_param.adam.avg_decay_rate = 0.999993 + table.accessor.dense_sgd_param.adam.ada_decay_rate = 0.9999 + table.accessor.dense_sgd_param.adam.ada_epsilon = 1e-8 + table.accessor.dense_sgd_param.adam.mom_decay_rate = 0.99 + table.accessor.dense_sgd_param.naive.learning_rate = 0.0002 fea_dim = 0 for param in param_var: fea_dim += reduce(lambda x, y: x * y, param.shape, 1) @@ -44,8 +72,8 @@ class DownpourWorker(Worker): def __init__(self, window): self.window = window self.worker_ = pslib.DownpourTrainerParameter() - self.worker_.pull_dense_per_batch = window - self.worker_.push_dense_per_batch = window + #self.worker_.pull_dense_per_batch = window + #self.worker_.push_dense_per_batch = window def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_vars): @@ -62,8 +90,8 @@ class DownpourWorker(Worker): param_vars, grad_vars): table = self.worker_.dense_table.add() table.table_id = table_id - table.dense_variable_name.extend([p.name for p in param_vars]) - table.dense_gradient_variable_name.extend([g.name for g in grad_vars]) + table.dense_variable_name.extend(filter(lambda x: x.find("embedding") == -1, [p.name for p in param_vars])) + table.dense_gradient_variable_name.extend(filter(lambda x: x.find("embedding") == -1, [g.name for g in grad_vars])) def get_desc(self): return self.worker_ diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py index f33ec50f7d..b82c649e14 100644 --- a/python/paddle/fluid/distributed/ps_pb2.py +++ b/python/paddle/fluid/distributed/ps_pb2.py @@ -531,21 +531,21 @@ _SERVERSERVICEPARAMETER = _descriptor.Descriptor( _descriptor.FieldDescriptor( name='server_class', full_name='paddle.ServerServiceParameter.server_class', index=0, number=1, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("AbacusBrpcPsServer").decode('utf-8'), + has_default_value=True, default_value=_b("DownpourBrpcPsServer").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='client_class', full_name='paddle.ServerServiceParameter.client_class', index=1, number=2, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("AbacusBrpcPsClient").decode('utf-8'), + has_default_value=True, default_value=_b("DownpourBrpcPsClient").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( name='service_class', full_name='paddle.ServerServiceParameter.service_class', index=2, number=3, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("AbacusPsService").decode('utf-8'), + has_default_value=True, default_value=_b("DownpourPsService").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), From 86e1044ab941d627362d0def4ad45a250178a736 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Mon, 10 Dec 2018 10:54:25 +0800 Subject: [PATCH 194/719] refine interface & add ps_instance --- .../paddle/fluid/distributed/ps_instance.py | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 python/paddle/fluid/distributed/ps_instance.py diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py new file mode 100644 index 0000000000..b4045327e1 --- /dev/null +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -0,0 +1,108 @@ +#import paddle.fluid.distributed.helper as dist_helper +import helper as dist_helper +import sys +#from mpi4py import MPI + + +class PaddlePSInstance(object): + def __init__(self, init_param, server_worker_mode, proc_per_node): + self.dh = dist_helper.MPIHelper() + self._config = init_param + self._rankid = self.dh.get_rank() + self._server_worker_mode = server_worker_mode + self._proc_per_node = proc_per_node + self._nodes = self.dh.get_size() + + self._ip = 0 + self._worker_num = self._nodes * self._proc_per_node / 2 + self._server_num = self._nodes * self._proc_per_node / 2 + self._total_server_worker = self._worker_num + self._server_num + self._node_type = None #IDLE=-1, WORKER=1, SERVER=0 + self._set_nodetype() + self._comm = None + self._split_comm() + + + def _set_nodetype(self): + if self._server_worker_mode == 0: + if self._rankid < self._server_num: + self._node_type = 1 + elif self._rankid < self._total_server_worker: + self._node_type = 0 + else: + self._node_type = -1 + elif self._server_worker_mode == 1: + if self._rankid < self._total_server_worker: + if 0 == self._rankid % self._proc_per_node % 2: + self._node_type = 0 + else: + self._node_type = 1 + else: + self._node_type = -1; + else: + self._node_type = -1 + + #if self._rankid == 0: + #print "node type: ", self._node_type + + def _split_comm(self): + if self.is_server(): + self._comm = self.dh.comm.Split(self._node_type) + elif self.is_worker(): + self._comm = self.dh.comm.Split(self._node_type) + pass + + def get_worker_index(self): + if self._server_worker_mode == 0: + return self._rankid == self.server_num + else: + return self._rankid / self._proc_per_node + + def get_server_index(self): + if self._server_worker_mode == 0: + return self.rank_id + else: + return self.rank_id / self._proc_per_node + + def is_worker(self): + return self._node_type == 1 + + def is_server(self): + return self._node_type == 0 + + def is_first_worker(self): + return self.is_worker() and 0 == self.get_worker_index() + + def set_ip(self, ip): + self._ip = ip + + def gather_ips(self): + self._ips = self.dh.comm.allgather(self._ip) + return self._ips + + def get_node_cnt(self): + return self._nodes + + def barrier_all(self): + #print self._rankid, "begin" + #sys.stdout.flush() + self.dh.comm.barrier() + #print self._rankid, "end" + + def barrier_worker(self): + if self.is_worker(): + #print "worker: ", self._rankid, "begin" + #sys.stdout.flush() + self._comm.barrier() + #print "worker: ", self._rankid, "end" + pass + + def finalize(self): + pass + + +if __name__ == "__main__": + instance = PaddlePSInstance(1, 1, 2, 50) + instance.barrier_all() + #print "-----" + #instance.barrier_worker() From 66182abda6e1451fe4719d8a825a0dcd33cf0339 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 10 Dec 2018 03:16:00 +0000 Subject: [PATCH 195/719] add cuda cudnn version check test=develop --- paddle/fluid/platform/device_context.cc | 34 +++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 146a205832..bd81d4dd1f 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -220,6 +220,40 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) LOG_FIRST_N(WARNING, 1) << "device: " << place_.device << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." << (cudnn_dso_ver % 100) / 10 << "."; + + { + // Check CUDA/CUDNN version compatiblity + auto local_cuda_version = runtime_version_ / 100; + auto compile_cuda_version = CUDA_VERSION / 100; + if (local_cuda_version < compile_cuda_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place_.device + << ". The installed Paddle is compiled with CUDA " + << compile_cuda_version / 10 << "." << compile_cuda_version % 10 + << ", but CUDA runtime version in your machine is " + << local_cuda_version / 10 << "." << local_cuda_version % 10 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible CUDA " + "version."; + } + + if (dynload::HasCUDNN()) { + auto local_cudnn_version = cudnn_dso_ver / 100; + auto compile_cudnn_version = CUDNN_VERSION / 100; + if (local_cuda_version < compile_cuda_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place_.device + << ". The installed Paddle is compiled with CUDNN " + << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10 + << ", but CUDNN version in your machine is " + << local_cudnn_version / 10 << "." << local_cudnn_version % 10 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible CUDNN " + "version."; + } + } + } + callback_manager_.reset(new StreamCallbackManager(stream_)); } From 57557f677476d75a7b251081e97606499255a0c7 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 10 Dec 2018 11:33:00 +0800 Subject: [PATCH 196/719] fix scope in nce and prefetch --- .../operators/distributed/parameter_prefetch.cc | 13 ++++++------- paddle/fluid/operators/nce_op.h | 13 ++++--------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index f6a2d5bbe5..4cdeae81a1 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -102,8 +102,7 @@ static void MergeMultipleVarsIntoOneBySection( const std::string& out_name, const std::vector& out_var_names, const std::vector& height_section, const std::vector>& splited_ids, - const framework::ExecutionContext& context, - const framework::Scope& actual_scope, framework::Scope* scope, + const framework::ExecutionContext& context, framework::Scope* scope, platform::DeviceContext* actual_ctx) { PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), ""); @@ -115,9 +114,9 @@ static void MergeMultipleVarsIntoOneBySection( id_to_offset[ids_vector[i]].push_back(i); } - auto& id_tensor = actual_scope.FindVar(id_name)->Get(); + auto& id_tensor = scope.FindVar(id_name)->Get(); auto* out_tensor = - actual_scope.FindVar(out_name)->GetMutable(); + scope.FindVar(out_name)->GetMutable(); auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); bool is_on_cpu_place = true; @@ -175,7 +174,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope) { - auto& local_scope = context.scope().NewScope(); + auto& local_scope = scope.NewScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -247,8 +246,8 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, - context, scope, &local_scope, &actual_ctx); - context.scope().DeleteScope(&local_scope); + context, &local_scope, &actual_ctx); + scope.DeleteScope(&local_scope); } }; // namespace distributed diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 2e51c67401..862064be18 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -170,7 +170,7 @@ class NCEKernel : public framework::OpKernel { auto height_sections = context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - auto *ids = local_scope.Var("Ids"); + auto *ids = local_scope.Var("Ids@Local"); auto *x_tensor = ids->GetMutable(); x_tensor->mutable_data( framework::make_ddim({static_cast(labels.size()), 1}), @@ -179,12 +179,10 @@ class NCEKernel : public framework::OpKernel { std::memcpy(x_tensor->data(), labels.data(), labels.size() * sizeof(int64_t)); - local_scope.Var("Weight@Local") - ->GetMutable() - ->mutable_data(context.GetPlace()); + local_scope.Var("Weight@Local"); #ifdef PADDLE_WITH_DISTRIBUTE - operators::distributed::prefetch("Ids", "Weight@Local", table_names, + operators::distributed::prefetch("Ids@Local", "Weight@Local", table_names, epmap, height_sections, context, &local_scope); #else @@ -207,10 +205,7 @@ class NCEKernel : public framework::OpKernel { sample_out_data[i] += result(0); sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); } - - if (context.scope().HasKid(&local_scope)) { - context.scope().DeleteScope(&local_scope); - } + context.scope().DeleteScope(&local_scope); } else { auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); From 3a3b5e50085d5384f97e0112ab0bc0c45702fdc4 Mon Sep 17 00:00:00 2001 From: Cheerego <35982308+shanyi15@users.noreply.github.com> Date: Mon, 10 Dec 2018 11:55:10 +0800 Subject: [PATCH 197/719] update Readme to 1.2 test=develop --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 56d6c10c64..c535e9514e 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. -### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1) +### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) ### Install Latest Stable Release: ``` # Linux CPU @@ -27,9 +27,9 @@ pip install paddlepaddle # Linux GPU cuda9cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==1.1.0.post87 +pip install paddlepaddle-gpu==1.2.0.post87 # Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==1.1.0.post85 +pip install paddlepaddle-gpu==1.2.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` @@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.1.0.post85 ## Installation -It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website. +It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. ## Documentation -We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and -[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation. +We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and +[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation. - [Deep Learning 101](https://github.com/PaddlePaddle/book) You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html) +- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) You can run distributed training jobs on MPI clusters. -- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html) +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) Our new API enables much shorter programs. -- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) We appreciate your contributions! From c9a653820bc2bfaa0a47b67916a445ceaa7abdad Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 10 Dec 2018 12:20:34 +0800 Subject: [PATCH 198/719] fix label_pos ,add test_layers.py, test=develop --- paddle/fluid/operators/bpr_loss_op.cc | 35 +++++++++---------- paddle/fluid/operators/bpr_loss_op.h | 18 +++++----- python/paddle/fluid/layers/nn.py | 17 ++++++--- .../fluid/tests/unittests/test_bpr_loss_op.py | 9 +++-- .../fluid/tests/unittests/test_layers.py | 9 +++++ 5 files changed, 51 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index 075b1b2c76..9258d7c7e8 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -23,18 +23,17 @@ class BprLossOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); - PADDLE_ENFORCE(ctx->HasInput("LabelPos"), - "Input(LabelPos) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto label_pos_dims = ctx->GetInputDim("LabelPos"); + auto label_dims = ctx->GetInputDim("Label"); int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, label_pos_dims.size(), - "Input(X) and Input(LabelPos) shall have the same rank."); + PADDLE_ENFORCE_EQ(rank, label_dims.size(), + "Input(X) and Input(Label) shall have the same rank."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), - framework::slice_ddim(label_pos_dims, 0, rank - 1), - "Input(X) and Input(LabelPos) shall have the same shape " + framework::slice_ddim(label_dims, 0, rank - 1), + "Input(X) and Input(Label) shall have the same shape " "except the last dimension."); auto y_dims = x_dims; @@ -60,25 +59,23 @@ class BprLossGradientOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); - PADDLE_ENFORCE(ctx->HasInput("LabelPos"), - "Input(LabelPos) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "Input(Y@GRAD) shoudl be not null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Output(X@GRAD) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto label_pos_dims = ctx->GetInputDim("LabelPos"); + auto label_dims = ctx->GetInputDim("Label"); auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); int rank = x_dims.size(); PADDLE_ENFORCE_EQ(dy_dims.size(), rank, "Input(Y@Grad) and Input(X) should have the same rank."); - PADDLE_ENFORCE_EQ( - label_pos_dims.size(), rank, - "Input(LabelPos) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ(label_dims.size(), rank, + "Input(Label) and Input(X) should have the same rank."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), - framework::slice_ddim(label_pos_dims, 0, rank - 1), - "The Input(X) and Input(LabelPos) should have the same " + framework::slice_ddim(label_dims, 0, rank - 1), + "The Input(X) and Input(Label) should have the same " "shape except the last dimension."); PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), framework::slice_ddim(dy_dims, 0, rank - 1), @@ -86,8 +83,8 @@ class BprLossGradientOp : public framework::OperatorWithKernel { "shape except the last dimension."); PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, "The last dimension of Input(Y@Grad) should be 1."); - PADDLE_ENFORCE_EQ(label_pos_dims[rank - 1], 1, - " the last dimension of Input(LabelPos) should be 1."); + PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1, + " the last dimension of Input(Label) should be 1."); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->ShareLoD("X", framework::GradVarName("X")); } @@ -111,7 +108,7 @@ class BprLossOpMaker : public framework::OpProtoAndCheckerMaker { "size is equal to the number of classes. This input is a " "real number."); AddInput( - "LabelPos", + "Label", "(Tensor), the tensor which represents the ground truth. It has the " "same shape with 'X' except the last dimension. the last dimension " "size is 1."); @@ -122,7 +119,7 @@ class BprLossOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Bayesian Personalized Ranking Loss Operator. -This operator belongs to pairwise ranking loss. LabelPos is the desired item. +This operator belongs to pairwise ranking loss. Label is the desired item. The loss at a given point in one session is defined as: $Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$ diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index ab68165942..e223be7af8 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -41,17 +41,17 @@ class BprLossOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); - auto* label_pos = ctx.Input("LabelPos"); + auto* label = ctx.Input("Label"); auto* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); int rank = x->dims().size(); Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); - Tensor labels_Pos_2d = framework::ReshapeToMatrix(*label_pos, rank - 1); + Tensor labels_2d = framework::ReshapeToMatrix(*label, rank - 1); Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1); const framework::Tensor* logits = &x_2d; - const framework::Tensor* labels_pos = &labels_Pos_2d; + const framework::Tensor* labels = &labels_2d; framework::Tensor* out = &y_2d; const int step_size = logits->dims()[0]; @@ -59,9 +59,9 @@ class BprLossOpKernel : public framework::OpKernel { const T* logits_data = logits->data(); T* loss_data = out->data(); - const int64_t* label_pos_data = labels_pos->data(); + const int64_t* label_data = labels->data(); for (int i = 0; i < step_size; ++i) { - int lbl_pos = label_pos_data[i]; + int lbl_pos = label_data[i]; PADDLE_ENFORCE_GE(lbl_pos, 0); PADDLE_ENFORCE_LT(lbl_pos, class_num); int index_pos = i * class_num + lbl_pos; @@ -84,7 +84,7 @@ class BprLossGradientOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* dy = ctx.Input(framework::GradVarName("Y")); - auto* label_pos = ctx.Input("LabelPos"); + auto* label = ctx.Input("Label"); auto* dx = ctx.Output(framework::GradVarName("X")); const int step_size = x->dims()[0]; @@ -92,16 +92,16 @@ class BprLossGradientOpKernel : public framework::OpKernel { T* dx_data = dx->mutable_data(ctx.GetPlace()); const T* dy_data = dy->data(); const T* x_data = x->data(); - const int64_t* label_pos_data = label_pos->data(); + const int64_t* label_data = label->data(); for (size_t sample_id = 0; sample_id < step_size; sample_id++) { for (size_t x_offset = sample_id * num_classes; x_offset < (sample_id + 1) * num_classes; x_offset++) { dx_data[x_offset] = static_cast(0); } - auto p_index = sample_id * num_classes + label_pos_data[sample_id]; + auto p_index = sample_id * num_classes + label_data[sample_id]; for (size_t ni = 0; ni < num_classes; ni++) { - if (label_pos_data[sample_id] == ni) continue; + if (label_data[sample_id] == ni) continue; auto n_index = sample_id * num_classes + ni; auto grad_ = -dy_data[sample_id] / ((num_classes - 1) * diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9233fe130e..04582acf6b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1349,21 +1349,30 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): return out -def bpr_loss(input, label_pos): +def bpr_loss(input, label): """ Bayesian Personalized Ranking Loss Operator. - This operator belongs to pairwise ranking loss. LabelPos is the desired item. + This operator belongs to pairwise ranking loss. Label is the desired item. The loss at a given point in one session is defined as: $Y[i] = -\frac{1}{N_{i}-1} * \sum_{0\le j(https://arxiv.org/abs/1511.06939) + Args: + input (Variable|list): a 2-D tensor with shape [N x D], where N is the + batch size and D is the number of classes. + This input is not probability but logits. + label (Variable|list): the ground truth which is a 2-D tensor. `label` + is a tensor with shape [N x 1]. + Returns: + A 2-D tensor with shape [N x 1], the bpr loss. + Examples: .. code-block:: python - cost = fluid.layers.bpr_loss(input=predict, label_pos=label) + cost = fluid.layers.bpr_loss(input=predict, label=label) """ helper = LayerHelper('bpr_loss', **locals()) @@ -1371,7 +1380,7 @@ def bpr_loss(input, label_pos): helper.append_op( type='bpr_loss', inputs={'X': [input], - 'LabelPos': [label_pos]}, + 'Label': [label]}, outputs={'Y': [out]}) return out diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py index 80916f4a82..c8dc5fbd23 100644 --- a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py @@ -28,18 +28,17 @@ class TestBprLossOp1(OpTest): batch_size = 40 class_num = 5 X = randomize_probability(batch_size, class_num, dtype='float64') - label_pos = np.random.randint( - 0, class_num, (batch_size, 1), dtype="int64") + label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64") bpr_loss_result = [] for i in range(batch_size): sum = 0.0 for j in range(class_num): - if j == label_pos[i][0]: + if j == label[i][0]: continue - sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label_pos[i][0]]))) + sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label[i][0]]))) bpr_loss_result.append(-sum / (class_num - 1)) bpr_loss = np.asmatrix([[x] for x in bpr_loss_result], dtype="float64") - self.inputs = {"X": X, "LabelPos": label_pos} + self.inputs = {"X": X, "Label": label} self.outputs = {"Y": bpr_loss} def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index be51fb06a3..10e8bb5a86 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -846,6 +846,15 @@ class TestBook(unittest.TestCase): out = layers.cross_entropy(x, label, False, 4) self.assertIsNotNone(out) + def test_bpr_loss(self): + program = Program() + with program_guard(program): + x = layers.data(name="x", shape=[30, 10], dtype="float32") + label = layers.data(name="label", shape=[30, 1], dtype="int32") + out = layers.bpr_loss(x, label) + self.assertIsNotNone(out) + print(str(program)) + def test_expand(self): program = Program() with program_guard(program): From 271c4808220eff0d8d5d4c9456386c922cd5f2d6 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 10 Dec 2018 12:33:48 +0800 Subject: [PATCH 199/719] update API, test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index a61b93af35..3a422dcb33 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -66,7 +66,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) -paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label_pos'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)) From 019e8bbed2e5045778a95962d76fcd2c39044cc3 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Mon, 10 Dec 2018 05:47:39 +0000 Subject: [PATCH 200/719] fix comments test=develop --- paddle/fluid/inference/tensorrt/convert/pool2d_op.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 768318fb06..1d0d83d1f3 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -109,6 +109,10 @@ class Pool2dOpConverter : public OpConverter { } if (pool_type == "max") { + // Under ceil mode, the pre_pad and post_pad are used to + // record the the padding size. In some ceil mode cases, + // we do not need padding, so we initialize the two vars to 0. + nvinfer1::DimsHW pre_pad(0, 0); nvinfer1::DimsHW post_pad(0, 0); if (ceil_mode) { From abf140289fbc84edd160851cb0f63de75dcf2965 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 10 Dec 2018 13:07:35 +0800 Subject: [PATCH 201/719] split selected rows op should always init output selected rows test=develop --- paddle/fluid/operators/split_selected_rows_op.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h index af64607faf..1fef2b3d37 100644 --- a/paddle/fluid/operators/split_selected_rows_op.h +++ b/paddle/fluid/operators/split_selected_rows_op.h @@ -72,10 +72,11 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { for (size_t i = 0; i < outs_rows_idx.size(); ++i) { auto rows_idx = outs_rows_idx[i]; outs[i]->set_height(height_sections[i]); + auto dims = x->GetCompleteDims(); + dims[0] = rows_idx.size(); + outs[i]->mutable_value()->mutable_data(dims, x->place()); + outs[i]->mutable_rows()->clear(); if (rows_idx.size() > 0) { - auto dims = x->GetCompleteDims(); - dims[0] = rows_idx.size(); - outs[i]->mutable_value()->mutable_data(dims, x->place()); for (auto idx : rows_idx) { outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); } @@ -98,6 +99,8 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { } } } + PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(), + "rows should has the same size with tensor dim 0"); } } }; From 90c7f9870e76a398f97d1d510c662a75d373881f Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 10 Dec 2018 14:36:52 +0800 Subject: [PATCH 202/719] fix 'name', test=develop --- paddle/fluid/API.spec | 2 +- python/paddle/fluid/layers/nn.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 3a422dcb33..fd4cf92d85 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -66,7 +66,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) -paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 04582acf6b..e25eaaa9fd 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1349,7 +1349,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): return out -def bpr_loss(input, label): +def bpr_loss(input, label, name=None): """ Bayesian Personalized Ranking Loss Operator. @@ -1366,6 +1366,8 @@ def bpr_loss(input, label): This input is not probability but logits. label (Variable|list): the ground truth which is a 2-D tensor. `label` is a tensor with shape [N x 1]. + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. Default: None. Returns: A 2-D tensor with shape [N x 1], the bpr loss. From 9623b45f40b3d382d4db6ee39daff04f1d9d33ab Mon Sep 17 00:00:00 2001 From: Brian Liu Date: Mon, 10 Dec 2018 14:37:24 +0800 Subject: [PATCH 203/719] Remove unnecessary MKLDNN reorder (#14799) When data flow from a MKLDNN OP kernel to a non-MKLDNN OP kernel, data layout transform (via MKLDNN reorder) will occur even when those two OP kernels share same layout. Add code to remove this unnecessary reorder. test=develop --- .../fluid/framework/data_layout_transform.cc | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index c9e3a8ac1d..5467f6d1b2 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -151,19 +151,22 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, auto out_format = platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); - void* in_data = GetDataFromTensor(in, in_type); - // output tensor has the same dims as input. Reorder don't change dims out->Resize(in.dims()); - auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); - - auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); - auto out_memory = - memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); + if (in_format != out_format) { + void* in_data = GetDataFromTensor(in, in_type); + auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); - platform::Reorder(in_memory, out_memory); + auto in_memory = + memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); + auto out_memory = + memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); + platform::Reorder(in_memory, out_memory); + } else { + out->ShareDataWith(in); + } out->set_layout(out_layout); // reset format since the out tensor will be feed to non-MKLDNN OPkernel out->set_format(memory::format::format_undef); From f5434507f081e02eb37f8f9da17165bd2298348a Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 10 Dec 2018 14:36:37 +0800 Subject: [PATCH 204/719] fix control_flow ops in outs test=develop --- python/paddle/fluid/framework.py | 29 ++++++++++++++++------ python/paddle/fluid/layers/control_flow.py | 14 ++++++----- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b991187d42..f8e3cd3a32 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1040,19 +1040,15 @@ class Block(object): raise ValueError("var %s not in this block" % name) return v - def _var_recursive(self, name): + def _find_var_recursive(self, name): """ Get a Variable by name from this block recursively. Args: name(str): the Variable's name. - Raises: - ValueError: this block and this parent block doesn't - have a Variable with the giving name. - Returns: - Variable: the Variable with the giving name. + Variable: the Variable with the giving name. Or None if not found. """ frontier = list() visited = set() @@ -1078,8 +1074,27 @@ class Block(object): frontier.append(prog.block(cur.forward_block_idx)) visited.add(id(cur)) + return None - raise ValueError("Var {0} is not found recursively".format(name)) + def _var_recursive(self, name): + """ + Get a Variable by name from this block recursively. + + Args: + name(str): the Variable's name. + + Raises: + ValueError: this block and this parent block doesn't + have a Variable with the giving name. + + Returns: + Variable: the Variable with the giving name. + """ + var = self._find_var_recursive(name) + if var: + return var + else: + raise ValueError("Var {0} is not found recursively".format(name)) def all_parameters(self): return list(self.iter_parameters()) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 05138bf945..b7e3968569 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -717,8 +717,9 @@ class While(object): out_vars = [] for inner_out_name in inner_outputs: - if inner_out_name in parent_block.vars: - out_vars.append(parent_block.var(inner_out_name)) + inner_var = parent_block._find_var_recursive(inner_out_name) + if inner_var: + out_vars.append(inner_var) step_scope = parent_block.create_var( type=core.VarDesc.VarType.STEP_SCOPES) @@ -1264,10 +1265,11 @@ class ConditionalBlock(object): if each_name not in input_set ] - out_list = [ - parent_block.var(var_name) for var_name in parent_block.vars - if var_name in intermediate - ] + out_list = [] + for inner_out_name in intermediate: + inner_var = parent_block._find_var_recursive(inner_out_name) + if inner_var: + out_list.append(inner_var) step_scope = parent_block.create_var( type=core.VarDesc.VarType.STEP_SCOPES) From edd1f5a92b4a96c560e80f477556ee5ef820ac2b Mon Sep 17 00:00:00 2001 From: superjomn Date: Mon, 10 Dec 2018 15:17:10 +0800 Subject: [PATCH 205/719] fix visualizer test=develop --- .../inference/analysis/passes/ir_graph_build_pass.cc | 7 ++++--- paddle/fluid/inference/utils/visualizer.cc | 10 +++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index b8a045c18f..c6e923c004 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -44,9 +44,10 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { argument->SetMainProgram(program.release()); } else if (argument->model_program_path_valid() && argument->model_params_path_valid()) { - auto program = - LoadModel(argument->model_program_path(), argument->model_params_path(), - argument->scope_ptr(), place, argument->model_from_memory()); + auto program = LoadModel( + argument->model_program_path(), argument->model_params_path(), + argument->scope_ptr(), place, + argument->model_from_memory_valid() && argument->model_from_memory()); argument->SetMainProgram(program.release()); } else { PADDLE_THROW( diff --git a/paddle/fluid/inference/utils/visualizer.cc b/paddle/fluid/inference/utils/visualizer.cc index 040b6476fb..7c0dd64dea 100644 --- a/paddle/fluid/inference/utils/visualizer.cc +++ b/paddle/fluid/inference/utils/visualizer.cc @@ -26,9 +26,6 @@ DEFINE_string(model_dir, "", "model directory"); DEFINE_string(model_program_path, "", "model program path"); DEFINE_string(model_params_path, "", "model params path"); -USE_PASS(graph_viz_pass); -USE_PASS(graph_to_program_pass); - using paddle::inference::analysis::Argument; namespace paddle { @@ -40,7 +37,6 @@ void Visualizer::SetArgument(Argument *argument) { argument_ = argument; } bool Visualizer::Run() { paddle::framework::InitDevices(false); paddle::inference::analysis::Analyzer().Run(argument_); - return true; } @@ -77,7 +73,7 @@ int main(int argc, char *argv[]) { // Only 1 pass, default filename is 0_ir_origin.dot // For more details, looking for paddle::inference::analysis::IRPassManager - argument.SetIrAnalysisPasses({"graph_viz_pass"}); + argument.SetIrAnalysisPasses({"infer_clean_graph_pass", "graph_viz_pass"}); std::unique_ptr scope{ new paddle::framework::Scope()}; @@ -90,3 +86,7 @@ int main(int argc, char *argv[]) { return 0; } + +USE_PASS(infer_clean_graph_pass); +USE_PASS(graph_viz_pass); +USE_PASS(graph_to_program_pass); From 554bcdbdfcbe68c799dd6de8a01ab0d2337f2975 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Mon, 10 Dec 2018 15:17:49 +0800 Subject: [PATCH 206/719] add more log for dist test for ci test=develop (#14813) * add more log for dist test for ci test=develop * increase deadline test=develop --- .../fluid/tests/unittests/test_dist_base.py | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 0a43f53658..26fa20291b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -378,6 +378,18 @@ class TestDistBase(unittest.TestCase): stderr=tr1_pipe, env=env1) + # Wait until trainer process terminate + while True: + stat0 = tr0_proc.poll() + time.sleep(0.1) + if stat0 is not None: + break + while True: + stat1 = tr1_proc.poll() + time.sleep(0.1) + if stat1 is not None: + break + tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() @@ -390,11 +402,21 @@ class TestDistBase(unittest.TestCase): ps0.terminate() ps1.terminate() + # print server log + with open("/tmp/ps0_err.log", "r") as fn: + sys.stderr.write("ps0 stderr: %s\n" % fn.read()) + with open("/tmp/ps1_err.log", "r") as fn: + sys.stderr.write("ps1 stderr: %s\n" % fn.read()) + # print log - sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out)) - sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) - sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out)) - sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + if stat0 == 0: + sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out)) + with open("/tmp/tr0_err.log", "r") as fn: + sys.stderr.write('trainer 0 stderr: %s\n' % fn.read()) + if stat1 == 0: + sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out)) + with open("/tmp/tr1_err.log", "r") as fn: + sys.stderr.write('trainer 1 stderr: %s\n' % fn.read()) return pickle.loads(tr0_out), pickle.loads(tr1_out) @@ -474,6 +496,7 @@ class TestDistBase(unittest.TestCase): "PYTHONPATH": os.getenv("PYTHONPATH", ""), "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast "FLAGS_cudnn_deterministic": "1", "http_proxy": "", "NCCL_P2P_DISABLE": "1" From 53709e7e619dbcf243a7421777c103e69c5012ee Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 6 Dec 2018 13:01:38 +0000 Subject: [PATCH 207/719] refine names --- paddle/fluid/operators/CMakeLists.txt | 4 +- .../{jitkernels => jit}/CMakeLists.txt | 6 +-- .../operators/{jitkernels => jit}/README.md | 2 +- .../jitcode => jit/gen}/CMakeLists.txt | 0 .../{jitkernels/jitcode => jit/gen}/blas.cc | 17 ++++---- .../{jitkernels/jitcode => jit/gen}/blas.h | 10 ++--- .../jitcode => jit/gen}/jitcode.cc | 10 ++--- .../{jitkernels/jitcode => jit/gen}/jitcode.h | 12 +++--- .../jitcode_base.cc => jit/gen_base.cc} | 8 ++-- .../jitcode_base.h => jit/gen_base.h} | 10 ++--- .../{jitkernels => jit}/kernel_base.h | 4 +- .../{jitkernels => jit}/kernel_key.h | 6 +-- .../{jitkernels => jit}/kernel_pool.cc | 6 +-- .../{jitkernels => jit}/kernel_pool.h | 20 ++++----- .../{jitkernels => jit}/more/CMakeLists.txt | 0 .../more/mkl/CMakeLists.txt | 0 .../{jitkernels => jit}/more/mkl/mkl.cc | 10 ++--- .../{jitkernels => jit}/more/mkl/mkl.h | 6 +-- .../operators/{jitkernels => jit}/more/more.h | 0 .../{jitkernels => jit}/refer/CMakeLists.txt | 0 .../{jitkernels => jit}/refer/refer.cc | 6 +-- .../{jitkernels => jit}/refer/refer.h | 6 +-- .../operators/{jitkernels => jit}/registry.h | 42 +++++++++---------- .../operators/{jitkernels => jit}/test.cc | 6 +-- 24 files changed, 96 insertions(+), 95 deletions(-) rename paddle/fluid/operators/{jitkernels => jit}/CMakeLists.txt (79%) rename paddle/fluid/operators/{jitkernels => jit}/README.md (98%) rename paddle/fluid/operators/{jitkernels/jitcode => jit/gen}/CMakeLists.txt (100%) rename paddle/fluid/operators/{jitkernels/jitcode => jit/gen}/blas.cc (90%) rename paddle/fluid/operators/{jitkernels/jitcode => jit/gen}/blas.h (93%) rename paddle/fluid/operators/{jitkernels/jitcode => jit/gen}/jitcode.cc (79%) rename paddle/fluid/operators/{jitkernels/jitcode => jit/gen}/jitcode.h (94%) rename paddle/fluid/operators/{jitkernels/jitcode_base.cc => jit/gen_base.cc} (88%) rename paddle/fluid/operators/{jitkernels/jitcode_base.h => jit/gen_base.h} (90%) rename paddle/fluid/operators/{jitkernels => jit}/kernel_base.h (96%) rename paddle/fluid/operators/{jitkernels => jit}/kernel_key.h (93%) rename paddle/fluid/operators/{jitkernels => jit}/kernel_pool.cc (90%) rename paddle/fluid/operators/{jitkernels => jit}/kernel_pool.h (89%) rename paddle/fluid/operators/{jitkernels => jit}/more/CMakeLists.txt (100%) rename paddle/fluid/operators/{jitkernels => jit}/more/mkl/CMakeLists.txt (100%) rename paddle/fluid/operators/{jitkernels => jit}/more/mkl/mkl.cc (84%) rename paddle/fluid/operators/{jitkernels => jit}/more/mkl/mkl.h (92%) rename paddle/fluid/operators/{jitkernels => jit}/more/more.h (100%) rename paddle/fluid/operators/{jitkernels => jit}/refer/CMakeLists.txt (100%) rename paddle/fluid/operators/{jitkernels => jit}/refer/refer.cc (81%) rename paddle/fluid/operators/{jitkernels => jit}/refer/refer.h (91%) rename paddle/fluid/operators/{jitkernels => jit}/registry.h (86%) rename paddle/fluid/operators/{jitkernels => jit}/test.cc (95%) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 2808133e84..16ef5c9524 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -16,7 +16,7 @@ add_subdirectory(metrics) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) -add_subdirectory(jitkernels) +add_subdirectory(jit) if(WITH_DISTRIBUTE) add_subdirectory(distributed) @@ -68,7 +68,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_ten if (NOT WIN32) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) endif() -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) diff --git a/paddle/fluid/operators/jitkernels/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt similarity index 79% rename from paddle/fluid/operators/jitkernels/CMakeLists.txt rename to paddle/fluid/operators/jit/CMakeLists.txt index f6bb3e0712..77fd27666f 100644 --- a/paddle/fluid/operators/jitkernels/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -14,8 +14,8 @@ cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) add_subdirectory(refer) add_subdirectory(more) if(WITH_XBYAK) - add_subdirectory(jitcode) + add_subdirectory(gen) endif() -cc_library(jit_kernel SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) -cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel) +cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) +cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) diff --git a/paddle/fluid/operators/jitkernels/README.md b/paddle/fluid/operators/jit/README.md similarity index 98% rename from paddle/fluid/operators/jitkernels/README.md rename to paddle/fluid/operators/jit/README.md index fd6428b43e..12158bf9d0 100644 --- a/paddle/fluid/operators/jitkernels/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -13,7 +13,7 @@ PaddlePaddle/Paddle/paddle/fluid/ │ ├── .../ └── jit/ ├── ... - ├── jitcode/ + ├── gen/ │ └── ... |── more/ │ ├── ... diff --git a/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt similarity index 100% rename from paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt rename to paddle/fluid/operators/jit/gen/CMakeLists.txt diff --git a/paddle/fluid/operators/jitkernels/jitcode/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc similarity index 90% rename from paddle/fluid/operators/jitkernels/jitcode/blas.cc rename to paddle/fluid/operators/jit/gen/blas.cc index 2691bee0fd..4a8b4554c8 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -11,13 +11,14 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/jitkernels/jitcode/blas.h" -#include "paddle/fluid/operators/jitkernels/registry.h" + +#include "paddle/fluid/operators/jit/gen/blas.h" +#include "paddle/fluid/operators/jit/registry.h" namespace paddle { namespace operators { -namespace jitkernels { -namespace jitcode { +namespace jit { +namespace gen { void VXXJitCode::genCode() { // do not need push stack, and do not need save avx512reg if do not use avx512 @@ -102,17 +103,17 @@ void VXXJitCode::genCode() { ret(); } -} // namespace jitcode +} // namespace gen template <> -std::unique_ptr CreateJitCode(int attr) { +std::unique_ptr CreateJitCode(int attr) { if (UseJitCode(attr)) { - return make_unique( + return make_unique( attr, CodeSize(attr)); } return nullptr; } -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode/blas.h b/paddle/fluid/operators/jit/gen/blas.h similarity index 93% rename from paddle/fluid/operators/jitkernels/jitcode/blas.h rename to paddle/fluid/operators/jit/gen/blas.h index a1aca97723..edc05f86a0 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -15,12 +15,12 @@ #pragma once #include -#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" namespace paddle { namespace operators { -namespace jitkernels { -namespace jitcode { +namespace jit { +namespace gen { // function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) class VXXJitCode : public JitCode { @@ -82,7 +82,7 @@ class VMulJitCode : public VXXJitCode { : VXXJitCode(d, operand_type::mul, 0, false, code_size, code_ptr) {} }; -} // namespace jitcode -} // namespace jitkernels +} // namespace gen +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc b/paddle/fluid/operators/jit/gen/jitcode.cc similarity index 79% rename from paddle/fluid/operators/jitkernels/jitcode/jitcode.cc rename to paddle/fluid/operators/jit/gen/jitcode.cc index 8078ace7a8..93204d340e 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/jitcode.cc +++ b/paddle/fluid/operators/jit/gen/jitcode.cc @@ -12,11 +12,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { template <> size_t GetKey(int d) { @@ -24,15 +24,15 @@ size_t GetKey(int d) { } // template <> -// std::shared_ptr CreateJitCode(int attr) +// std::shared_ptr CreateJitCode(int attr) // { // if (UseJitCode(attr)) { -// return std::make_shared>(attr, +// return std::make_shared>(attr, // CodeSize(attr))); // } // return nullptr; // } -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h similarity index 94% rename from paddle/fluid/operators/jitkernels/jitcode/jitcode.h rename to paddle/fluid/operators/jit/gen/jitcode.h index 03c2100ca0..52b8da9a82 100644 --- a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -15,7 +15,7 @@ #pragma once #include -#include "paddle/fluid/operators/jitkernels/jitcode_base.h" +#include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/platform/cpu_info.h" #define XBYAK_USE_MMAP_ALLOCATOR @@ -24,8 +24,8 @@ namespace paddle { namespace operators { -namespace jitkernels { -namespace jitcode { +namespace jit { +namespace gen { // Application Binary Interface constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI), @@ -67,7 +67,7 @@ typedef enum { #define DECLARE_JIT_CODE(codename) \ const char* name() const override { return #codename; } -class JitCode : public JitBase, public Xbyak::CodeGenerator { +class JitCode : public GenBase, public Xbyak::CodeGenerator { public: explicit JitCode(size_t code_size, void* code_ptr = nullptr) : Xbyak::CodeGenerator(code_size, code_ptr) { @@ -128,7 +128,7 @@ class JitCode : public JitBase, public Xbyak::CodeGenerator { } }; -} // namespace jitcode -} // namespace jitkernels +} // namespace gen +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.cc b/paddle/fluid/operators/jit/gen_base.cc similarity index 88% rename from paddle/fluid/operators/jitkernels/jitcode_base.cc rename to paddle/fluid/operators/jit/gen_base.cc index 1da2af51f4..310da0c76f 100644 --- a/paddle/fluid/operators/jitkernels/jitcode_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -12,7 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/jitkernels/jitcode_base.h" +#include "paddle/fluid/operators/jit/gen_base.h" #include #include #include @@ -21,10 +21,10 @@ DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { // refer do not need useme, it would be the last one. -void JitBase::dumpCode(const unsigned char* code) const { +void GenBase::dumpCode(const unsigned char* code) const { if (code) { static int counter = 0; std::ostringstream filename; @@ -38,6 +38,6 @@ void JitBase::dumpCode(const unsigned char* code) const { } } -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.h b/paddle/fluid/operators/jit/gen_base.h similarity index 90% rename from paddle/fluid/operators/jitkernels/jitcode_base.h rename to paddle/fluid/operators/jit/gen_base.h index de8aaf229f..4a136534dc 100644 --- a/paddle/fluid/operators/jitkernels/jitcode_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -16,14 +16,14 @@ #include #include // for shared_ptr -#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/macros.h" DECLARE_bool(dump_jitcode); namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { // TODO(TJ): make these functions as virtual of a class @@ -43,7 +43,7 @@ bool UseJitCode(Attr attr) { template size_t GetKey(Attr attr); -class JitBase : public Kernel { +class GenBase : public Kernel { public: virtual const char* name() const = 0; virtual const unsigned char* getCodeInternal() = 0; @@ -62,8 +62,8 @@ class JitBase : public Kernel { }; template -std::unique_ptr CreateJitCode(Attr attr); +std::unique_ptr CreateJitCode(Attr attr); -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h similarity index 96% rename from paddle/fluid/operators/jitkernels/kernel_base.h rename to paddle/fluid/operators/jit/kernel_base.h index 6fbb0f9f7e..6a789c52c3 100644 --- a/paddle/fluid/operators/jitkernels/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -17,7 +17,7 @@ namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { typedef enum { vmul = 0, vadd = 1, vsub, vexp } KernelType; @@ -54,6 +54,6 @@ class ReferKernel : public KernelImpl { bool UseMe(Attr attr) const override { return true; } }; -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_key.h b/paddle/fluid/operators/jit/kernel_key.h similarity index 93% rename from paddle/fluid/operators/jitkernels/kernel_key.h rename to paddle/fluid/operators/jit/kernel_key.h index e06c2b58da..af9df77337 100644 --- a/paddle/fluid/operators/jitkernels/kernel_key.h +++ b/paddle/fluid/operators/jit/kernel_key.h @@ -13,12 +13,12 @@ * limitations under the License. */ #pragma once -#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { struct KernelKey { struct Hash { @@ -44,6 +44,6 @@ struct KernelKey { bool operator!=(const KernelKey& o) const { return !(*this == o); } }; -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_pool.cc b/paddle/fluid/operators/jit/kernel_pool.cc similarity index 90% rename from paddle/fluid/operators/jitkernels/kernel_pool.cc rename to paddle/fluid/operators/jit/kernel_pool.cc index 9bb0ba349b..f300d28a6f 100644 --- a/paddle/fluid/operators/jitkernels/kernel_pool.cc +++ b/paddle/fluid/operators/jit/kernel_pool.cc @@ -12,14 +12,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/jitkernels/kernel_pool.h" +#include "paddle/fluid/operators/jit/kernel_pool.h" #include // for shared_ptr #include #include namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { KernelPool& KernelPool::Instance() { static KernelPool g_kernel_pool; @@ -31,6 +31,6 @@ ReferKernelPool& ReferKernelPool::Instance() { return g_refer_kernel_pool; } -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h similarity index 89% rename from paddle/fluid/operators/jitkernels/kernel_pool.h rename to paddle/fluid/operators/jit/kernel_pool.h index 901a891cb3..737b7f60e3 100644 --- a/paddle/fluid/operators/jitkernels/kernel_pool.h +++ b/paddle/fluid/operators/jit/kernel_pool.h @@ -18,19 +18,19 @@ #include #include #include -#include "paddle/fluid/operators/jitkernels/jitcode_base.h" -#include "paddle/fluid/operators/jitkernels/kernel_base.h" -#include "paddle/fluid/operators/jitkernels/kernel_key.h" +#include "paddle/fluid/operators/jit/gen_base.h" +#include "paddle/fluid/operators/jit/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_key.h" #include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { template class JitCodePool { - typedef std::unique_ptr JitBasePtr; - typedef std::unordered_map JitBaseMap; + typedef std::unique_ptr GenBasePtr; + typedef std::unordered_map JitCodeMap; public: JitCodePool() = default; @@ -39,16 +39,16 @@ class JitCodePool { return g_jit_codes; } - const JitBaseMap& AllKernels() { return codes_; } + const JitCodeMap& AllKernels() { return codes_; } bool Has(size_t key) const { return codes_.find(key) != codes_.end(); } - void Insert(size_t key, JitBasePtr value) { + void Insert(size_t key, GenBasePtr value) { codes_.emplace(key, std::move(value)); } private: - JitBaseMap codes_; + JitCodeMap codes_; DISABLE_COPY_AND_ASSIGN(JitCodePool); }; @@ -146,6 +146,6 @@ const Func Get(Attr attr) { return GetRefer(); } -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/more/CMakeLists.txt b/paddle/fluid/operators/jit/more/CMakeLists.txt similarity index 100% rename from paddle/fluid/operators/jitkernels/more/CMakeLists.txt rename to paddle/fluid/operators/jit/more/CMakeLists.txt diff --git a/paddle/fluid/operators/jitkernels/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt similarity index 100% rename from paddle/fluid/operators/jitkernels/more/mkl/CMakeLists.txt rename to paddle/fluid/operators/jit/more/mkl/CMakeLists.txt diff --git a/paddle/fluid/operators/jitkernels/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc similarity index 84% rename from paddle/fluid/operators/jitkernels/more/mkl/mkl.cc rename to paddle/fluid/operators/jit/more/mkl/mkl.cc index 88a7d66194..0ffe1d565f 100644 --- a/paddle/fluid/operators/jitkernels/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -12,13 +12,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/jitkernels/more/mkl/mkl.h" -#include "paddle/fluid/operators/jitkernels/registry.h" +#include "paddle/fluid/operators/jit/more/mkl/mkl.h" +#include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/dynload/mklml.h" namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { namespace more { namespace mkl { @@ -34,11 +34,11 @@ void VMul(const double* x, const double* y, double* z, int n) { } // namespace mkl } // namespace more -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle -namespace mkl = paddle::operators::jitkernels::more::mkl; +namespace mkl = paddle::operators::jit::more::mkl; REGISTER_JITKERNEL_MORE(vmul, mkl, mkl::VMulKernel, mkl::VMulKernel); diff --git a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h similarity index 92% rename from paddle/fluid/operators/jitkernels/more/mkl/mkl.h rename to paddle/fluid/operators/jit/more/mkl/mkl.h index 9cf032db43..45cfec1c47 100644 --- a/paddle/fluid/operators/jitkernels/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -15,12 +15,12 @@ #pragma once #include -#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { namespace more { namespace mkl { @@ -43,6 +43,6 @@ class VMulKernel : public KernelImpl::func_type, } // namespace mkl } // namespace more -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/more/more.h b/paddle/fluid/operators/jit/more/more.h similarity index 100% rename from paddle/fluid/operators/jitkernels/more/more.h rename to paddle/fluid/operators/jit/more/more.h diff --git a/paddle/fluid/operators/jitkernels/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt similarity index 100% rename from paddle/fluid/operators/jitkernels/refer/CMakeLists.txt rename to paddle/fluid/operators/jit/refer/CMakeLists.txt diff --git a/paddle/fluid/operators/jitkernels/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc similarity index 81% rename from paddle/fluid/operators/jitkernels/refer/refer.cc rename to paddle/fluid/operators/jit/refer/refer.cc index dbccac896c..a987b5fca0 100644 --- a/paddle/fluid/operators/jitkernels/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -12,10 +12,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/jitkernels/refer/refer.h" -#include "paddle/fluid/operators/jitkernels/registry.h" +#include "paddle/fluid/operators/jit/refer/refer.h" +#include "paddle/fluid/operators/jit/registry.h" -namespace refer = paddle::operators::jitkernels::refer; +namespace refer = paddle::operators::jit::refer; REGISTER_JITKERNEL_REFER(vmul, refer::VMulKernel, refer::VMulKernel); diff --git a/paddle/fluid/operators/jitkernels/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h similarity index 91% rename from paddle/fluid/operators/jitkernels/refer/refer.h rename to paddle/fluid/operators/jit/refer/refer.h index 796f58d401..76a663633d 100644 --- a/paddle/fluid/operators/jitkernels/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -13,12 +13,12 @@ * limitations under the License. */ #pragma once -#include "paddle/fluid/operators/jitkernels/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { namespace refer { template @@ -36,6 +36,6 @@ class VMulKernel : public ReferKernel::func_type, }; } // namespace refer -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/registry.h b/paddle/fluid/operators/jit/registry.h similarity index 86% rename from paddle/fluid/operators/jitkernels/registry.h rename to paddle/fluid/operators/jit/registry.h index 6d817461be..c1f02d9cd5 100644 --- a/paddle/fluid/operators/jitkernels/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -17,14 +17,14 @@ #include #include #include -#include "paddle/fluid/operators/jitkernels/kernel_base.h" -#include "paddle/fluid/operators/jitkernels/kernel_pool.h" +#include "paddle/fluid/operators/jit/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_pool.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/variant.h" // for UNUSED namespace paddle { namespace operators { -namespace jitkernels { +namespace jit { // make_unique is supported since c++14 template @@ -76,21 +76,21 @@ class JitKernelRegistrar { msg) // Refer always on CPUPlace -#define REGISTER_JITKERNEL_REFER(kernel_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace, \ - "REGISTER_KERNEL_REFER must be called in global namespace"); \ - static ::paddle::operators::jitkernels::JitKernelRegistrar< \ - ::paddle::operators::jitkernels::ReferKernelPool, \ - ::paddle::platform::CPUPlace, __VA_ARGS__> \ - __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \ - ::paddle::operators::jitkernels::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ - __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \ - return 0; \ +#define REGISTER_JITKERNEL_REFER(kernel_type, ...) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_##kernel_type##_refer_CPUPlace, \ + "REGISTER_KERNEL_REFER must be called in global namespace"); \ + static ::paddle::operators::jit::JitKernelRegistrar< \ + ::paddle::operators::jit::ReferKernelPool, ::paddle::platform::CPUPlace, \ + __VA_ARGS__> \ + __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \ + ::paddle::operators::jit::KernelType::kernel_type); \ + int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ + __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \ + return 0; \ } -// kernel_type: should be in paddle::operators::jitkernels::KernelType +// kernel_type: should be in paddle::operators::jit::KernelType // place_type: should be one of CPUPlace and GPUPlace in paddle::platform #define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \ STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ @@ -99,11 +99,11 @@ class JitKernelRegistrar { extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \ UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static ::paddle::operators::jitkernels::JitKernelRegistrar< \ - ::paddle::operators::jitkernels::KernelPool, \ - ::paddle::platform::place_type, __VA_ARGS__> \ + static ::paddle::operators::jit::JitKernelRegistrar< \ + ::paddle::operators::jit::KernelPool, ::paddle::platform::place_type, \ + __VA_ARGS__> \ __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_( \ - ::paddle::operators::jitkernels::KernelType::kernel_type); \ + ::paddle::operators::jit::KernelType::kernel_type); \ int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_ \ .Touch(); \ @@ -139,6 +139,6 @@ class JitKernelRegistrar { #define USE_JITKERNEL_MORE(kernel_type, impl_type) \ USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace) -} // namespace jitkernels +} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/test.cc b/paddle/fluid/operators/jit/test.cc similarity index 95% rename from paddle/fluid/operators/jitkernels/test.cc rename to paddle/fluid/operators/jit/test.cc index d27b5d1cba..836b6eee80 100644 --- a/paddle/fluid/operators/jitkernels/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -19,9 +19,9 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/operators/jitkernels/kernel_pool.h" +#include "paddle/fluid/operators/jit/kernel_pool.h" // TODO(TJ): remove me -#include "paddle/fluid/operators/jitkernels/registry.h" +#include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/port.h" @@ -66,7 +66,7 @@ TEST(JitKernel, vmul) { using T = float; using PlaceType = paddle::platform::CPUPlace; - namespace jit = paddle::operators::jitkernels; + namespace jit = paddle::operators::jit; // TODO(TJ): test more vector size for (int d = 1; d < 30; ++d) { auto ref = jit::GetRefer::func_type, From afc51e6f8272aef314e3bd791195fc887687aacf Mon Sep 17 00:00:00 2001 From: nhzlx Date: Mon, 10 Dec 2018 07:37:01 +0000 Subject: [PATCH 208/719] add benchmark for trt --- paddle/fluid/inference/tests/api/CMakeLists.txt | 3 ++- paddle/fluid/inference/tests/api/tester_helper.h | 16 ++++++++++++++-- .../inference/tests/api/trt_models_tester.cc | 3 +++ paddle/fluid/inference/utils/benchmark.cc | 2 +- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index a07626a103..6901aac3c3 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,7 +1,8 @@ set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) if(WITH_GPU AND TENSORRT_FOUND) - set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor) + set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} +ir_pass_manager analysis_predictor benchmark) endif() function(download_model install_dir model_name) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index d572ea0177..8209a049f4 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -30,8 +30,10 @@ #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/test_helper.h" +#include "paddle/fluid/inference/utils/benchmark.h" #include "paddle/fluid/platform/profiler.h" +DEFINE_string(model_name, "", "model name"); DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_data, "", "data file"); DEFINE_int32(batch_size, 1, "batch size."); @@ -40,6 +42,8 @@ DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); DEFINE_bool(use_analysis, true, "Running the inference program in analysis mode."); +DEFINE_bool(record_benchmark, false, + "Record benchmark after profiling the model"); DECLARE_bool(profile); DECLARE_int32(paddle_num_threads); @@ -192,8 +196,16 @@ void TestOneThreadPrediction( predictor->Run(inputs[j], outputs, batch_size); } } - PrintTime(batch_size, num_times, 1, 0, run_timer.toc() / num_times, - inputs.size()); + + double latency = run_timer.toc() / num_times; + PrintTime(batch_size, num_times, 1, 0, latency, inputs.size()); + if (FLAGS_record_benchmark) { + Benchmark benchmark; + benchmark.SetName(FLAGS_model_name); + benchmark.SetBatchSize(batch_size); + benchmark.SetLatency(latency); + benchmark.PersistToFile("benchmark_record.txt"); + } } } diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index ef612ce614..9eb3fb5da1 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -135,6 +135,9 @@ TEST(TensorRT_resnext50, compare) { TEST(TensorRT_resnext50, profile) { std::string model_dir = FLAGS_infer_model + "/resnext50"; + // Set FLAGS_record_benchmark to true to record benchmark to file. + // FLAGS_record_benchmark=true; + FLAGS_model_name = "resnext50"; profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); } diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc index d03aa11b75..0bd526bcac 100644 --- a/paddle/fluid/inference/utils/benchmark.cc +++ b/paddle/fluid/inference/utils/benchmark.cc @@ -30,7 +30,7 @@ std::string Benchmark::SerializeToString() const { ss << '\n'; ss << name_ << "\t"; - ss << batch_size_ << "\t"; + ss << batch_size_ << "\t\t"; ss << num_threads_ << "\t"; ss << latency_ << "\t"; ss << 1000.0 / latency_; From b22d638d8fa9168a49dccffef379218eb8c85c92 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 10 Dec 2018 16:18:06 +0800 Subject: [PATCH 209/719] Speed up SizeOfType test=develop --- paddle/fluid/framework/data_type.cc | 15 ++-- paddle/fluid/framework/parallel_executor.cc | 10 +-- paddle/fluid/platform/enforce.h | 80 +++++++++++---------- python/paddle/fluid/__init__.py | 3 +- 4 files changed, 57 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 28f3da88fa..1c29a89bff 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/data_type.h" #include +#include #include #include @@ -23,10 +24,10 @@ namespace paddle { namespace framework { struct DataTypeMap { - std::unordered_map cpp_to_proto_; + std::map cpp_to_proto_; std::unordered_map proto_to_cpp_; std::unordered_map proto_to_str_; - std::unordered_map cpp_to_size_; + std::map cpp_to_size_; }; static DataTypeMap* InitDataTypeMap(); @@ -43,9 +44,9 @@ static inline void RegisterType(DataTypeMap* map, proto::VarType::Type proto_type, const std::string& name) { map->proto_to_cpp_.emplace(static_cast(proto_type), typeid(T)); - map->cpp_to_proto_.emplace(typeid(T), proto_type); + map->cpp_to_proto_.emplace(typeid(T).name(), proto_type); map->proto_to_str_.emplace(static_cast(proto_type), name); - map->cpp_to_size_.emplace(typeid(T), sizeof(T)); + map->cpp_to_size_.emplace(typeid(T).name(), sizeof(T)); } static DataTypeMap* InitDataTypeMap() { @@ -71,7 +72,7 @@ static DataTypeMap* InitDataTypeMap() { } proto::VarType::Type ToDataType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_proto_.find(type); + auto it = gDataTypeMap().cpp_to_proto_.find(type.name()); if (it != gDataTypeMap().cpp_to_proto_.end()) { return it->second; } @@ -97,8 +98,8 @@ std::string DataTypeToString(const proto::VarType::Type type) { } size_t SizeOfType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_size_.find(type); - if (it != gDataTypeMap().cpp_to_size_.end()) { + auto it = gDataTypeMap().cpp_to_size_.find(type.name()); + if (LIKELY(it != gDataTypeMap().cpp_to_size_.end())) { return it->second; } PADDLE_THROW("Not support %s as tensor type", type.name()); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 9355bb572b..0636b89048 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -31,9 +31,9 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #ifdef WITH_GPERFTOOLS -#include "google/gperftools.h" +#include "gperftools/profiler.h" #endif -DEFINE_string(PEProfileFName, "", +DEFINE_string(pe_profile_fname, "", "Profiler filename for PE, which generated by gperftools." "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); @@ -45,14 +45,14 @@ class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) : places_(places) { - if (!FLAGS_PEProfileFName.empty()) { + if (!FLAGS_pe_profile_fname.empty()) { std::call_once(gProfileOnce, [] { #ifdef WITH_GPERFTOOLS - ProfilerStart(FLAGS_PEProfileFName.c_str()); + ProfilerStart(FLAGS_pe_profile_fname.c_str()); gProfileStarted = true; #else LOG(WARNING) << "Paddle is not compiled with gperftools. " - "FLAGS_PEProfileFName will be ignored"; + "FLAGS_pe_profile_fname will be ignored"; #endif }); } diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a85972bdb7..01ee67fd07 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -62,45 +62,54 @@ inline std::string demangle(std::string name) { return name; } #endif struct EnforceNotMet : public std::exception { - std::exception_ptr exp_; std::string err_str_; - EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) { - static constexpr int TRACE_STACK_LIMIT = 100; + EnforceNotMet(std::exception_ptr e, const char* f, int l) { try { - std::rethrow_exception(exp_); - } catch (const std::exception& exp) { - std::ostringstream sout; + std::rethrow_exception(e); + } catch (std::exception& e) { + Init(e.what(), f, l); + } + } - sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl; - sout << "PaddlePaddle Call Stacks: " << std::endl; + template + EnforceNotMet(const char* f, int l, ARGS... args) { + Init(string::Sprintf(args...), f, l); + } + + const char* what() const noexcept override { return err_str_.c_str(); } + + private: + template + inline void Init(StrType what, const char* f, int l) { + static constexpr int TRACE_STACK_LIMIT = 100; + std::ostringstream sout; + + sout << string::Sprintf("%s at [%s:%d]", what, f, l) << std::endl; + sout << "PaddlePaddle Call Stacks: " << std::endl; #if !defined(_WIN32) - void* call_stack[TRACE_STACK_LIMIT]; - auto size = backtrace(call_stack, TRACE_STACK_LIMIT); - auto symbols = backtrace_symbols(call_stack, size); - - Dl_info info; - for (int i = 0; i < size; ++i) { - if (dladdr(call_stack[i], &info) && info.dli_sname) { - auto demangled = demangle(info.dli_sname); - auto addr_offset = static_cast(call_stack[i]) - - static_cast(info.dli_saddr); - sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, - 2 + sizeof(void*) * 2, call_stack[i], - demangled, addr_offset); - } else { - sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2, - call_stack[i]); - } + void* call_stack[TRACE_STACK_LIMIT]; + auto size = backtrace(call_stack, TRACE_STACK_LIMIT); + auto symbols = backtrace_symbols(call_stack, size); + Dl_info info; + for (int i = 0; i < size; ++i) { + if (dladdr(call_stack[i], &info) && info.dli_sname) { + auto demangled = demangle(info.dli_sname); + auto addr_offset = static_cast(call_stack[i]) - + static_cast(info.dli_saddr); + sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, + 2 + sizeof(void*) * 2, call_stack[i], demangled, + addr_offset); + } else { + sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2, + call_stack[i]); } - free(symbols); + } + free(symbols); #else - sout << "Windows not support stack backtrace yet."; + sout << "Windows not support stack backtrace yet."; #endif - err_str_ = sout.str(); - } + err_str_ = sout.str(); } - - const char* what() const noexcept { return err_str_.c_str(); } }; struct EOFException : public std::exception { @@ -242,13 +251,8 @@ inline void throw_on_error(T e) { throw_on_error(e, ""); } -#define PADDLE_THROW(...) \ - do { \ - throw ::paddle::platform::EnforceNotMet( \ - std::make_exception_ptr( \ - std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \ - __FILE__, __LINE__); \ - } while (false) +#define PADDLE_THROW(...) \ + throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) #ifndef REPLACE_ENFORCE_GLOG #define PADDLE_ENFORCE(...) \ diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 52417a1eaf..a532f94c6d 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -127,7 +127,8 @@ def __bootstrap__(): 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy', - 'reader_queue_speed_test_mode', 'print_sub_graph_dir' + 'reader_queue_speed_test_mode', 'print_sub_graph_dir', + 'pe_profile_fname' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') From 202b2f1fa71b33b5165e166ecdde0163a9799bdb Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 10 Dec 2018 17:27:20 +0800 Subject: [PATCH 210/719] Move the beta pow scale calculation into Adam Op --- paddle/fluid/framework/ir/graph.cc | 98 ++++++++++----------- paddle/fluid/operators/optimizers/adam_op.h | 17 ++++ python/paddle/fluid/optimizer.py | 43 ++++----- 3 files changed, 88 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index fc91564bba..dfa310a386 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -28,55 +28,55 @@ namespace { void CheckProgram(const ProgramDesc &program) { #define _INT(role) static_cast(role) - std::map visit; - for (OpDesc *op : program.Block(0).AllOps()) { - // For backward compatibility, some program doesn't have role added. - if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; - int role_id = - boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - visit[role_id] = true; - switch (role_id) { - case _INT(OpRole::kForward): - if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) - << "Cannot add backward operator before forward operator %s." - << op->Type(); - } - break; - case _INT(OpRole::kBackward): - case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add backward operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kForward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | - _INT(OpRole::kLoss)) == visit.end(), - "Cannot add backward|loss operator before " - "forward|loss operator %s.", - op->Type()); - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add forward|loss operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kOptimize): - case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), - "Optimize operators %s must follow backward operator.", - op->Type()); - break; - case _INT(OpRole::kLRSched): - case _INT(OpRole::kDist): - case _INT(OpRole::kRPC): - case _INT(OpRole::kNotSpecified): - break; - default: - LOG(FATAL) << "Unknown operator role. Don't add new role because " - "you don't know what you are doing."; - } - } +// std::map visit; +// for (OpDesc *op : program.Block(0).AllOps()) { +// // For backward compatibility, some program doesn't have role added. +// if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; +// int role_id = +// boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); +// visit[role_id] = true; +// switch (role_id) { +// case _INT(OpRole::kForward): +// if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { +// LOG(ERROR) +// << "Cannot add backward operator before forward operator %s." +// << op->Type(); +// } +// break; +// case _INT(OpRole::kBackward): +// case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): +// PADDLE_ENFORCE( +// visit.find(_INT(OpRole::kOptimize)) == visit.end(), +// "Cannot add backward operator %s after optimize operator.", +// op->Type()); +// break; +// case _INT(OpRole::kForward) | _INT(OpRole::kLoss): +// PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | +// _INT(OpRole::kLoss)) == visit.end(), +// "Cannot add backward|loss operator before " +// "forward|loss operator %s.", +// op->Type()); +// PADDLE_ENFORCE( +// visit.find(_INT(OpRole::kOptimize)) == visit.end(), +// "Cannot add forward|loss operator %s after optimize operator.", +// op->Type()); +// break; +// case _INT(OpRole::kOptimize): +// case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): +// PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), +// "Optimize operators %s must follow backward operator.", +// op->Type()); +// break; +// case _INT(OpRole::kLRSched): +// case _INT(OpRole::kDist): +// case _INT(OpRole::kRPC): +// case _INT(OpRole::kNotSpecified): +// break; +// default: +// LOG(FATAL) << "Unknown operator role. Don't add new role because " +// "you don't know what you are doing."; +// } +// } #undef _INT } diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 3455d1ee54..2205f473f2 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -292,6 +292,23 @@ class AdamOpKernel : public framework::OpKernel { static_cast(ctx.device_context()), param.numel()); for_range(functor); + + auto& dev = + *ctx.template device_context().eigen_device(); + + const LoDTensor* beta1_pow_ptr = ctx.Input("Beta1Pow"); + auto eigen_in_beta1_pow = + framework::EigenVector::Flatten(*beta1_pow_ptr); + auto eigen_out_beta1_pow = framework::EigenVector::Flatten( + *(const_cast(beta1_pow_ptr))); + eigen_out_beta1_pow.device(dev) = beta1 * eigen_in_beta1_pow; + + const LoDTensor* beta2_pow_ptr = ctx.Input("Beta2Pow"); + auto eigen_in_beta2_pow = + framework::EigenVector::Flatten(*beta2_pow_ptr); + auto eigen_out_beta2_pow = framework::EigenVector::Flatten( + *(const_cast(beta2_pow_ptr))); + eigen_out_beta2_pow.device(dev) = beta2 * eigen_in_beta2_pow; } } else if (grad_var->IsType()) { auto& grad = diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index da92826d41..1930ac106b 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -477,7 +477,7 @@ class LarsMomentumOptimizer(Optimizer): regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. name: A optional name prefix. - + Examples: .. code-block:: python @@ -739,26 +739,27 @@ class AdamOptimizer(Optimizer): """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - for param, grad in param_and_grads: - if grad is None: - continue - with param.block.program._optimized_guard( - [param, grad]), name_scope("optimizer"): - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param) - beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - param) - main_block.append_op( - type="scale", - inputs={"X": beta1_pow_acc}, - outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}) - - main_block.append_op( - type="scale", - inputs={"X": beta2_pow_acc}, - outputs={"Out": beta2_pow_acc}, - attrs={"scale": self._beta2}) + # for param, grad in param_and_grads: + + # if grad is None: + # continue + # with param.block.program._optimized_guard( + # [param, grad]), name_scope("optimizer"): + # beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + # param) + # beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + # param) + # main_block.append_op( + # type="scale", + # inputs={"X": beta1_pow_acc}, + # outputs={"Out": beta1_pow_acc}, + # attrs={"scale": self._beta1}) + + # main_block.append_op( + # type="scale", + # inputs={"X": beta2_pow_acc}, + # outputs={"Out": beta2_pow_acc}, + # attrs={"scale": self._beta2}) class AdamaxOptimizer(Optimizer): From 79082c94594adaf4765e950151da51c84ec137b8 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 10 Dec 2018 17:31:52 +0800 Subject: [PATCH 211/719] fix pyreader failed --- .../scope_buffered_ssa_graph_executor.cc | 27 +++++++++---------- .../scope_buffered_ssa_graph_executor.h | 5 ++-- .../details/threaded_ssa_graph_executor.cc | 1 - paddle/fluid/framework/parallel_executor.cc | 22 +++++++++++---- .../fluid/operators/reader/buffered_reader.cc | 2 -- .../operators/reader/create_py_reader_op.cc | 2 -- .../fluid/operators/reader/open_files_op.cc | 2 -- 7 files changed, 31 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index abc6b9f559..85898af417 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -27,34 +27,31 @@ namespace framework { namespace details { ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( ExecutionStrategy strategy, std::vector local_scopes, - std::vector> var_infos_list, - std::vector places, + std::vector var_infos, std::vector places, std::unique_ptr &&underlying_executor) : strategy_(std::move(strategy)), underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), - var_infos_list_(std::move(var_infos_list)), + var_infos_(std::move(var_infos)), places_(std::move(places)) {} FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { if (drop_scope_counter_ == 0) { // Create local scopes. - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &scope = local_scopes_[i]; + for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) { + auto &scope = *it; Scope &local_scope = scope->NewScope(); *scope->Var(details::kLocalExecScopeName)->GetMutable() = &local_scope; - for (auto &var_infos : var_infos_list_) { - for (auto &info : var_infos) { - if (scope->FindVar(info.name_) != nullptr) { - continue; - } - if (info.persistable_) { // Persistable - InitializeVariable(scope->Var(info.name_), info.type_); - } else { - InitializeVariable(local_scope.Var(info.name_), info.type_); - } + for (auto &info : var_infos_) { + if (scope->FindVar(info.name_) != nullptr) { + continue; + } + if (info.persistable_) { // Persistable + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + InitializeVariable(local_scope.Var(info.name_), info.type_); } } } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 51230d4a42..5e87e0bf50 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -38,8 +38,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { public: ScopeBufferedSSAGraphExecutor( ExecutionStrategy strategy, std::vector local_scopes, - std::vector> var_info_list, - std::vector places, + std::vector var_infos, std::vector places, std::unique_ptr&& underlying_executor); const ir::Graph& Graph() const override { @@ -54,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy_; std::unique_ptr underlying_executor_; std::vector local_scopes_; - std::vector> var_infos_list_; + std::vector var_infos_; std::vector places_; }; } // namespace details diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 677a293794..cebf63364d 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -216,7 +216,6 @@ void ThreadedSSAGraphExecutor::RunOp( if (LIKELY(!strategy_.dry_run_)) { op->Run(strategy_.use_cuda_); } - VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 186f0cb803..2a9ca3e815 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -141,7 +141,6 @@ ParallelExecutor::ParallelExecutor( std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { - VLOG(1) << "kParallelGraph mode!!"; for (size_t i = 0; i < member_->places_.size(); ++i) { std::unique_ptr graph = build_strategy.Apply( main_program, {member_->places_[i]}, loss_var_name, params, @@ -178,8 +177,8 @@ ParallelExecutor::ParallelExecutor( ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graphs[0] = ref_cnt_pass->Apply(std::move(graphs[i])); - graphs[0]->SetNotOwned("garbage_collector", &gcs_); + graphs[i] = ref_cnt_pass->Apply(std::move(graphs[i])); + graphs[i]->SetNotOwned("garbage_collector", &gcs_); } } } @@ -192,6 +191,18 @@ ParallelExecutor::ParallelExecutor( // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars + std::vector var_infos; + for (auto &graph : graphs) { + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } + } + } + /** std::vector> var_infos_list; for (size_t i = 0; i < graphs.size(); ++i) { std::vector var_infos; @@ -203,8 +214,9 @@ ParallelExecutor::ParallelExecutor( var_infos.back().persistable_ = node->Var()->Persistable(); } } - var_infos_list.emplace_back(std::move(var_infos)); + var_infos_list.push_back(std::move(var_infos)); } + **/ // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { @@ -236,7 +248,7 @@ ParallelExecutor::ParallelExecutor( } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, std::move(var_infos_list), + exec_strategy, member_->local_scopes_, std::move(var_infos), member_->places_, std::move(member_->executor_))); } diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index cfa192f8e1..26ff221dfa 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -58,9 +58,7 @@ void BufferedReader::ReadAsync(size_t i) { TensorVec &gpu = gpu_buffer_[i]; gpu.resize(cpu.size()); for (size_t i = 0; i < cpu.size(); ++i) { - VLOG(1) << "launch tensor copy from cpu to cpu, idx: " << i; framework::TensorCopySync(cpu[i], place_, &gpu[i]); - VLOG(1) << "done " << i; gpu[i].set_lod(cpu[i].lod()); } } diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 093b0e56b3..901a92ab5b 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -28,10 +28,8 @@ class PyReader : public framework::FileReader { } void ReadNext(std::vector* out) override { - VLOG(1) << "come in PyReader::ReadNext function, out: " << out; bool success; *out = queue_->Pop(&success); - VLOG(1) << "call PyReader::ReadNext " << success; if (!success) out->clear(); } diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index ae37a18725..38223e0699 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -115,12 +115,10 @@ class PreemptiveReaderContainer : public IReaderContainer { } void ReadNext(std::vector* out) override { - VLOG(1) << "flag"; if (!pending_.empty()) { auto future_it = complete_queue_.Pop(); FutureItem item = future_it->get(); if (item.exception_) { - VLOG(1) << "item has exception!!!"; for (auto it = futures_.begin(); it != futures_.end(); ++it) { if (it != future_it) { it->wait(); // Wait all other threads complete. From 8e3fe2d7355c09a3dde09bcbf63971ff3bfe169d Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Mon, 10 Dec 2018 18:57:57 +0800 Subject: [PATCH 212/719] add skip op --- paddle/fluid/framework/async_executor.cc | 8 ++++++-- paddle/fluid/framework/executor_thread_worker.cc | 15 ++++++++++----- paddle/fluid/framework/executor_thread_worker.h | 2 ++ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 7685883dd5..f96ff436da 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -95,8 +95,12 @@ void AsyncExecutor::InitParamConfig() { } } _param_config.slot_dim = _param_config.fea_dim - 2; //TODO - _param_config.tmp_push_dense_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().pull_dense_per_batch()); - _param_config.tmp_push_sparse_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); + _param_config.tmp_push_dense_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); + _param_config.tmp_push_sparse_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); + + for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); ++t) { + _param_config.skip_op.push_back(_pslib_ptr->get_param()->trainer_param().skip_op(t)); + } //sparse for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) { auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index e0ee9c11c9..d8320b422b 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -340,16 +340,21 @@ void AsyncExecutorThreadWorker::SetPullDenseThread(std::shared_ptrType().find("sgd") != std::string::npos) { continue; } - if (op->Type().find("lookup_table") != std::string::npos || - op->Type().find("lookup_table_grad") != std::string::npos) { - continue; + bool need_skip = false; + for (auto t = 0u; t < _param_config->skip_op.size(); ++t) { + if (op->Type().find(_param_config->skip_op[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + op->Run(*thread_scope_, place_); } - op->Run(*thread_scope_, place_); } UpdateParams(); } diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 4e3255a590..b3ee9dfaec 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -39,6 +39,8 @@ struct AsyncWorkerParamConfig { int fea_dim; int32_t tmp_push_dense_wait_times; int32_t tmp_push_sparse_wait_times; + + std::vector skip_op; std::map> dense_variable_name; std::map> dense_gradient_variable_name; From 067ed70f2def19263d33ef792323c439fa482484 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 10 Dec 2018 19:26:50 +0800 Subject: [PATCH 213/719] add HasProtoAttr function in op_desc.h, clean node.h test=develop --- paddle/fluid/framework/ir/is_test_pass.cc | 2 +- .../fluid/framework/ir/is_test_pass_tester.cc | 4 ++-- .../framework/ir/mkldnn_placement_pass.cc | 15 +++++++----- paddle/fluid/framework/ir/node.cc | 22 ----------------- paddle/fluid/framework/ir/node.h | 12 ---------- paddle/fluid/framework/op_desc.cc | 24 ++++++++----------- paddle/fluid/framework/op_desc.h | 6 ++++- paddle/fluid/pybind/tensor_py.h | 2 +- 8 files changed, 28 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 6d8f020918..57cc98e2ca 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -38,7 +38,7 @@ std::unique_ptr IsTestPass::ApplyImpl( for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); - if (n->RuntimeHasAttr("is_test")) { + if (op->HasAttr("is_test") || op->HasProtoAttr("is_test")) { op->SetAttr("is_test", true); } else if (std::find(begin(op_list), end(op_list), op->Type()) != end(op_list)) { diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index d9a68c7f1d..9696441a21 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -104,9 +104,9 @@ TEST(IsTestPass, basic) { auto* op = node->Op(); auto op_name = boost::get(op->GetAttr("name")); if (op_name == "conv3") { - ASSERT_FALSE(node->RuntimeHasAttr("is_test")); + ASSERT_FALSE(op->HasAttr("is_test")); } else { - ASSERT_TRUE(node->RuntimeHasAttr("is_test")); + ASSERT_TRUE(op->HasAttr("is_test")); EXPECT_TRUE(boost::get(op->GetAttr("is_test"))); } } diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc index 9a9314161b..951fcb066c 100644 --- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc @@ -25,12 +25,15 @@ std::unique_ptr MKLDNNPlacementPass::ApplyImpl( const auto& op_types_list = Get>("mkldnn_enabled_op_types"); for (const Node* n : graph->Nodes()) { - if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) { - if (op_types_list.empty()) { - n->Op()->SetAttr("use_mkldnn", true); - } else if (std::find(op_types_list.begin(), op_types_list.end(), - n->Name()) != op_types_list.end()) { - n->Op()->SetAttr("use_mkldnn", true); + if (n->IsOp()) { + auto* op = n->Op(); + if (op->HasAttr("use_mkldnn") || op->HasProtoAttr("use_mkldnn")) { + if (op_types_list.empty()) { + op->SetAttr("use_mkldnn", true); + } else if (std::find(op_types_list.begin(), op_types_list.end(), + n->Name()) != op_types_list.end()) { + op->SetAttr("use_mkldnn", true); + } } } } diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 7a88cb2b68..eac67108e2 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -30,28 +30,6 @@ std::unique_ptr CreateNodeForTest(const std::string &name, return std::unique_ptr(new Node(name, type)); } -bool Node::RuntimeHasAttr(const std::string &name) const { - if (Op()->HasAttr(name)) { - return true; - } else { - auto &op_info = OpInfoMap::Instance(); - auto op_type = Op()->Type(); - if (op_info.Has(op_type)) { - auto op_info_ptr = op_info.Get(op_type); - if (op_info_ptr.HasOpProtoAndChecker()) { - const proto::OpProto &proto = op_info_ptr.Proto(); - for (int i = 0; i != proto.attrs_size(); ++i) { - const proto::OpProto::Attr &attr = proto.attrs(i); - if (attr.name() == name) { - return true; - } - } - } - } - } - return false; -} - } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 1044a96430..d2a393b3f1 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -108,18 +108,6 @@ class Node { Name().find(ir::Node::kControlDepVarName) != std::string::npos; } - // RuntimeHasAttr is different with HasAttr now. - // 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr, - // thus, if stored program_desc_ are old which don't have an attr, a new - // library which adds the attr already will fail on this function. - // Details: - // https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087 - // 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above - // problem. - // TODO(luotao): Maybe we should enhance HasAttr later, instead of adding - // RuntimeHasAttr. - bool RuntimeHasAttr(const std::string& name) const; - std::vector inputs; std::vector outputs; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index ce7ba96730..dde642764f 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -237,20 +237,16 @@ void OpDesc::SetOutput(const std::string ¶m_name, this->outputs_[param_name] = args; } -bool OpDesc::HasAttr(const std::string &name) const { - if (attrs_.find(name) != attrs_.end()) { - return true; - } else { - auto &op_info = OpInfoMap::Instance(); - if (op_info.Has(desc_.type())) { - auto op_info_ptr = op_info.Get(desc_.type()); - if (op_info_ptr.HasOpProtoAndChecker()) { - const proto::OpProto &proto = op_info_ptr.Proto(); - for (int i = 0; i != proto.attrs_size(); ++i) { - const proto::OpProto::Attr &attr = proto.attrs(i); - if (attr.name() == name) { - return true; - } +bool OpDesc::HasProtoAttr(const std::string &name) const { + auto &op_info = OpInfoMap::Instance(); + if (op_info.Has(desc_.type())) { + auto op_info_ptr = op_info.Get(desc_.type()); + if (op_info_ptr.HasOpProtoAndChecker()) { + const proto::OpProto &proto = op_info_ptr.Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return true; } } } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 3da7cdcef3..e8debec7f1 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -61,7 +61,11 @@ class OpDesc { void SetOutput(const std::string ¶m_name, const std::vector &args); - bool HasAttr(const std::string &name) const; + bool HasAttr(const std::string &name) const { + return attrs_.find(name) != attrs_.end(); + } + + bool HasProtoAttr(const std::string &name) const; proto::AttrType GetAttrType(const std::string &name) const; diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 02a75236f6..f67f40f19f 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (int i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } From 33a004a779e8c4acb19ab13b641cc16d3827a582 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 10 Dec 2018 20:36:49 +0800 Subject: [PATCH 214/719] fix numel nce and prefetch --- .../distributed/parameter_prefetch.cc | 10 +++++++-- paddle/fluid/operators/nce_op.h | 21 ++++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 4cdeae81a1..aebf6376d1 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -114,9 +114,15 @@ static void MergeMultipleVarsIntoOneBySection( id_to_offset[ids_vector[i]].push_back(i); } - auto& id_tensor = scope.FindVar(id_name)->Get(); + auto& id_tensor = scope->FindVar(id_name)->Get(); auto* out_tensor = - scope.FindVar(out_name)->GetMutable(); + scope->FindVar(out_name)->GetMutable(); + + PADDLE_ENFORCE_GT( + out_tensor->numel(), 0, + "When calling this method, the Tensor's numel must larger than zero. " + "Please check Tensor::Resize has been called first."); + auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); bool is_on_cpu_place = true; diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 862064be18..99a3baba92 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -166,11 +166,12 @@ class NCEKernel : public framework::OpKernel { std::set st(labels.begin(), labels.end()); labels.assign(st.begin(), st.end()); - auto &local_scope = context.scope().NewScope(); + framework::Scope &local_scope = context.scope().NewScope(); + auto height_sections = context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - auto *ids = local_scope.Var("Ids@Local"); + auto *ids = local_scope.Var("Ids@Prefetch"); auto *x_tensor = ids->GetMutable(); x_tensor->mutable_data( framework::make_ddim({static_cast(labels.size()), 1}), @@ -179,12 +180,18 @@ class NCEKernel : public framework::OpKernel { std::memcpy(x_tensor->data(), labels.data(), labels.size() * sizeof(int64_t)); - local_scope.Var("Weight@Local"); + std::vector w_dims = paddle::framework::vectorize2int( + context.Input("Weight")->dims()); + w_dims[0] = static_cast(labels.size()); + + auto *w_tensor = local_scope.Var("Weight@Prefetch") + ->GetMutable(); + w_tensor->Resize(framework::make_ddim(w_dims)); #ifdef PADDLE_WITH_DISTRIBUTE - operators::distributed::prefetch("Ids@Local", "Weight@Local", table_names, - epmap, height_sections, context, - &local_scope); + operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch", + table_names, epmap, height_sections, + context, local_scope); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " @@ -192,7 +199,7 @@ class NCEKernel : public framework::OpKernel { #endif auto weight_mat = EigenMatrix::From( - (local_scope.Var("Weight@Local")->Get())); + (local_scope.Var("Weight@Prefetch")->Get())); for (int64_t i = 0; i < sample_labels->numel(); ++i) { std::vector::iterator it = std::find(labels.begin(), labels.end(), sample_labels_data[i]); From 00776b167aa5bdfb8a9888f764ec8fa3a0b6f159 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 10 Dec 2018 20:49:41 +0800 Subject: [PATCH 215/719] fix memory opt skip set by name (#14774) * random failed. rerun ci. test=develop * windows failed. rerun ci. test=develop --- .../test_memory_optimization_transpiler.py | 37 ++++++++++++++----- .../memory_optimization_transpiler.py | 16 ++++++++ 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py index 275e5c49d5..fa16f08288 100644 --- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py @@ -22,6 +22,15 @@ from paddle.fluid.framework import Program, program_guard from paddle.fluid.transpiler import memory_optimize +def _get_vars(prog): + assert (isinstance(prog, Program)) + all_vars = set() + for op in prog.global_block().ops: + all_vars.update(op.input_arg_names) + all_vars.update(op.output_arg_names) + return all_vars + + class TestControlFlowGraph(unittest.TestCase): def setUp(self): program = Program() @@ -37,11 +46,11 @@ class TestControlFlowGraph(unittest.TestCase): self.program = program def test_control_flow_graph(self): - print("before optimization") - print(str(self.program)) - result_program = memory_optimize(self.program) - print("after optimization") - print(str(result_program)) + result_program = self.program.clone() + memory_optimize(self.program) + old_vars = _get_vars(self.program) + new_vars = _get_vars(result_program) + self.assertTrue(old_vars != new_vars) class TestMemoryTranspiler2(unittest.TestCase): @@ -58,14 +67,22 @@ class TestMemoryTranspiler2(unittest.TestCase): avg_cost = layers.mean(cost) opt = optimizer.SGD(learning_rate=0.001) opt.minimize(avg_cost) + self.skip_set = set([cost.name, fc.name]) self.program = program def test_inplace_ops(self): - print("before optimization") - print(str(self.program)) - result_program = memory_optimize(self.program) - print("after optimization") - print(str(result_program)) + result_program = self.program.clone() + memory_optimize(self.program) + old_vars = _get_vars(self.program) + new_vars = _get_vars(result_program) + self.assertTrue(old_vars != new_vars) + + def test_skip_opt(self): + result_program = self.program.clone() + memory_optimize(self.program, skip_opt_set=self.skip_set) + old_vars = _get_vars(self.program) + new_vars = _get_vars(result_program) + self.assertTrue(old_vars != new_vars) class TestMemoryTranspiler3(unittest.TestCase): diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index c9f1be9347..95aafec053 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -14,6 +14,7 @@ from __future__ import print_function +import six from collections import defaultdict, MutableSet from .. import core from ... import compat as cpt @@ -470,8 +471,21 @@ def memory_optimize(input_program, Returns: None """ + + def to_name_str(var): + if isinstance(var, Variable): + return var.desc.name() + elif isinstance(var, str): + return var + elif isinstance(var, six.string_types): + return str(var) + else: + raise TypeError(str(var) + " should be Variable or str") + if level != 0 and level != 1: raise ValueError("only support opt_level 0 or 1.") + if skip_opt_set is not None and not isinstance(skip_opt_set, set): + raise ValueError("only support skip_opt_set as set.") global PRINT_LOG PRINT_LOG = print_log if skip_grads: @@ -486,6 +500,8 @@ def memory_optimize(input_program, skip_opt_set = grad_set else: skip_opt_set.update(grad_set) + if skip_opt_set is not None: + skip_opt_set = set(map(to_name_str, skip_opt_set)) cfgs = _get_cfgs(input_program) for cfg in cfgs: cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level) From 8760d23c7dbcb4ad5a5b941aca5917514467c86d Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 10 Dec 2018 13:09:28 +0000 Subject: [PATCH 216/719] featue/py_func --- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/operators/py_func_op.cc | 131 ++++++++++++++++++++++++++ paddle/fluid/operators/py_func_op.h | 25 +++++ paddle/fluid/pybind/pybind.cc | 21 +++++ python/paddle/fluid/layers/nn.py | 112 +++++++++++++++++++++- 5 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/operators/py_func_op.cc create mode 100644 paddle/fluid/operators/py_func_op.h diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 257bfc0a3f..9379122faf 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -82,7 +82,7 @@ endif() # op_library(unstack_op DEPS stack_op) # op_library(tensor_array_to_tensor_op DEPS concat_op) -set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS}) +set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS} python pybind) set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(gather_test SRCS gather_test.cc DEPS tensor) diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc new file mode 100644 index 0000000000..86914f3060 --- /dev/null +++ b/paddle/fluid/operators/py_func_op.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/py_func_op.h" +#include +#include +#include +#include "Python.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +namespace py = pybind11; + +static std::mutex g_py_callables_mtx; +static std::vector g_py_callables; + +size_t AppendPythonCallableObjectAndReturnId(py::object py_obj) { + std::lock_guard guard(g_py_callables_mtx); + g_py_callables.emplace_back(py_obj); + return g_py_callables.size() - 1; +} + +static py::object *GetPythonCallableObject(size_t i) { + std::lock_guard guard(g_py_callables_mtx); + PADDLE_ENFORCE_LT(i, g_py_callables.size()); + return &g_py_callables[i]; +} + +void DoCallPythonFunc(py::object *callable, const std::string &func_token, + const std::vector &ins, + std::vector *out) { + py::gil_scoped_acquire guard{}; + py::tuple in_args(ins.size()); + for (size_t i = 0; i < ins.size(); ++i) { + in_args[i] = py::cast(ins[i]); + } + + auto ret = (*callable)(func_token, *in_args); + auto ret_tuple = py::cast(ret); + PADDLE_ENFORCE_EQ(py::len(ret_tuple), out->size(), "Output number not match"); + for (size_t i = 0; i < out->size(); ++i) { + try { + auto *out_tensor = py::cast(ret_tuple[i]); + PADDLE_ENFORCE_NOT_NULL(out_tensor, + "Output tensor should not be nullptr"); + (*out)[i]->set_lod(out_tensor->lod()); + (*out)[i]->ShareDataWith(*out_tensor); + } catch (py::cast_error &) { + PADDLE_THROW("Output %d is not LoDTensor", i); + } + } +} + +class PyFuncOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInputs("X"), "Input(X) must exist"); + PADDLE_ENFORCE(ctx->HasOutputs("Out"), "Output(Out) must exist"); + } +}; + +class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Inputs of py_func op.").AsDuplicable(); + AddOutput("Out", "Outputs of py_func op").AsDuplicable(); + AddAttr("token", "function token"); + AddAttr("handle_idx", "handle index").SetDefault(0); + AddComment(R"DOC("PyFunc Op")DOC"); + } +}; + +class PyFuncOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + protected: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &in_arg_names = Inputs("X"); + auto &out_arg_names = Outputs("Out"); + + std::vector inputs(in_arg_names.size()); + for (size_t i = 0; i < in_arg_names.size(); ++i) { + auto &in_tensor = + scope.FindVar(in_arg_names[i])->Get(); + if (platform::is_gpu_place(in_tensor.place())) { + framework::TensorCopySync(in_tensor, platform::CPUPlace(), &inputs[i]); + } else { + inputs[i].ShareDataWith(in_tensor); + } + inputs[i].set_lod(in_tensor.lod()); + } + + std::vector outputs(out_arg_names.size()); + for (size_t i = 0; i < out_arg_names.size(); ++i) { + auto *out_tensor = + scope.FindVar(out_arg_names[i])->GetMutable(); + outputs[i] = out_tensor; + } + + auto &token = Attr("token"); + auto handle_idx = static_cast(Attr("handle_idx")); + auto *py_callable = GetPythonCallableObject(handle_idx); + VLOG(10) << "Call py_func_op with token " << token << ", and handle_idx " + << handle_idx; + DoCallPythonFunc(py_callable, token, inputs, &outputs); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(py_func, ops::PyFuncOp, ops::PyFuncOpMaker, + ops::PyFuncOpShapeInference, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/py_func_op.h b/paddle/fluid/operators/py_func_op.h new file mode 100644 index 0000000000..e85fa6b5bc --- /dev/null +++ b/paddle/fluid/operators/py_func_op.h @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" + +namespace paddle { +namespace operators { + +size_t AppendPythonCallableObjectAndReturnId(pybind11::object py_obj); + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 58ef3da0b2..58da2cea34 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -36,6 +36,7 @@ limitations under the License. */ #include "paddle/fluid/framework/version.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" @@ -100,6 +101,12 @@ PYBIND11_MODULE(core, m) { BindException(&m); + m.def( + "append_python_callable_object_and_return_id", + [](py::object py_obj) -> size_t { + return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj); + }); + py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) @@ -525,6 +532,20 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Place") .def(py::init<>()) + .def("is_cpu_place", + [](platform::Place &self) { return platform::is_cpu_place(self); }) + .def("is_gpu_place", + [](platform::Place &self) { return platform::is_gpu_place(self); }) + .def("is_cuda_pinned_place", + [](platform::Place &self) { + return platform::is_cuda_pinned_place(self); + }) + .def("gpu_device_id", + [](platform::Place &self) { + PADDLE_ENFORCE(platform::is_gpu_place(self), + "gpu_device_id() only supports in CUDAPlace"); + return boost::get(self).device; + }) .def("set_place", [](platform::Place &self, const platform::CPUPlace &cpu_place) { self = cpu_place; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4833212d31..92cd53a6c3 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -18,10 +18,12 @@ All layers just related to the neural network. from __future__ import print_function import numpy as np +import six import os +import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant -from ..framework import Variable, OpProtoHolder +from ..framework import Variable, OpProtoHolder, Program from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ from .tensor import concat @@ -172,6 +174,7 @@ __all__ = [ 'merge_selected_rows', 'get_tensor_from_selected_rows', 'lstm', + 'py_func', ] kIgnoreIndex = -100 @@ -9082,3 +9085,110 @@ def get_tensor_from_selected_rows(x, name=None): outputs={'Out': out}, attrs={}) return out + + +@templatedoc() +def py_func(func, x, out, backward_func=None): + """ + """ + + class PyFuncRegister(object): + _main_program_to_register = dict() + + @classmethod + def get_instance(cls, prog=None): + if prog is None: + prog = fluid.default_main_program() + + if not isinstance(prog, Program): + raise ValueError("prog must be None or type of Program") + + ret = cls._main_program_to_register.get(prog, None) + if ret is None: + ret = PyFuncRegister() + ret._idx = core.append_python_callable_object_and_return_id(ret) + ret._token_func_dict = dict() + ret._func_token_dict = dict() + cls._main_program_to_register[prog] = ret + + return ret + + @property + def handle_idx(self): + return self._idx + + def unique_token(self, func): + return self._register_func(func) + + def _register_func(self, func): + if func is None: + raise ValueError("func cannot be None") + + token = self._func_token_dict.get(func, None) + if token is not None: + return token + + token = unique_name.generate('py_func_op_token') + self._token_func_dict[token] = func + self._func_token_dict[func] = token + return token + + def __call__(self, token, *args): + func = self._token_func_dict.get(token, None) + if func is None: + raise ValueError("func has not been registered") + + arg_list = inspect.getargspec(func) + kwargs = dict() + idx = 0 + for arg in arg_list[0]: + kwargs[arg] = args[idx] + idx += 1 + + args = args[idx:] + ret0 = func(*args, **kwargs) + if ret0 is None: + return None + + if not isinstance(ret0, (list, tuple)): + ret0 = (ret0, ) + + ret = [] + for i in six.moves.range(len(ret0)): + if isinstance(ret0[i], core.LoDTensor): + ret.append(ret0[i]) + continue + + if isinstance(ret0[i], np.ndarray): + r = ret0[i] + else: + r = np.array(ret0[i]) + + t = core.LoDTensor() + t.set(r, core.CPUPlace()) + ret.append(t) + + return tuple(ret) + + helper = LayerHelper('py_func', **locals()) + if isinstance(x, Variable): + x = [x] + + if isinstance(out, Variable): + out = [out] + + for each_out in out: + if len(each_out.shape) == 0: + raise ValueError( + 'users should infer shapes of outputs of py_func op manually') + + py_func_reg = PyFuncRegister.get_instance(helper.main_program) + token = py_func_reg.unique_token(func) + + helper.append_op( + type='py_func', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'handle_idx': py_func_reg.handle_idx, + 'token': token}) + return out From 016a06877578a6c862d5fd7eef3c1c75a71adc81 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Mon, 10 Dec 2018 21:34:37 +0800 Subject: [PATCH 217/719] stop server --- paddle/fluid/framework/async_executor.cc | 4 ++++ paddle/fluid/framework/async_executor.h | 1 + .../fluid/framework/executor_thread_worker.cc | 18 +++++++++--------- paddle/fluid/pybind/async_executor_py.cc | 1 + python/paddle/fluid/async_executor.py | 9 ++++++++- 5 files changed, 23 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index f96ff436da..45a914b70e 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -83,6 +83,10 @@ uint64_t AsyncExecutor::StartServer() { return _pslib_ptr->run_server(); } +void AsyncExecutor::StopServer() { + _pslib_ptr->stop_server(); +} + void AsyncExecutor::GatherServers(std::vector& host_sign_list, int node_num) { _pslib_ptr->gather_servers(host_sign_list.data(), node_num); } diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 90d6b46b2f..4b46126217 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -67,6 +67,7 @@ class AsyncExecutor { void InitWorker(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index); //void ConfigWorker() {} uint64_t StartServer(); + void StopServer(); void GatherServers(std::vector& host_sign_list, int node_num); void InitModel(); void SaveModel(const std::string& path); diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index d8320b422b..a0455b26ef 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -569,7 +569,6 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) { } void AsyncExecutorThreadWorker::PushSparse(int table_id) { - auto slot_dim = _param_config->slot_dim; //TODO auto fea_dim = _param_config->fea_dim;//_current_train_job.fea_dim();TODO auto& features = _features[table_id]; @@ -592,19 +591,20 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { } Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[table_id][slot_idx - 1]); LoDTensor* g_tensor = g_var->GetMutable(); - //int count = g_tensor->numel(); - float* g = g_tensor->data(); - /* - if (FLAGS_scale_sparse_gradient_with_batch_size) { - Eigen::Map g_mat(g, 1, tensor->numel()); - g_mat *= _batch_size; + if (g_tensor == NULL) { + LOG(ERROR) << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; + exit(-1); } - */ + float* g = g_tensor->data(); Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); LoDTensor* tensor = var->GetMutable(); + if (tensor == NULL) { + LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found"; + exit(-1); + } int len = tensor->lod()[0].back(); - //assert(slot_dim * len == count); + assert(slot_dim * len == g_tensor->numel()); int64_t* ids = tensor->data(); for (auto id_idx = 0u; id_idx < len; ++id_idx){ if (ids[id_idx] == 0) { diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc index eca46fbad5..8dfba0d269 100644 --- a/paddle/fluid/pybind/async_executor_py.cc +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -51,6 +51,7 @@ void BindAsyncExecutor(py::module* m) { .def("init_server", &framework::AsyncExecutor::InitServer) .def("init_worker", &framework::AsyncExecutor::InitWorker) .def("start_server", &framework::AsyncExecutor::StartServer) + .def("stop_server", &framework::AsyncExecutor::StopServer) .def("gather_servers", &framework::AsyncExecutor::GatherServers) .def("init_model", &framework::AsyncExecutor::InitModel) .def("save_model", &framework::AsyncExecutor::SaveModel); diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 3451d1edb5..76fdb5b0e2 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -151,7 +151,10 @@ class AsyncExecutor(object): self.executor.run_from_files(program_desc, data_feed.desc(), filelist, thread_num, fetch_var_names, debug) - self.instance.barrier_all() + self.instance.barrier_all() #worker do all things + if self.instance.is_first_worker(): + self.executor.stop_server() + self.instance.barrier_all() #sync def config_distributed_nodes(self, dist_opt): @@ -164,6 +167,9 @@ class AsyncExecutor(object): def get_instance(self): return self.instance + #def stop_server(self): + # self.executor.stop_server() + def init_server(self, dist_desc): self.executor.init_server(dist_desc, self.instance._rankid) ip = self.executor.start_server() @@ -174,6 +180,7 @@ class AsyncExecutor(object): self.instance.barrier_all() #wait all worker start self.instance.barrier_all() #wait init model self.instance.barrier_all() #wait worker do all things + self.instance.barrier_all() #sync def init_worker(self, dist_desc): self.instance.barrier_all() #wait all server start From 1735022a1bdaa5777009bf537dd6c09be17e33fc Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 11 Dec 2018 09:36:07 +0800 Subject: [PATCH 218/719] fix clang test=develop --- paddle/fluid/framework/op_registry.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 36673e48c2..6d39bb3c52 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -319,7 +319,7 @@ struct OpKernelRegistrarFunctorEx Date: Tue, 11 Dec 2018 10:41:18 +0800 Subject: [PATCH 219/719] fix numel nce and prefetch test=develop --- paddle/fluid/operators/nce_op.h | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 99a3baba92..2c97eef096 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -49,7 +49,6 @@ void PrepareSamples(const framework::ExecutionContext &context, auto label = context.Input("Label"); const int64_t *label_data = label->data(); auto label_dims = label->dims(); - // int num_total_classes = context.Attr("num_total_classes"); // for unitest std::vector custom_neg_classes = context.Attr>("custom_neg_classes"); From 0a7c7e97afc20d26406de27a22c9cd4b7edad8b0 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 11 Dec 2018 10:45:16 +0800 Subject: [PATCH 220/719] test zero output of split_selected_rows_op test=develop --- paddle/fluid/pybind/pybind.cc | 2 ++ .../fluid/tests/unittests/test_split_selected_rows_op.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 58ef3da0b2..9d92529e4e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -298,6 +298,8 @@ PYBIND11_MODULE(core, m) { .def("get_tensor", [](SelectedRows &self) { return self.mutable_value(); }, py::return_value_policy::reference) + .def("numel", + [](SelectedRows &self) -> int64_t { return self.value().numel(); }) .def("set_height", &SelectedRows::set_height) .def("height", &SelectedRows::height) .def("set_rows", diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py index 50204b8a77..f8847e1570 100644 --- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py @@ -63,6 +63,7 @@ class TestSpliteSelectedRows(unittest.TestCase): # expected output selected rows expected_out0_rows = [0, 4] expected_out1_rows = [0, 2] + expected_out2_rows = [] expected_out4_rows = [0] op = Operator( @@ -75,6 +76,7 @@ class TestSpliteSelectedRows(unittest.TestCase): self.assertEqual(outs[0].rows(), expected_out0_rows) self.assertEqual(outs[1].rows(), expected_out1_rows) + self.assertEqual(outs[2].rows(), expected_out2_rows) self.assertEqual(outs[4].rows(), expected_out4_rows) self.assertEqual(outs[0].height(), height_sections[0]) @@ -84,6 +86,9 @@ class TestSpliteSelectedRows(unittest.TestCase): self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 1]) self.assertAlmostEqual(8.0, np.array(outs[4].get_tensor())[0, 1]) + self.assertEqual(outs[2].numel(), 0) + self.assertEqual(outs[3].numel(), 0) + def check_grad_with_place(self, place): scope = core.Scope() height = 10 From 60d71a9e2987941487f7f1e44d1e1850b41a1e3d Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Tue, 11 Dec 2018 11:12:49 +0800 Subject: [PATCH 221/719] skip op py file --- python/paddle/fluid/distributed/downpour.py | 1 + python/paddle/fluid/distributed/node.py | 2 +- python/paddle/fluid/distributed/ps_pb2.py | 93 +++++++++++---------- 3 files changed, 52 insertions(+), 44 deletions(-) diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 654fa6fab6..c1762dd768 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -66,5 +66,6 @@ class DownpourSGD(object): # Todo(guru4elephant): figure out how to support more sparse parameters # currently only support lookup_table worker_skipped_ops = ["lookup_table", "lookup_table_grad"] + ps_param.trainer_param.skip_op.extend(worker_skipped_ops) ps_param_str = text_format.MessageToString(ps_param) return [ps_param, worker_skipped_ops] diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index c245dc4db8..1f4aeeac73 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -60,7 +60,7 @@ class DownpourServer(Server): table.accessor.dense_sgd_param.adam.mom_decay_rate = 0.99 table.accessor.dense_sgd_param.naive.learning_rate = 0.0002 fea_dim = 0 - for param in param_var: + for param in filter(lambda x: x.name.find("embedding") == -1, param_var): fea_dim += reduce(lambda x, y: x * y, param.shape, 1) table.accessor.fea_dim = fea_dim diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py index b82c649e14..978b18d0d5 100644 --- a/python/paddle/fluid/distributed/ps_pb2.py +++ b/python/paddle/fluid/distributed/ps_pb2.py @@ -20,7 +20,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( name='ps.proto', package='paddle', syntax='proto2', - serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xbc\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1c\n\x14pull_dense_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd1\x01\n\x16ServerServiceParameter\x12(\n\x0cserver_class\x18\x01 \x01(\t:\x12\x41\x62\x61\x63usBrpcPsServer\x12(\n\x0c\x63lient_class\x18\x02 \x01(\t:\x12\x41\x62\x61\x63usBrpcPsClient\x12&\n\rservice_class\x18\x03 \x01(\t:\x0f\x41\x62\x61\x63usPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') + serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xce\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') ) _sym_db.RegisterFileDescriptor(DESCRIPTOR) @@ -41,8 +41,8 @@ _TABLETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3262, - serialized_end=3314, + serialized_start=3286, + serialized_end=3338, ) _sym_db.RegisterEnumDescriptor(_TABLETYPE) @@ -108,8 +108,8 @@ _PSCMDID = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3317, - serialized_end=3634, + serialized_start=3341, + serialized_end=3658, ) _sym_db.RegisterEnumDescriptor(_PSCMDID) @@ -148,8 +148,8 @@ _FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3230, - serialized_end=3260, + serialized_start=3254, + serialized_end=3284, ) _sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) @@ -342,7 +342,7 @@ _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( is_extension=False, extension_scope=None, options=None), _descriptor.FieldDescriptor( - name='pull_dense_per_batch', full_name='paddle.DownpourTrainerParameter.pull_dense_per_batch', index=2, + name='push_sparse_per_batch', full_name='paddle.DownpourTrainerParameter.push_sparse_per_batch', index=2, number=3, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, @@ -355,6 +355,13 @@ _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), + _descriptor.FieldDescriptor( + name='skip_op', full_name='paddle.DownpourTrainerParameter.skip_op', index=4, + number=5, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), ], extensions=[ ], @@ -368,7 +375,7 @@ _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( oneofs=[ ], serialized_start=557, - serialized_end=745, + serialized_end=763, ) @@ -419,8 +426,8 @@ _DENSETABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=747, - serialized_end=870, + serialized_start=765, + serialized_end=888, ) @@ -478,8 +485,8 @@ _SPARSETABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=872, - serialized_end=994, + serialized_start=890, + serialized_end=1012, ) @@ -516,8 +523,8 @@ _DOWNPOURSERVERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=997, - serialized_end=1131, + serialized_start=1015, + serialized_end=1149, ) @@ -575,8 +582,8 @@ _SERVERSERVICEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1134, - serialized_end=1343, + serialized_start=1152, + serialized_end=1367, ) @@ -641,8 +648,8 @@ _TABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1346, - serialized_end=1537, + serialized_start=1370, + serialized_end=1561, ) @@ -721,8 +728,8 @@ _TABLEACCESSORPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1540, - serialized_end=1909, + serialized_start=1564, + serialized_end=1933, ) @@ -794,8 +801,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1912, - serialized_end=2118, + serialized_start=1936, + serialized_end=2142, ) @@ -839,8 +846,8 @@ _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2120, - serialized_end=2203, + serialized_start=2144, + serialized_end=2227, ) @@ -898,8 +905,8 @@ _PSREQUESTMESSAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2205, - serialized_end=2306, + serialized_start=2229, + serialized_end=2330, ) @@ -950,8 +957,8 @@ _SPARSESGDRULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2308, - serialized_end=2427, + serialized_start=2332, + serialized_end=2451, ) @@ -1009,8 +1016,8 @@ _DENSESGDRULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2430, - serialized_end=2655, + serialized_start=2454, + serialized_end=2679, ) @@ -1068,8 +1075,8 @@ _ADAMSGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2658, - serialized_end=2792, + serialized_start=2682, + serialized_end=2816, ) @@ -1106,8 +1113,8 @@ _NAIVESGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2794, - serialized_end=2860, + serialized_start=2818, + serialized_end=2884, ) @@ -1137,8 +1144,8 @@ _SUMMARYSGDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2862, - serialized_end=2921, + serialized_start=2886, + serialized_end=2945, ) @@ -1168,8 +1175,8 @@ _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2923, - serialized_end=2969, + serialized_start=2947, + serialized_end=2993, ) @@ -1213,8 +1220,8 @@ _PSRESPONSEMESSAGE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=2971, - serialized_end=3044, + serialized_start=2995, + serialized_end=3068, ) @@ -1287,8 +1294,8 @@ _FSCLIENTPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=3047, - serialized_end=3260, + serialized_start=3071, + serialized_end=3284, ) _PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER From 644c13a3874b565b7bcda9beab9e9bb271e4ba3e Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 11 Dec 2018 03:38:58 +0000 Subject: [PATCH 222/719] fix compile error --- paddle/fluid/inference/tests/api/CMakeLists.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 6901aac3c3..8a4bc04b67 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,8 +1,7 @@ -set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) +set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor benchmark) if(WITH_GPU AND TENSORRT_FOUND) - set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} -ir_pass_manager analysis_predictor benchmark) + set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor) endif() function(download_model install_dir model_name) From 7604b1ad519b9581995e4d5aed148f2893c70702 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 11 Dec 2018 13:14:50 +0800 Subject: [PATCH 223/719] Fix Eigen macro when using GPU The macro should be defined by compiler rather than by source. test=develop --- cmake/configure.cmake | 1 + paddle/fluid/operators/bilinear_tensor_product_op.cu | 1 - paddle/fluid/operators/cos_sim_op.cu | 2 -- paddle/fluid/operators/crop_op.cu | 2 -- paddle/fluid/operators/dropout_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_add_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_div_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_max_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_min_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_mul_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_pow_op.cu | 2 -- paddle/fluid/operators/elementwise/elementwise_sub_op.cu | 2 -- paddle/fluid/operators/expand_op.cu | 3 --- paddle/fluid/operators/gru_unit_op.cu | 2 -- paddle/fluid/operators/hinge_loss_op.cu | 2 -- paddle/fluid/operators/huber_loss_op.cu | 2 -- paddle/fluid/operators/im2sequence_op.cu | 2 -- paddle/fluid/operators/isfinite_op.cu | 2 -- paddle/fluid/operators/l1_norm_op.cu | 2 -- paddle/fluid/operators/log_loss_op.cu | 2 -- paddle/fluid/operators/math/context_project.cu | 3 --- paddle/fluid/operators/math/math_function.cu | 2 -- paddle/fluid/operators/math/sequence2batch.cu | 2 -- paddle/fluid/operators/math/softmax.cu | 3 --- paddle/fluid/operators/mean_op.cu | 3 --- paddle/fluid/operators/optimizers/adadelta_op.cu | 2 -- paddle/fluid/operators/optimizers/adagrad_op.cu | 2 -- paddle/fluid/operators/optimizers/adam_op.cu | 2 -- paddle/fluid/operators/optimizers/adamax_op.cu | 2 -- paddle/fluid/operators/optimizers/decayed_adagrad_op.cu | 2 -- paddle/fluid/operators/optimizers/ftrl_op.cu | 2 -- paddle/fluid/operators/optimizers/proximal_adagrad_op.cu | 2 -- paddle/fluid/operators/optimizers/proximal_gd_op.cu | 2 -- paddle/fluid/operators/optimizers/rmsprop_op.cu | 2 -- paddle/fluid/operators/pad_constant_like_op.cu | 2 -- paddle/fluid/operators/pad_op.cu | 2 -- paddle/fluid/operators/sequence_ops/sequence_pool_op.cu | 3 --- paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu | 2 -- paddle/fluid/operators/smooth_l1_loss_op.cu | 3 --- paddle/fluid/operators/softmax_with_cross_entropy_op.cu | 3 --- paddle/fluid/operators/squared_l2_distance_op.cu | 3 --- paddle/fluid/operators/squared_l2_norm_op.cu | 2 -- paddle/fluid/operators/sum_op.cu | 2 -- paddle/fluid/platform/device_context.h | 1 - paddle/fluid/platform/float16.h | 3 --- 45 files changed, 1 insertion(+), 95 deletions(-) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 4e17ddee73..51f7a61631 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -86,6 +86,7 @@ endif(NOT WITH_GOLANG) if(WITH_GPU) add_definitions(-DPADDLE_WITH_CUDA) + add_definitions(-DEIGEN_USE_GPU) FIND_PACKAGE(CUDA REQUIRED) diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cu b/paddle/fluid/operators/bilinear_tensor_product_op.cu index 9426ffbe17..c2b4f69e68 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cu +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU #include "paddle/fluid/operators/bilinear_tensor_product_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/cos_sim_op.cu b/paddle/fluid/operators/cos_sim_op.cu index 82205e9c75..3d144ca29d 100644 --- a/paddle/fluid/operators/cos_sim_op.cu +++ b/paddle/fluid/operators/cos_sim_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/cos_sim_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu index b75678217e..66cb5c452d 100644 --- a/paddle/fluid/operators/crop_op.cu +++ b/paddle/fluid/operators/crop_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/crop_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index e011f47e08..d65491267d 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include #include #include diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 2fb7eeb4b9..fed12785f4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index c5a1a7e08d..1a149298fd 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index a90dcd3ecf..5d086a1b29 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_max_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index ab77709c28..cf93e5a97a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_min_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 4d16bc38e1..833c407282 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu index 6ee0779f23..9263dbfebf 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu @@ -8,8 +8,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_pow_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 8d9bf7c4d8..6f17d3292f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/expand_op.cu b/paddle/fluid/operators/expand_op.cu index 60363bfc86..d95c9b6180 100644 --- a/paddle/fluid/operators/expand_op.cu +++ b/paddle/fluid/operators/expand_op.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include "paddle/fluid/operators/expand_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/gru_unit_op.cu b/paddle/fluid/operators/gru_unit_op.cu index fc92b3d4a7..37689901ec 100644 --- a/paddle/fluid/operators/gru_unit_op.cu +++ b/paddle/fluid/operators/gru_unit_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/gru_unit_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/hinge_loss_op.cu b/paddle/fluid/operators/hinge_loss_op.cu index 9c0a85bee6..b5ea0a702e 100644 --- a/paddle/fluid/operators/hinge_loss_op.cu +++ b/paddle/fluid/operators/hinge_loss_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/hinge_loss_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu index 659464df9d..09c743c427 100644 --- a/paddle/fluid/operators/huber_loss_op.cu +++ b/paddle/fluid/operators/huber_loss_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/huber_loss_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/im2sequence_op.cu b/paddle/fluid/operators/im2sequence_op.cu index e0a5a90c1c..1c34640618 100644 --- a/paddle/fluid/operators/im2sequence_op.cu +++ b/paddle/fluid/operators/im2sequence_op.cu @@ -11,8 +11,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/im2sequence_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu index 8d1268b18c..995969cd42 100644 --- a/paddle/fluid/operators/isfinite_op.cu +++ b/paddle/fluid/operators/isfinite_op.cu @@ -11,8 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/isfinite_op.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/l1_norm_op.cu b/paddle/fluid/operators/l1_norm_op.cu index 1b48571dd7..a5c29bbf5d 100644 --- a/paddle/fluid/operators/l1_norm_op.cu +++ b/paddle/fluid/operators/l1_norm_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/l1_norm_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/log_loss_op.cu b/paddle/fluid/operators/log_loss_op.cu index e8bf7d8159..280913c43a 100644 --- a/paddle/fluid/operators/log_loss_op.cu +++ b/paddle/fluid/operators/log_loss_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/log_loss_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/math/context_project.cu b/paddle/fluid/operators/math/context_project.cu index 16205c0e14..f04b2d1534 100644 --- a/paddle/fluid/operators/math/context_project.cu +++ b/paddle/fluid/operators/math/context_project.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include "paddle/fluid/operators/math/context_project.h" namespace paddle { diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 79b7538ad0..9372d63f0b 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/blas.h" diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu index be73adfc0c..9ab13659c1 100644 --- a/paddle/fluid/operators/math/sequence2batch.cu +++ b/paddle/fluid/operators/math/sequence2batch.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/math/sequence2batch.h" namespace paddle { diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 2e9669049e..71d1373982 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 413b8ace67..921c2e1298 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include "paddle/fluid/operators/mean_op.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cu b/paddle/fluid/operators/optimizers/adadelta_op.cu index 3fbfee5df0..562a157f06 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cu +++ b/paddle/fluid/operators/optimizers/adadelta_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/adadelta_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu index 4efe56855a..5043468d4c 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/adagrad_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/optimizers/adagrad_op.h" diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index e8090ebacf..4eb2db717d 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/adam_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/adamax_op.cu b/paddle/fluid/operators/optimizers/adamax_op.cu index e54adcb142..80e0219d44 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cu +++ b/paddle/fluid/operators/optimizers/adamax_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/adamax_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu index 84d65e3932..dc568802a2 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/decayed_adagrad_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cu b/paddle/fluid/operators/optimizers/ftrl_op.cu index f836b75df9..acf8e38ca0 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cu +++ b/paddle/fluid/operators/optimizers/ftrl_op.cu @@ -10,8 +10,6 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/ftrl_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu index d1c1f747b7..591dead3b1 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu @@ -10,8 +10,6 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cu b/paddle/fluid/operators/optimizers/proximal_gd_op.cu index 7aa0e10150..d556fa74f1 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cu +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cu @@ -10,8 +10,6 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/proximal_gd_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cu b/paddle/fluid/operators/optimizers/rmsprop_op.cu index 69e35a309e..8b17d6a020 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.cu +++ b/paddle/fluid/operators/optimizers/rmsprop_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/optimizers/rmsprop_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/pad_constant_like_op.cu b/paddle/fluid/operators/pad_constant_like_op.cu index ea69577904..9e62a6dc9d 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cu +++ b/paddle/fluid/operators/pad_constant_like_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/pad_constant_like_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/pad_op.cu b/paddle/fluid/operators/pad_op.cu index 9cddef9cf1..95098a8dca 100644 --- a/paddle/fluid/operators/pad_op.cu +++ b/paddle/fluid/operators/pad_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/pad_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu index 63cd47a38a..4897474a48 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index 9aadac1a41..a1fbc7e5fa 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cu b/paddle/fluid/operators/smooth_l1_loss_op.cu index dfbb5c9058..e5df479090 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.cu +++ b/paddle/fluid/operators/smooth_l1_loss_op.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include "paddle/fluid/operators/smooth_l1_loss_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 6d48796191..cee3e87037 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" diff --git a/paddle/fluid/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu index 3e80ae8dd2..c9264da838 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.cu +++ b/paddle/fluid/operators/squared_l2_distance_op.cu @@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU - #include "paddle/fluid/operators/squared_l2_distance_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/squared_l2_norm_op.cu b/paddle/fluid/operators/squared_l2_norm_op.cu index 87830413da..e31cfeb78a 100644 --- a/paddle/fluid/operators/squared_l2_norm_op.cu +++ b/paddle/fluid/operators/squared_l2_norm_op.cu @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/squared_l2_norm_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index db4c2d6c11..6125ed07b6 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -8,8 +8,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#define EIGEN_USE_GPU #include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 3edd727978..ce1494f170 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/gpu_info.h" -#define EIGEN_USE_GPU #endif #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index 9d48557caf..98afe843c0 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -71,9 +71,6 @@ struct float16; } // namespace platform } // namespace paddle -// NOTE(): -// Do not move the eigen.h header, otherwise the eigen_vector will failed. -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/platform/hostdevice.h" #include "unsupported/Eigen/CXX11/Tensor" From eab47459658b10ec799a5dccbfd9bf8f45b9771a Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 26 Nov 2018 15:53:33 +0800 Subject: [PATCH 224/719] add adaptive mode for pool. --- paddle/fluid/operators/math/pooling.cc | 202 ++++++++++++++++++------- paddle/fluid/operators/math/pooling.cu | 26 ++-- 2 files changed, 161 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 8df43bb616..68fed9fd4e 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -19,6 +19,16 @@ namespace paddle { namespace operators { namespace math { +static inline int ADAPT_START_INDEX(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +static inline int ADAPT_END_INDEX(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + /* * All tensors are in NCHW format. * Ksize, strides, paddings are two elements. These two elements represent @@ -31,7 +41,7 @@ class Pool2dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -54,13 +64,23 @@ class Pool2dFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } T ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { @@ -68,8 +88,9 @@ class Pool2dFunctor { pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = exclusive ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + int pool_size = (exclusive || adaptive) + ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width + pw] = ele; } @@ -94,7 +115,7 @@ class Pool2dGradFunctor { const framework::Tensor& output, const framework::Tensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_grad_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -118,15 +139,26 @@ class Pool2dGradFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - int pool_size = exclusive ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } + int pool_size = (exclusive || adaptive) + ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; float scale = 1.0 / pool_size; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -251,7 +283,7 @@ class Pool3dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -279,17 +311,32 @@ class Pool3dFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); + if (adaptive) { + int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + } else { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } int output_idx = (pd * output_height + ph) * output_width + pw; T ele = pool_process.initial(); for (int d = dstart; d < dend; ++d) { @@ -302,7 +349,7 @@ class Pool3dFunctor { } } int pool_size = - exclusive + (exclusive || adaptive) ? (dend - dstart) * (hend - hstart) * (wend - wstart) : ksize_depth * ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); @@ -330,7 +377,7 @@ class Pool3dGradFunctor { const framework::Tensor& output, const framework::Tensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_grad_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -359,21 +406,35 @@ class Pool3dGradFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); + if (adaptive) { + int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + } else { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } int pool_size = - exclusive + (exclusive || adaptive) ? (dend - dstart) * (hend - hstart) * (wend - wstart) : ksize_depth * ksize_height * ksize_width; float scale = 1.0 / pool_size; @@ -517,8 +578,8 @@ class MaxPool2dWithIndexFunctor { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -541,13 +602,23 @@ class MaxPool2dWithIndexFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } T1 ele = static_cast(-FLT_MAX); int index = -1; @@ -666,17 +737,32 @@ class MaxPool3dWithIndexFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); + if (adaptive) { + int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + } else { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } int output_idx = (pd * output_height + ph) * output_width + pw; T1 ele = static_cast(-FLT_MAX); diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index cdc79e207a..06e92665c7 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -29,7 +29,7 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, PoolProcess pool_process, - bool exclusive, T* output_data) { + bool exclusive, bool adaptive, T* output_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -37,13 +37,21 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, int c = (index / output_width / output_height) % channels; int batch_idx = index / output_width / output_height / channels; - int hstart = ph * stride_height - padding_height; - int hend = min(hstart + ksize_height, input_height); - hstart = max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); - int wstart = pw * stride_width - padding_width; - int wend = min(wstart + ksize_width, input_width); - wstart = max(wstart, 0); + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int hstart = ph * stride_height - padding_height; + int hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + int wstart = pw * stride_width - padding_width; + int wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + } input_data += (batch_idx * channels + c) * input_height * input_width; T ele = pool_process.initial(); @@ -52,8 +60,8 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = exclusive ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + int pool_size = (exclusive || adaptive) ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); output_data[index] = ele; } From 266c6856c90836296f908afa5fff3e08b3ebb718 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 28 Nov 2018 22:09:23 +0800 Subject: [PATCH 225/719] add adaptive pool 2d & 3d. test=develop --- paddle/fluid/API.spec | 2 + paddle/fluid/operators/math/pooling.cc | 143 +++--- paddle/fluid/operators/math/pooling.cu | 411 +++++++++++------- paddle/fluid/operators/math/pooling.h | 20 +- paddle/fluid/operators/pool_op.cc | 26 +- paddle/fluid/operators/pool_op.h | 16 +- paddle/fluid/operators/pool_with_index_op.cc | 27 +- paddle/fluid/operators/pool_with_index_op.h | 12 +- paddle/fluid/operators/spp_op.h | 6 +- python/paddle/fluid/layers/nn.py | 186 ++++++++ .../fluid/tests/unittests/test_layers.py | 22 + .../fluid/tests/unittests/test_pool2d_op.py | 91 ++-- .../fluid/tests/unittests/test_pool3d_op.py | 121 ++++-- .../fluid/tests/unittests/test_pool_max_op.py | 95 +++- 14 files changed, 860 insertions(+), 318 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index fd4cf92d85..87ed586aad 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -77,6 +77,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) +paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, True, None)) +paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, True, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 68fed9fd4e..b4ee82add3 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -61,24 +61,26 @@ class Pool2dFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } @@ -136,24 +138,26 @@ class Pool2dGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } int pool_size = (exclusive || adaptive) @@ -308,33 +312,36 @@ class Pool3dFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); + int dstart, dend; + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); } else { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); dstart = std::max(dstart, 0); } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } int output_idx = (pd * output_height + ph) * output_width + pw; @@ -403,33 +410,36 @@ class Pool3dGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int dstart, dend; + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); } else { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); dstart = std::max(dstart, 0); } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } @@ -599,24 +609,26 @@ class MaxPool2dWithIndexFunctor { T1* output_data = output->mutable_data(context.GetPlace()); T2* mask_data = mask->mutable_data(context.GetPlace()); + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } @@ -655,7 +667,7 @@ class MaxPool2dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_height = input_grad->dims()[2]; @@ -708,8 +720,8 @@ class MaxPool3dWithIndexFunctor { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -734,33 +746,36 @@ class MaxPool3dWithIndexFunctor { T1* output_data = output->mutable_data(context.GetPlace()); T2* mask_data = mask->mutable_data(context.GetPlace()); + int dstart, dend; + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); } else { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); dstart = std::max(dstart, 0); } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } @@ -804,7 +819,7 @@ class MaxPool3dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_depth = input_grad->dims()[2]; diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 06e92665c7..5f3b82ed55 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -21,6 +21,18 @@ namespace paddle { namespace operators { namespace math { +__device__ __forceinline__ int ADAPT_START_INDEX(int ph, int input_size, + int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +__device__ __forceinline__ int ADAPT_END_INDEX(int ph, int input_size, + int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + template __global__ void KernelPool2D(const int nthreads, const T* input_data, const int channels, const int input_height, @@ -37,19 +49,21 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, int c = (index / output_width / output_height) % channels; int batch_idx = index / output_width / output_height / channels; + int hstart, hend; + int wstart, wend; if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int hstart = ph * stride_height - padding_height; - int hend = min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = min(hstart + ksize_height, input_height); hstart = max(hstart, 0); - int wstart = pw * stride_width - padding_width; - int wend = min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = min(wstart + ksize_width, input_width); wstart = max(wstart, 0); } @@ -74,7 +88,7 @@ __global__ void KernelPool2DGrad( const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, T* input_grad) { + PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int offsetW = index % input_width + padding_width; @@ -82,14 +96,24 @@ __global__ void KernelPool2DGrad( int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; - int phstart = (offsetH < ksize_height) - ? 0 - : (offsetH - ksize_height) / stride_height + 1; - int pwstart = (offsetW < ksize_width) - ? 0 - : (offsetW - ksize_width) / stride_width + 1; - int phend = min(offsetH / stride_height + 1, output_height); - int pwend = min(offsetW / stride_width + 1, output_width); + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + phstart = offsetH * output_height / input_height; + phend = + min((offsetH + 1) * output_height / input_height + 1, output_height); + pwstart = offsetW * output_width / input_width; + pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + } else { + phstart = (offsetH < ksize_height) + ? 0 + : (offsetH - ksize_height) / stride_height + 1; + pwstart = (offsetW < ksize_width) + ? 0 + : (offsetW - ksize_width) / stride_width + 1; + phend = min(offsetH / stride_height + 1, output_height); + pwend = min(offsetW / stride_width + 1, output_width); + } T gradient = 0; T input = input_data[index]; int output_idx = @@ -98,14 +122,22 @@ __global__ void KernelPool2DGrad( output_grad += output_idx; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - int pool_size = exclusive ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + int pool_size; + if (adaptive) { + pool_size = static_cast(ceil(static_cast(input_height) / + ksize_height)) * + static_cast( + ceil(static_cast(input_width) / ksize_width)); + } else { + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + pool_size = exclusive ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; + } int output_sub_idx = ph * output_width + pw; pool_process.compute(input, output_data[output_sub_idx], output_grad[output_sub_idx], @@ -189,7 +221,7 @@ void Pool2dDirectCUDAFunctor::operator()( KernelPool2D<<>>( nthreads, input, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, - padding_height, padding_width, pool_compute, exclusive, output); + padding_height, padding_width, pool_compute, exclusive, false, output); } /* @@ -204,7 +236,7 @@ class Pool2dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -231,7 +263,7 @@ class Pool2dFunctor { nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, pool_process, exclusive, - output_data); + adaptive, output_data); } }; @@ -250,7 +282,8 @@ class Pool2dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -278,7 +311,7 @@ class Pool2dGradFunctor { nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, - pool_process, exclusive, input_grad_data); + pool_process, exclusive, adaptive, input_grad_data); } }; @@ -367,7 +400,7 @@ __global__ void KernelPool3D( const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, T* output_data) { + PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -376,15 +409,30 @@ __global__ void KernelPool3D( int c = (index / output_width / output_height / output_depth) % channels; int batch_idx = index / output_width / output_height / output_depth / channels; - int dstart = pd * stride_depth - padding_depth; - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int dend = min(dstart + ksize_depth, input_depth); - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); + + int dstart, dend; + int hstart, hend; + int wstart, wend; + if (adaptive) { + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); + + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + dstart = pd * stride_depth - padding_depth; + hstart = ph * stride_height - padding_height; + wstart = pw * stride_width - padding_width; + dend = min(dstart + ksize_depth, input_depth); + hend = min(hstart + ksize_height, input_height); + wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + } T ele = pool_process.initial(); input_data += (batch_idx * channels + c) * input_depth * input_height * input_width; @@ -396,7 +444,7 @@ __global__ void KernelPool3D( } } } - int pool_size = exclusive + int pool_size = (exclusive || adaptive) ? (dend - dstart) * (hend - hstart) * (wend - wstart) : ksize_depth * ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); @@ -413,7 +461,7 @@ __global__ void KernelPool3DGrad( const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, PoolProcess pool_process, - bool exclusive, T* input_grad) { + bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int offsetW = index % input_width + padding_width; @@ -423,18 +471,31 @@ __global__ void KernelPool3DGrad( int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; - int pdstart = (offsetD < ksize_depth) - ? 0 - : (offsetD - ksize_depth) / stride_depth + 1; - int phstart = (offsetH < ksize_height) - ? 0 - : (offsetH - ksize_height) / stride_height + 1; - int pwstart = (offsetW < ksize_width) - ? 0 - : (offsetW - ksize_width) / stride_width + 1; - int pdend = min((offsetD) / stride_depth + 1, output_depth); - int phend = min((offsetH) / stride_height + 1, output_height); - int pwend = min((offsetW) / stride_width + 1, output_width); + int pdstart, pdend; + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + pdstart = offsetD * output_depth / input_depth; + pdend = min((offsetD + 1) * output_depth / input_depth + 1, output_depth); + phstart = offsetH * output_height / input_height; + phend = + min((offsetH + 1) * output_height / input_height + 1, output_height); + pwstart = offsetW * output_width / input_width; + pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + } else { + pdstart = (offsetD < ksize_depth) + ? 0 + : (offsetD - ksize_depth) / stride_depth + 1; + phstart = (offsetH < ksize_height) + ? 0 + : (offsetH - ksize_height) / stride_height + 1; + pwstart = (offsetW < ksize_width) + ? 0 + : (offsetW - ksize_width) / stride_width + 1; + pdend = min((offsetD) / stride_depth + 1, output_depth); + phend = min((offsetH) / stride_height + 1, output_height); + pwend = min((offsetW) / stride_width + 1, output_width); + } T gradient = 0; T input = input_data[index]; @@ -447,18 +508,29 @@ __global__ void KernelPool3DGrad( for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size - int dstart = pd * stride_depth - padding_depth; - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int dend = min(dstart + ksize_depth, input_depth); - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - int pool_size = - exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + int pool_size; + if (adaptive) { + pool_size = + static_cast( + ceil(static_cast(input_depth) / ksize_depth)) * + static_cast( + ceil(static_cast(input_height) / ksize_height)) * + static_cast( + ceil(static_cast(input_width) / ksize_width)); + } else { + int dstart = pd * stride_depth - padding_depth; + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int dend = min(dstart + ksize_depth, input_depth); + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + pool_size = + exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart) + : ksize_depth * ksize_height * ksize_width; + } int output_sub_idx = (pd * output_height + ph) * output_width + pw; pool_process.compute(input, output_data[output_sub_idx], output_grad[output_sub_idx], @@ -533,7 +605,7 @@ class Pool3dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -567,7 +639,7 @@ class Pool3dFunctor { input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, padding_depth, padding_height, padding_width, pool_process, exclusive, - output_data); + adaptive, output_data); } }; @@ -586,7 +658,8 @@ class Pool3dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -622,7 +695,7 @@ class Pool3dGradFunctor { input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, padding_depth, padding_height, - padding_width, pool_process, exclusive, input_grad_data); + padding_width, pool_process, exclusive, adaptive, input_grad_data); } }; @@ -711,7 +784,7 @@ __global__ void KernelMaxPool2dWithIdx( const int input_height, const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, - const int padding_width, T1* output_data, T2* mask_data) { + const int padding_width, bool adaptive, T1* output_data, T2* mask_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -719,13 +792,23 @@ __global__ void KernelMaxPool2dWithIdx( int c = (index / output_width / output_height) % channels; int batch_idx = index / output_width / output_height / channels; - int hstart = ph * stride_height - padding_height; - int hend = min(hstart + ksize_height, input_height); - hstart = max(hstart, 0); + int hstart, hend; + int wstart, wend; + if (adaptive) { + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); - int wstart = pw * stride_width - padding_width; - int wend = min(wstart + ksize_width, input_width); - wstart = max(wstart, 0); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + hstart = ph * stride_height - padding_height; + hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + wstart = pw * stride_width - padding_width; + wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + } input_data += (batch_idx * channels + c) * input_height * input_width; T1 ele = -FLT_MAX; @@ -750,36 +833,46 @@ __global__ void KernelMaxPool2DWithIdxGrad( const int channels, const int input_height, const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, T1* input_grad) { + const int padding_height, const int padding_width, bool adaptive, + T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int w_offset = index % input_width; - int h_offset = (index / input_width) % input_height; - int c_offset = (index / input_width / input_height) % channels; + int offsetW = index % input_width; + int offsetH = (index / input_width) % input_height; + int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; - int ph_start = - (h_offset + padding_height < ksize_height) - ? 0 - : (h_offset + padding_height - ksize_height) / stride_height + 1; - int pw_start = - (w_offset + padding_width < ksize_width) - ? 0 - : (w_offset + padding_width - ksize_width) / stride_width + 1; - int ph_end = - min((h_offset + padding_height) / stride_height + 1, output_height); - int pw_end = - min((w_offset + padding_width) / stride_width + 1, output_width); + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + phstart = offsetH * output_height / input_height; + phend = + min((offsetH + 1) * output_height / input_height + 1, output_height); + pwstart = offsetW * output_width / input_width; + pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + } else { + phstart = + (offsetH + padding_height < ksize_height) + ? 0 + : (offsetH + padding_height - ksize_height) / stride_height + 1; + pwstart = + (offsetW + padding_width < ksize_width) + ? 0 + : (offsetW + padding_width - ksize_width) / stride_width + 1; + phend = + min((offsetH + padding_height) / stride_height + 1, output_height); + pwend = min((offsetW + padding_width) / stride_width + 1, output_width); + } T1 gradient = 0; - int input_current_featuremap_idx = h_offset * input_width + w_offset; + int input_current_featuremap_idx = offsetH * input_width + offsetW; int output_idx = - (batch_idx * channels + c_offset) * output_height * output_width; + (batch_idx * channels + offsetC) * output_height * output_width; mask_data += output_idx; output_grad += output_idx; - for (int ph = ph_start; ph < ph_end; ++ph) { - for (int pw = pw_start; pw < pw_end; ++pw) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { if (mask_data[ph * output_width + pw] == input_current_featuremap_idx) gradient += output_grad[ph * output_width + pw]; } @@ -799,8 +892,8 @@ class MaxPool2dWithIndexFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -827,7 +920,8 @@ class MaxPool2dWithIndexFunctor { KernelMaxPool2dWithIdx<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, output_data, mask_data); + stride_width, padding_height, padding_width, adaptive, output_data, + mask_data); } }; @@ -843,7 +937,7 @@ class MaxPool2dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; @@ -870,7 +964,7 @@ class MaxPool2dWithIndexGradFunctor { KernelMaxPool2DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, + stride_height, stride_width, padding_height, padding_width, adaptive, input_grad_data); } }; @@ -892,7 +986,7 @@ __global__ void KernelMaxPool3DWithIdx( const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, - T1* output_data, T2* mask_data) { + bool adaptive, T1* output_data, T2* mask_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -902,15 +996,29 @@ __global__ void KernelMaxPool3DWithIdx( int batch_idx = index / output_width / output_height / output_depth / channels; - int dstart = pd * stride_depth - padding_depth; - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int dend = min(dstart + ksize_depth, input_depth); - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); + int dstart, dend; + int hstart, hend; + int wstart, wend; + if (adaptive) { + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); + + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + dstart = pd * stride_depth - padding_depth; + hstart = ph * stride_height - padding_height; + wstart = pw * stride_width - padding_width; + dend = min(dstart + ksize_depth, input_depth); + hend = min(hstart + ksize_height, input_height); + wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + } T1 ele = -FLT_MAX; int max_index = -1; @@ -940,46 +1048,56 @@ __global__ void KernelMaxPool3DWithIdxGrad( const int output_width, const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, - const int padding_width, T1* input_grad) { + const int padding_width, bool adaptive, T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int w_offset = index % input_width; - int h_offset = (index / input_width) % input_height; - int d_offset = (index / input_width / input_height) % input_depth; - int c_offset = - (index / input_width / input_height / input_depth) % channels; + int offsetW = index % input_width; + int offsetH = (index / input_width) % input_height; + int offsetD = (index / input_width / input_height) % input_depth; + int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; - int pd_start = - (d_offset + padding_depth < ksize_depth) - ? 0 - : (d_offset + padding_depth - ksize_depth) / stride_depth + 1; - int ph_start = - (h_offset + padding_height < ksize_height) - ? 0 - : (h_offset + padding_height - ksize_height) / stride_height + 1; - int pw_start = - (w_offset + padding_width < ksize_width) - ? 0 - : (w_offset + padding_width - ksize_width) / stride_width + 1; - int pd_end = - min((d_offset + padding_depth) / stride_depth + 1, output_depth); - int ph_end = - min((h_offset + padding_height) / stride_height + 1, output_height); - int pw_end = - min((w_offset + padding_width) / stride_width + 1, output_width); + int pdstart, pdend; + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + pdstart = offsetD * output_depth / input_depth; + pdend = min((offsetD + 1) * output_depth / input_depth + 1, output_depth); + phstart = offsetH * output_height / input_height; + phend = + min((offsetH + 1) * output_height / input_height + 1, output_height); + pwstart = offsetW * output_width / input_width; + pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + } else { + pdstart = + (offsetD + padding_depth < ksize_depth) + ? 0 + : (offsetD + padding_depth - ksize_depth) / stride_depth + 1; + phstart = + (offsetH + padding_height < ksize_height) + ? 0 + : (offsetH + padding_height - ksize_height) / stride_height + 1; + pwstart = + (offsetW + padding_width < ksize_width) + ? 0 + : (offsetW + padding_width - ksize_width) / stride_width + 1; + pdend = min((offsetD + padding_depth) / stride_depth + 1, output_depth); + phend = + min((offsetH + padding_height) / stride_height + 1, output_height); + pwend = min((offsetW + padding_width) / stride_width + 1, output_width); + } T1 gradient = 0; int input_current_feature_map_idx = - (d_offset * input_height + h_offset) * input_width + w_offset; - int output_idx = (batch_idx * channels + c_offset) * output_depth * + (offsetD * input_height + offsetH) * input_width + offsetW; + int output_idx = (batch_idx * channels + offsetC) * output_depth * output_height * output_width; mask += output_idx; output_grad += output_idx; - for (int pd = pd_start; pd < pd_end; ++pd) { - for (int ph = ph_start; ph < ph_end; ++ph) { - for (int pw = pw_start; pw < pw_end; ++pw) { + for (int pd = pdstart; pd < pdend; ++pd) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { if (mask[(pd * output_height + ph) * output_width + pw] == input_current_feature_map_idx) gradient += @@ -1002,8 +1120,8 @@ class MaxPool3dWithIndexFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -1037,7 +1155,8 @@ class MaxPool3dWithIndexFunctor { nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, output_data, mask_data); + padding_depth, padding_height, padding_width, adaptive, output_data, + mask_data); } }; @@ -1053,7 +1172,7 @@ class MaxPool3dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; @@ -1087,7 +1206,7 @@ class MaxPool3dWithIndexGradFunctor { nthreads, output_grad_data, mask_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, - stride_width, padding_depth, padding_height, padding_width, + stride_width, padding_depth, padding_height, padding_width, adaptive, input_grad_data); } }; diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 923babd4c2..d123af8924 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -102,7 +102,7 @@ class Pool2dFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* output); + bool exclusive, bool adaptive, framework::Tensor* output); }; template @@ -114,7 +114,7 @@ class Pool2dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* input_grad); + bool exclusive, bool adaptive, framework::Tensor* input_grad); }; template @@ -136,7 +136,7 @@ class Pool3dFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* output); + bool exclusive, bool adaptive, framework::Tensor* output); }; template @@ -148,7 +148,7 @@ class Pool3dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* input_grad); + bool exclusive, bool adaptive, framework::Tensor* input_grad); }; template @@ -176,8 +176,8 @@ class MaxPool2dWithIndexFunctor { void operator()(const DeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask); + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask); }; template @@ -187,7 +187,7 @@ class MaxPool2dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad); }; @@ -197,8 +197,8 @@ class MaxPool3dWithIndexFunctor { void operator()(const DeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask); + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask); }; template @@ -208,7 +208,7 @@ class MaxPool3dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad); }; diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 52b607df74..11b5c49323 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -52,6 +52,7 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const { std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); bool ceil_mode = ctx->Attrs().Get("ceil_mode"); + bool adaptive = ctx->Attrs().Get("adaptive"); PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, "Pooling intput should be 4-D or 5-D tensor."); @@ -72,9 +73,13 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const { "Paddings size and pooling size should be the same."); std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i], ceil_mode)); + if (adaptive) { + output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); + } else { + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back(PoolOutputSize( + in_x_dims[i + 2], ksize[i], paddings[i], strides[i], ceil_mode)); + } } ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->ShareLoD("X", "Out"); @@ -186,6 +191,14 @@ void Pool2dOpMaker::Make() { "averaging calculating, otherwise, include the zero-padding. Note, it " "is only used when pooling_type is avg. The defalut is True.") .SetDefault(true); + AddAttr( + "adaptive", + "(bool, default False) When true, will perform adaptive pooling instead, " + "output shape in H and W dimensions will be same as ksize, input data " + "will be divided into grids specify by ksize averagely and perform " + "pooling in each grid area to get output pooling value.") + .SetDefault(false); + AddAttr( "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") @@ -325,6 +338,13 @@ void Pool3dOpMaker::Make() { "averaging calculating, otherwise, include the zero-padding. Note, it " "is only used when pooling_type is avg. The defalut is True.") .SetDefault(true); + AddAttr( + "adaptive", + "(bool, default False) When true, will perform adaptive pooling instead, " + "output shape in H and W dimensions will be same as ksize, input data " + "will be divided into grids specify by ksize averagely and perform " + "pooling in each grid area to get output pooling value.") + .SetDefault(false); AddAttr( "use_cudnn", diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index c0594b7e3c..6c5900bd0f 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -70,6 +70,7 @@ class PoolKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); bool exclusive = context.Attr("exclusive"); + bool adaptive = context.Attr("adaptive"); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -85,7 +86,7 @@ class PoolKernel : public framework::OpKernel { pool2d_forward; paddle::operators::math::MaxPool pool_process; pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - true, out); + true, false, out); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dFunctor< @@ -93,7 +94,7 @@ class PoolKernel : public framework::OpKernel { pool2d_forward; paddle::operators::math::AvgPool pool_process; pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - exclusive, out); + exclusive, adaptive, out); } } break; case 3: { @@ -103,14 +104,14 @@ class PoolKernel : public framework::OpKernel { pool3d_forward; paddle::operators::math::MaxPool pool_process; pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - true, out); + true, false, out); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dFunctor< DeviceContext, paddle::operators::math::AvgPool, T> pool3d_forward; paddle::operators::math::AvgPool pool_process; pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - exclusive, out); + exclusive, adaptive, out); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } @@ -133,6 +134,7 @@ class PoolGradKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); bool exclusive = context.Attr("exclusive"); + bool adaptive = context.Attr("adaptive"); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { @@ -159,7 +161,8 @@ class PoolGradKernel : public framework::OpKernel { pool2d_backward; paddle::operators::math::AvgPoolGrad pool_process; pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, pool_process, exclusive, in_x_grad); + paddings, pool_process, exclusive, adaptive, + in_x_grad); } } break; case 3: { @@ -174,7 +177,8 @@ class PoolGradKernel : public framework::OpKernel { pool3d_backward; paddle::operators::math::AvgPoolGrad pool_process; pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, pool_process, exclusive, in_x_grad); + paddings, pool_process, exclusive, adaptive, + in_x_grad); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index 873706593e..f9e25277e5 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -40,6 +40,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); + bool adaptive = ctx->Attrs().Get("adaptive"); PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, "Pooling intput should be 4-D or 5-D tensor."); @@ -60,9 +61,13 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { "Paddings size and pooling size should be the same."); std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); + if (adaptive) { + output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); + } else { + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], + paddings[i], strides[i])); + } } ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->SetOutputDim("Mask", framework::make_ddim(output_shape)); @@ -133,6 +138,14 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default:false) Whether to use the global pooling. " "If global_pooling = true, ksize and paddings will be ignored.") .SetDefault(false); + AddAttr( + "adaptive", + "(bool, default False) When true, will perform adaptive pooling " + "instead, " + "output shape in H and W dimensions will be same as ksize, input data " + "will be divided into grids specify by ksize averagely and perform " + "pooling in each grid area to get output pooling value.") + .SetDefault(false); AddAttr>("strides", "(vector, default {1, 1}), strides(height, " "width) of pooling operator.") @@ -209,6 +222,14 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Whether to use the global pooling. " "If global_pooling = true, ksize and paddings will be ignored.") .SetDefault(false); + AddAttr( + "adaptive", + "(bool, default False) When true, will perform adaptive pooling " + "instead, " + "output shape in H and W dimensions will be same as ksize, input data " + "will be divided into grids specify by ksize averagely and perform " + "pooling in each grid area to get output pooling value.") + .SetDefault(false); AddAttr>("strides", "(vector, default {1,1,1}), strides(depth, " "height, width) of pooling operator.") diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h index b55fa76eae..a6bec121d4 100644 --- a/paddle/fluid/operators/pool_with_index_op.h +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -36,6 +36,7 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + bool adaptive = context.Attr("adaptive"); auto& dev_ctx = context.template device_context(); if (context.Attr("global_pooling")) { @@ -50,13 +51,15 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { paddle::operators::math::MaxPool2dWithIndexFunctor pool2d_forward; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, + mask); } break; case 3: { paddle::operators::math::MaxPool3dWithIndexFunctor pool3d_forward; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, + mask); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } @@ -75,6 +78,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + bool adaptive = context.Attr("adaptive"); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -93,14 +97,14 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { T1, T2> pool2d_backward; pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, in_x_grad); + paddings, adaptive, in_x_grad); } break; case 3: { paddle::operators::math::MaxPool3dWithIndexGradFunctor pool3d_backward; pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, in_x_grad); + paddings, adaptive, in_x_grad); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index 35d9737ee0..3c2d51ec91 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -56,13 +56,13 @@ class SppKernel : public framework::OpKernel { math::Pool2dFunctor, T> pool_forward; math::MaxPool max_process; pool_forward(context.template device_context(), *in_x, - kernel_size, strides, paddings, max_process, true, + kernel_size, strides, paddings, max_process, true, false, &out_level); } else if (pooling_type == "avg") { math::Pool2dFunctor, T> pool_forward; math::AvgPool avg_process; pool_forward(context.template device_context(), *in_x, - kernel_size, strides, paddings, avg_process, true, + kernel_size, strides, paddings, avg_process, true, false, &out_level); } // flatten pooling output shape @@ -156,7 +156,7 @@ class SppGradKernel : public framework::OpKernel { math::AvgPoolGrad avg_process; pool_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, - paddings, avg_process, true, in_x_grad); + paddings, avg_process, true, false, in_x_grad); } } } diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e25eaaa9fd..61794f0d49 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -52,6 +52,8 @@ __all__ = [ 'softmax', 'pool2d', 'pool3d', + 'adaptive_pool2d', + 'adaptive_pool3d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', @@ -2499,6 +2501,190 @@ def pool3d(input, return pool_out +@templatedoc(op_type="pool2d") +def adaptive_pool2d(input, + pool_size, + pool_type="max", + require_index=False, + use_cudnn=True, + name=None): + """ + ${comment} + + Args: + input (Variable): The input tensor of pooling operator. The format of + input tensor is NCHW, where N is batch size, C is + the number of channels, H is the height of the + feature, and W is the width of the feature. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two integers, (pool_size_Height, pool_size_Width). + pool_type: ${pooling_type_comment} + require_index (bool): If true, the index of max pooling point along with outputs. + it cannot be set in average pooling type. + use_cudnn (bool): ${use_cudnn_comment} + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The pooling result. + + Raises: + ValueError: 'pool_type' is not 'max' nor 'avg'. + ValueError: 'use_cudnn' is not a bool value. + ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. + ValueError: 'pool_size' should be a list or tuple with length as 2. + + Examples: + + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[3, 32, 32], dtype='float32') + conv2d = fluid.layers.pool2d( + input=data, + pool_size=[3, 3], + pool_type='max', + require_index=True) + """ + if pool_type not in ["max", "avg"]: + raise ValueError( + "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", + str(pool_type)) + + if pool_type == "avg" and require_index: + raise ValueError( + "invalid setting 'require_index' true when 'pool_type' is 'avg'.") + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + if not _is_list_or_tuple_(pool_size) or len(pool_size) != 2: + raise ValueError( + "'pool_size' should be a list or tuple with length as 2.") + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False.") + + if pool_type == "max": + l_type = 'max_pool2d_with_index' + else: + l_type = "pool2d" + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + outputs = {"Out": pool_out} + if pool_type == "max": + mask = helper.create_variable_for_type_inference(dtype) + outputs["Mask"] = mask + + helper.append_op( + type=l_type, + inputs={"X": input}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "use_cudnn": use_cudnn, + "adaptive": True, + }) + + return pool_out + + +@templatedoc(op_type="pool3d") +def adaptive_pool3d(input, + pool_size, + pool_type="max", + require_index=False, + use_cudnn=True, + name=None): + """ + ${comment} + + Args: + input (Variable): The input tensor of pooling operator. The format of + input tensor is NCHW, where N is batch size, C is + the number of channels, H is the height of the + feature, and W is the width of the feature. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two integers, (Depth, Height, Width). + pool_type: ${pooling_type_comment} + require_index (bool): If true, the index of max pooling point along with outputs. + it cannot be set in average pooling type. + use_cudnn (bool): ${use_cudnn_comment} + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The pooling result. + + Raises: + ValueError: 'pool_type' is not 'max' nor 'avg'. + ValueError: 'use_cudnn' is not a bool value. + ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. + ValueError: 'pool_size' should be a list or tuple with length as 2. + + Examples: + + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[3, 32, 32], dtype='float32') + conv2d = fluid.layers.pool2d( + input=data, + pool_size=[3, 3], + pool_type='max', + require_index=True) + """ + if pool_type not in ["max", "avg"]: + raise ValueError( + "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", + str(pool_type)) + + if pool_type == "avg" and require_index: + raise ValueError( + "invalid setting 'require_index' true when 'pool_type' is 'avg'.") + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + if not _is_list_or_tuple_(pool_size) or len(pool_size) != 3: + raise ValueError( + "'pool_size' should be a list or tuple with length as 3.") + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False.") + + if pool_type == "max": + l_type = 'max_pool3d_with_index' + else: + l_type = "pool3d" + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + outputs = {"Out": pool_out} + if pool_type == "max": + mask = helper.create_variable_for_type_inference(dtype) + outputs["Mask"] = mask + + helper.append_op( + type=l_type, + inputs={"X": input}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "use_cudnn": use_cudnn, + "adaptive": True, + }) + + return pool_out + + def batch_norm(input, act=None, is_test=False, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 10e8bb5a86..9785b5063c 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -233,6 +233,28 @@ class TestBook(unittest.TestCase): pool_stride=[1, 2], pool_padding=(2, 1))) + def test_adaptive_pool2d(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') + self.assertIsNotNone( + layers.adaptive_pool2d( + x, [3, 3], require_index=True)) + self.assertIsNotNone( + layers.adaptive_pool2d( + x, [3, 3], pool_type='avg')) + + def test_adaptive_pool3d(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 244, 224, 224], dtype='float32') + self.assertIsNotNone( + layers.adaptive_pool3d( + x, [3, 3, 3], require_index=True)) + self.assertIsNotNone( + layers.adaptive_pool3d( + x, [3, 3, 3], pool_type='avg')) + def test_lstm_unit(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 47b2e71a4e..5ccdf082e8 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +from __future__ import division import unittest import numpy as np @@ -21,29 +22,47 @@ import paddle.fluid.core as core from op_test import OpTest +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, H, W = x.shape if global_pool == 1: ksize = [H, W] - H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) for i in range(H_out): for j in range(W_out): - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + r_start = adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + c_start = np.max((j * strides[1] - paddings[1], 0)) + c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, r_start:r_end, c_start:c_end] out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) @@ -56,27 +75,37 @@ def avg_pool2D_forward_naive(x, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, H, W = x.shape if global_pool == 1: ksize = [H, W] - H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) for i in range(H_out): for j in range(W_out): - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + r_start = adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + c_start = np.max((j * strides[1] - paddings[1], 0)) + c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, r_start:r_end, c_start:c_end] - field_size = ((r_end - r_start) * (c_end - c_start)) if exclusive \ - else (ksize[0] * ksize[1]) + field_size = ((r_end - r_start) * (c_end - c_start)) \ + if (exclusive or adaptive) else (ksize[0] * ksize[1]) out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size return out @@ -93,12 +122,13 @@ class TestPool2D_Op(OpTest): self.init_pool_type() self.init_ceil_mode() self.init_exclusive() + self.init_adaptive() if self.global_pool: self.paddings = [0 for _ in range(len(self.paddings))] input = np.random.random(self.shape).astype(self.dtype) output = self.pool2D_forward_naive( input, self.ksize, self.strides, self.paddings, self.global_pool, - self.ceil_mode, self.exclusive).astype(self.dtype) + self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { @@ -112,7 +142,8 @@ class TestPool2D_Op(OpTest): 'ceil_mode': self.ceil_mode, 'data_format': 'AnyLayout', # TODO(dzhwinter) : should be fix latter - 'exclusive': self.exclusive + 'exclusive': self.exclusive, + 'adaptive': self.adaptive } self.outputs = {'Out': output} @@ -159,6 +190,9 @@ class TestPool2D_Op(OpTest): def init_exclusive(self): self.exclusive = True + def init_adaptive(self): + self.adaptive = False + class TestCase1(TestPool2D_Op): def init_test_case(self): @@ -315,5 +349,10 @@ class TestCUDNNAvgInclude(TestCase2): self.exclusive = False +class TestAvgPoolAdaptive(TestCase1): + def init_adaptive(self): + self.adaptive = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index f05f8ccb39..47a5b2d1ab 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +from __future__ import division import unittest import numpy as np @@ -21,35 +22,59 @@ import paddle.fluid.core as core from op_test import OpTest +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, D, H, W = x.shape if global_pool == 1: ksize = [D, H, W] - D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 - ) // strides[2] + 1 if ceil_mode else ( - W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 + if adaptive: + D_out, H_out, W_out = ksize + else: + D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 + ) // strides[2] + 1 if ceil_mode else ( + W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) for k in range(D_out): - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) + if adaptive: + d_start = adaptive_start_index(k, D, ksize[0]) + d_end = adaptive_end_index(k, D, ksize[0]) + else: + d_start = np.max((k * strides[0] - paddings[0], 0)) + d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) for i in range(H_out): - h_start = np.max((i * strides[0] - paddings[0], 0)) - h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + if adaptive: + h_start = adaptive_start_index(i, H, ksize[1]) + h_end = adaptive_end_index(i, H, ksize[1]) + else: + h_start = np.max((i * strides[1] - paddings[1], 0)) + h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) for j in range(W_out): - w_start = np.max((j * strides[1] - paddings[1], 0)) - w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + w_start = adaptive_start_index(j, W, ksize[2]) + w_end = adaptive_end_index(j, W, ksize[2]) + else: + w_start = np.max((j * strides[2] - paddings[2], 0)) + w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) @@ -62,33 +87,49 @@ def avg_pool3D_forward_naive(x, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, D, H, W = x.shape if global_pool == 1: ksize = [D, H, W] - D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 - ) // strides[2] + 1 if ceil_mode else ( - W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 + if adaptive: + D_out, H_out, W_out = ksize + else: + D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 + ) // strides[2] + 1 if ceil_mode else ( + W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) for k in range(D_out): - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) + if adaptive: + d_start = adaptive_start_index(k, D, ksize[0]) + d_end = adaptive_end_index(k, D, ksize[0]) + else: + d_start = np.max((k * strides[0] - paddings[0], 0)) + d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) for i in range(H_out): - h_start = np.max((i * strides[0] - paddings[0], 0)) - h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + if adaptive: + h_start = adaptive_start_index(i, H, ksize[1]) + h_end = adaptive_end_index(i, H, ksize[1]) + else: + h_start = np.max((i * strides[1] - paddings[1], 0)) + h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) for j in range(W_out): - w_start = np.max((j * strides[1] - paddings[1], 0)) - w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + w_start = adaptive_start_index(j, W, ksize[2]) + w_end = adaptive_end_index(j, W, ksize[2]) + else: + w_start = np.max((j * strides[2] - paddings[2], 0)) + w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \ - if exclusive else ksize[0] * ksize[1] * ksize[2] + if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2] out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / field_size return out @@ -105,13 +146,14 @@ class TestPool3d_Op(OpTest): self.init_pool_type() self.init_ceil_mode() self.init_exclusive() + self.init_adaptive() if self.global_pool: self.paddings = [0 for _ in range(len(self.paddings))] input = np.random.random(self.shape).astype(self.dtype) output = self.pool3D_forward_naive( input, self.ksize, self.strides, self.paddings, self.global_pool, - self.ceil_mode, self.exclusive).astype(self.dtype) + self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { @@ -124,7 +166,8 @@ class TestPool3d_Op(OpTest): 'ceil_mode': self.ceil_mode, 'data_format': 'AnyLayout', # TODO(dzhwinter) : should be fix latter - 'exclusive': self.exclusive + 'exclusive': self.exclusive, + 'adaptive': self.adaptive } self.outputs = {'Out': output} @@ -171,6 +214,9 @@ class TestPool3d_Op(OpTest): def init_exclusive(self): self.exclusive = True + def init_adaptive(self): + self.adaptive = False + class TestCase1(TestPool3d_Op): def init_test_case(self): @@ -353,5 +399,10 @@ class TestCUDNNAvgInclude(TestCUDNNCase3): self.exclusive = False +class TestAvgPoolAdaptive(TestCase1): + def init_adaptive(self): + self.adaptive = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py index 488ff431d4..6575c408ee 100644 --- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool_max_op.py @@ -13,33 +13,62 @@ # limitations under the License. from __future__ import print_function +from __future__ import division import unittest import numpy as np from op_test import OpTest -def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False): +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def max_pool3D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=False, + adaptive=False): N, C, D, H, W = x.shape if global_pool: ksize = [D, H, W] paddings = [0, 0, 0] - D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 + if adaptive: + D_out, H_out, W_out = ksize + else: + D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) mask = np.zeros((N, C, D_out, H_out, W_out)) for k in range(D_out): - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) + if adaptive: + d_start = adaptive_start_index(k, D, ksize[0]) + d_end = adaptive_end_index(k, D, ksize[0]) + else: + d_start = np.max((k * strides[0] - paddings[0], 0)) + d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) for i in range(H_out): - h_start = np.max((i * strides[0] - paddings[0], 0)) - h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + if adaptive: + h_start = adaptive_start_index(i, H, ksize[1]) + h_end = adaptive_end_index(i, H, ksize[1]) + else: + h_start = np.max((i * strides[1] - paddings[1], 0)) + h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) for j in range(W_out): - w_start = np.max((j * strides[1] - paddings[1], 0)) - w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + w_start = adaptive_start_index(j, W, ksize[2]) + w_end = adaptive_end_index(j, W, ksize[2]) + else: + w_start = np.max((j * strides[2] - paddings[2], 0)) + w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) @@ -58,23 +87,37 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False): return out, mask -def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False): +def max_pool2D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=False, + adaptive=False): N, C, H, W = x.shape if global_pool: ksize = [H, W] paddings = [0, 0] - H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) mask = np.zeros((N, C, H_out, W_out)) for i in range(H_out): for j in range(W_out): - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + r_start = adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + c_start = np.max((j * strides[1] - paddings[1], 0)) + c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, r_start:r_end, c_start:c_end] out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) @@ -95,10 +138,12 @@ class TestMaxPoolWithIndex_Op(OpTest): def setUp(self): self.init_test_case() self.init_global() + self.init_adaptive() input = np.random.random(self.shape).astype("float32") output, mask = self.pool_forward_naive(input, self.ksize, self.strides, - self.paddings, self.global_pool) + self.paddings, self.global_pool, + self.adaptive) output = output.astype("float32") mask = mask.astype("int32") @@ -107,6 +152,7 @@ class TestMaxPoolWithIndex_Op(OpTest): 'paddings': self.paddings, 'ksize': self.ksize, 'global_pooling': self.global_pool, + 'adaptive': self.adaptive, } self.inputs = {'X': input} @@ -129,6 +175,9 @@ class TestMaxPoolWithIndex_Op(OpTest): def init_global(self): self.global_pool = False + def init_adaptive(self): + self.adaptive = False + class TestCase1(TestMaxPoolWithIndex_Op): def init_global(self): @@ -190,5 +239,15 @@ class TestCase7(TestCase6): self.global_pool = False +class TestCastAdaptive2d(TestCase6): + def init_adaptive(self): + self.adaptive = True + + +class TestCastAdaptive3d(TestMaxPoolWithIndex_Op): + def init_adaptive(self): + self.adaptive = True + + if __name__ == '__main__': unittest.main() From cf06e50f1d2b4ceca197e41c2c17a71783c5bc04 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 3 Dec 2018 20:04:49 +0800 Subject: [PATCH 226/719] add doc for adaptive pool. test=develop --- paddle/fluid/operators/pool_op.cc | 39 ++++++++++++++++++++ paddle/fluid/operators/pool_with_index_op.cc | 11 ++++++ 2 files changed, 50 insertions(+) diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 11b5c49323..a2f5f811ab 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -277,6 +277,14 @@ Example: Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} $$ + For adaptive = true: + $$ + hstart = floor(i * H_{in} / H_{out}) + hend = ceil((i + 1) * H_{in} / H_{out}) + wstart = floor(j * W_{in} / W_{out}) + wend = ceil((j + 1) * W_{in} / W_{out}) + Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} + $$ )DOC"); } @@ -396,6 +404,37 @@ Example: H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 $$ + For exclusive = true: + $$ + dstart = i * strides[0] - paddings[0] + dend = dstart + ksize[0] + hstart = j * strides[1] - paddings[1] + hend = hstart + ksize[1] + wstart = k * strides[2] - paddings[2] + wend = wstart + ksize[2] + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} + $$ + For exclusive = false: + $$ + dstart = max(0, i * strides[0] - paddings[0]) + dend = min(D, dstart + ksize[0]) + hstart = max(0, j * strides[1] - paddings[1]) + hend = min(H, hstart + ksize[1]) + wstart = max(0, k * strides[2] - paddings[2]) + wend = min(W, wstart + ksize[2]) + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + $$ + + For adaptive = true: + $$ + dstart = floor(i * D_{in} / D_{out}) + dend = ceil((i + 1) * D_{in} / D_{out}) + hstart = floor(j * H_{in} / H_{out}) + hend = ceil((j + 1) * H_{in} / H_{out}) + wstart = floor(k * W_{in} / W_{out}) + wend = ceil((k + 1) * W_{in} / W_{out}) + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + $$ )DOC"); } diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index f9e25277e5..5354b485bd 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -182,6 +182,12 @@ Example: H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ + + For adaptive = true: + $$ + H_{out} = ksize[0] W_{out} = ksize[1] + $$ + )DOC"); } @@ -267,6 +273,11 @@ Example: H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 $$ + + For adaptive = true: + $$ + D_{out} = ksize[0] H_{out} = ksize[1] W_{out} = ksize[2] + $$ )DOC"); } From a81fabd3273ae0cba9988da612dd5241aeec823f Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 11 Dec 2018 13:59:56 +0800 Subject: [PATCH 227/719] fix doc errors. test=develop --- paddle/fluid/operators/math/pooling.cc | 70 +++---- paddle/fluid/operators/math/pooling.cu | 182 +++++++++--------- paddle/fluid/operators/math/pooling.h | 12 ++ python/paddle/fluid/layers/nn.py | 26 ++- .../fluid/tests/unittests/test_layers.py | 13 +- 5 files changed, 154 insertions(+), 149 deletions(-) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index b4ee82add3..30873e9f87 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -19,16 +19,6 @@ namespace paddle { namespace operators { namespace math { -static inline int ADAPT_START_INDEX(int ph, int input_size, int output_size) { - return static_cast( - floor(static_cast(ph * input_size) / output_size)); -} - -static inline int ADAPT_END_INDEX(int ph, int input_size, int output_size) { - return static_cast( - ceil(static_cast((ph + 1) * input_size) / output_size)); -} - /* * All tensors are in NCHW format. * Ksize, strides, paddings are two elements. These two elements represent @@ -67,8 +57,8 @@ class Pool2dFunctor { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -76,8 +66,8 @@ class Pool2dFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -144,8 +134,8 @@ class Pool2dGradFunctor { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -153,8 +143,8 @@ class Pool2dGradFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -319,8 +309,8 @@ class Pool3dFunctor { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); } else { dstart = pd * stride_depth - padding_depth; dend = std::min(dstart + ksize_depth, input_depth); @@ -328,8 +318,8 @@ class Pool3dFunctor { } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -337,8 +327,8 @@ class Pool3dFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -417,8 +407,8 @@ class Pool3dGradFunctor { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); } else { dstart = pd * stride_depth - padding_depth; dend = std::min(dstart + ksize_depth, input_depth); @@ -426,8 +416,8 @@ class Pool3dGradFunctor { } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -435,8 +425,8 @@ class Pool3dGradFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -615,8 +605,8 @@ class MaxPool2dWithIndexFunctor { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -624,8 +614,8 @@ class MaxPool2dWithIndexFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -753,8 +743,8 @@ class MaxPool3dWithIndexFunctor { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); } else { dstart = pd * stride_depth - padding_depth; dend = std::min(dstart + ksize_depth, input_depth); @@ -762,8 +752,8 @@ class MaxPool3dWithIndexFunctor { } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -771,8 +761,8 @@ class MaxPool3dWithIndexFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 5f3b82ed55..efce3f899a 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -21,18 +21,6 @@ namespace paddle { namespace operators { namespace math { -__device__ __forceinline__ int ADAPT_START_INDEX(int ph, int input_size, - int output_size) { - return static_cast( - floor(static_cast(ph * input_size) / output_size)); -} - -__device__ __forceinline__ int ADAPT_END_INDEX(int ph, int input_size, - int output_size) { - return static_cast( - ceil(static_cast((ph + 1) * input_size) / output_size)); -} - template __global__ void KernelPool2D(const int nthreads, const T* input_data, const int channels, const int input_height, @@ -52,11 +40,11 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, int hstart, hend; int wstart, wend; if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { hstart = ph * stride_height - padding_height; hend = min(hstart + ksize_height, input_height); @@ -91,28 +79,29 @@ __global__ void KernelPool2DGrad( PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width + padding_width; - int offsetH = (index / input_width) % input_height + padding_height; + int w_offset = index % input_width + padding_width; + int h_offset = (index / input_width) % input_height + padding_height; int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; int phstart, phend; int pwstart, pwend; if (adaptive) { - phstart = offsetH * output_height / input_height; + phstart = h_offset * output_height / input_height; phend = - min((offsetH + 1) * output_height / input_height + 1, output_height); - pwstart = offsetW * output_width / input_width; - pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); } else { - phstart = (offsetH < ksize_height) + phstart = (h_offset < ksize_height) ? 0 - : (offsetH - ksize_height) / stride_height + 1; - pwstart = (offsetW < ksize_width) + : (h_offset - ksize_height) / stride_height + 1; + pwstart = (w_offset < ksize_width) ? 0 - : (offsetW - ksize_width) / stride_width + 1; - phend = min(offsetH / stride_height + 1, output_height); - pwend = min(offsetW / stride_width + 1, output_width); + : (w_offset - ksize_width) / stride_width + 1; + phend = min(h_offset / stride_height + 1, output_height); + pwend = min(w_offset / stride_width + 1, output_width); } T gradient = 0; T input = input_data[index]; @@ -414,14 +403,14 @@ __global__ void KernelPool3D( int hstart, hend; int wstart, wend; if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { dstart = pd * stride_depth - padding_depth; hstart = ph * stride_height - padding_height; @@ -464,9 +453,9 @@ __global__ void KernelPool3DGrad( bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width + padding_width; - int offsetH = (index / input_width) % input_height + padding_height; - int offsetD = + int w_offset = index % input_width + padding_width; + int h_offset = (index / input_width) % input_height + padding_height; + int d_offset = (index / input_width / input_height) % input_depth + padding_depth; int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; @@ -475,26 +464,28 @@ __global__ void KernelPool3DGrad( int phstart, phend; int pwstart, pwend; if (adaptive) { - pdstart = offsetD * output_depth / input_depth; - pdend = min((offsetD + 1) * output_depth / input_depth + 1, output_depth); - phstart = offsetH * output_height / input_height; + pdstart = d_offset * output_depth / input_depth; + pdend = + min((d_offset + 1) * output_depth / input_depth + 1, output_depth); + phstart = h_offset * output_height / input_height; phend = - min((offsetH + 1) * output_height / input_height + 1, output_height); - pwstart = offsetW * output_width / input_width; - pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); } else { - pdstart = (offsetD < ksize_depth) + pdstart = (d_offset < ksize_depth) ? 0 - : (offsetD - ksize_depth) / stride_depth + 1; - phstart = (offsetH < ksize_height) + : (d_offset - ksize_depth) / stride_depth + 1; + phstart = (h_offset < ksize_height) ? 0 - : (offsetH - ksize_height) / stride_height + 1; - pwstart = (offsetW < ksize_width) + : (h_offset - ksize_height) / stride_height + 1; + pwstart = (w_offset < ksize_width) ? 0 - : (offsetW - ksize_width) / stride_width + 1; - pdend = min((offsetD) / stride_depth + 1, output_depth); - phend = min((offsetH) / stride_height + 1, output_height); - pwend = min((offsetW) / stride_width + 1, output_width); + : (w_offset - ksize_width) / stride_width + 1; + pdend = min((d_offset) / stride_depth + 1, output_depth); + phend = min((h_offset) / stride_height + 1, output_height); + pwend = min((w_offset) / stride_width + 1, output_width); } T gradient = 0; @@ -795,11 +786,11 @@ __global__ void KernelMaxPool2dWithIdx( int hstart, hend; int wstart, wend; if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { hstart = ph * stride_height - padding_height; hend = min(hstart + ksize_height, input_height); @@ -837,35 +828,36 @@ __global__ void KernelMaxPool2DWithIdxGrad( T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width; - int offsetH = (index / input_width) % input_height; + int w_offset = index % input_width; + int h_offset = (index / input_width) % input_height; int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; int phstart, phend; int pwstart, pwend; if (adaptive) { - phstart = offsetH * output_height / input_height; + phstart = h_offset * output_height / input_height; phend = - min((offsetH + 1) * output_height / input_height + 1, output_height); - pwstart = offsetW * output_width / input_width; - pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); } else { phstart = - (offsetH + padding_height < ksize_height) + (h_offset + padding_height < ksize_height) ? 0 - : (offsetH + padding_height - ksize_height) / stride_height + 1; + : (h_offset + padding_height - ksize_height) / stride_height + 1; pwstart = - (offsetW + padding_width < ksize_width) + (w_offset + padding_width < ksize_width) ? 0 - : (offsetW + padding_width - ksize_width) / stride_width + 1; + : (w_offset + padding_width - ksize_width) / stride_width + 1; phend = - min((offsetH + padding_height) / stride_height + 1, output_height); - pwend = min((offsetW + padding_width) / stride_width + 1, output_width); + min((h_offset + padding_height) / stride_height + 1, output_height); + pwend = min((w_offset + padding_width) / stride_width + 1, output_width); } T1 gradient = 0; - int input_current_featuremap_idx = offsetH * input_width + offsetW; + int input_current_featuremap_idx = h_offset * input_width + w_offset; int output_idx = (batch_idx * channels + offsetC) * output_height * output_width; @@ -1000,14 +992,14 @@ __global__ void KernelMaxPool3DWithIdx( int hstart, hend; int wstart, wend; if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { dstart = pd * stride_depth - padding_depth; hstart = ph * stride_height - padding_height; @@ -1051,9 +1043,9 @@ __global__ void KernelMaxPool3DWithIdxGrad( const int padding_width, bool adaptive, T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width; - int offsetH = (index / input_width) % input_height; - int offsetD = (index / input_width / input_height) % input_depth; + int w_offset = index % input_width; + int h_offset = (index / input_width) % input_height; + int d_offset = (index / input_width / input_height) % input_depth; int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; @@ -1061,35 +1053,37 @@ __global__ void KernelMaxPool3DWithIdxGrad( int phstart, phend; int pwstart, pwend; if (adaptive) { - pdstart = offsetD * output_depth / input_depth; - pdend = min((offsetD + 1) * output_depth / input_depth + 1, output_depth); - phstart = offsetH * output_height / input_height; + pdstart = d_offset * output_depth / input_depth; + pdend = + min((d_offset + 1) * output_depth / input_depth + 1, output_depth); + phstart = h_offset * output_height / input_height; phend = - min((offsetH + 1) * output_height / input_height + 1, output_height); - pwstart = offsetW * output_width / input_width; - pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); } else { pdstart = - (offsetD + padding_depth < ksize_depth) + (d_offset + padding_depth < ksize_depth) ? 0 - : (offsetD + padding_depth - ksize_depth) / stride_depth + 1; + : (d_offset + padding_depth - ksize_depth) / stride_depth + 1; phstart = - (offsetH + padding_height < ksize_height) + (h_offset + padding_height < ksize_height) ? 0 - : (offsetH + padding_height - ksize_height) / stride_height + 1; + : (h_offset + padding_height - ksize_height) / stride_height + 1; pwstart = - (offsetW + padding_width < ksize_width) + (w_offset + padding_width < ksize_width) ? 0 - : (offsetW + padding_width - ksize_width) / stride_width + 1; - pdend = min((offsetD + padding_depth) / stride_depth + 1, output_depth); + : (w_offset + padding_width - ksize_width) / stride_width + 1; + pdend = min((d_offset + padding_depth) / stride_depth + 1, output_depth); phend = - min((offsetH + padding_height) / stride_height + 1, output_height); - pwend = min((offsetW + padding_width) / stride_width + 1, output_width); + min((h_offset + padding_height) / stride_height + 1, output_height); + pwend = min((w_offset + padding_width) / stride_width + 1, output_width); } T1 gradient = 0; int input_current_feature_map_idx = - (offsetD * input_height + offsetH) * input_width + offsetW; + (d_offset * input_height + h_offset) * input_width + w_offset; int output_idx = (batch_idx * channels + offsetC) * output_depth * output_height * output_width; mask += output_idx; diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index d123af8924..e1f8e6df1d 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -68,6 +68,18 @@ class AvgPoolGrad { } }; +/* used for adaptive pool to calculate start and end index of each divided grid + */ +HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + /* * \brief Getting pooling results, and calculating gradient. * diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 61794f0d49..07fc4ccc6b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2506,7 +2506,7 @@ def adaptive_pool2d(input, pool_size, pool_type="max", require_index=False, - use_cudnn=True, + use_cudnn=False, name=None): """ ${comment} @@ -2521,7 +2521,7 @@ def adaptive_pool2d(input, pool_type: ${pooling_type_comment} require_index (bool): If true, the index of max pooling point along with outputs. it cannot be set in average pooling type. - use_cudnn (bool): ${use_cudnn_comment} + use_cudnn (bool, default False): adaptive pool currently not supported in cudnn. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2531,6 +2531,7 @@ def adaptive_pool2d(input, Raises: ValueError: 'pool_type' is not 'max' nor 'avg'. ValueError: 'use_cudnn' is not a bool value. + ValueError: adaptive pool currently not supported in cudnn. ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. ValueError: 'pool_size' should be a list or tuple with length as 2. @@ -2540,11 +2541,11 @@ def adaptive_pool2d(input, data = fluid.layers.data( name='data', shape=[3, 32, 32], dtype='float32') - conv2d = fluid.layers.pool2d( + pool_out = fluid.layers.adaptive_pool2d( input=data, pool_size=[3, 3], pool_type='max', - require_index=True) + require_index=False) """ if pool_type not in ["max", "avg"]: raise ValueError( @@ -2565,6 +2566,9 @@ def adaptive_pool2d(input, if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False.") + if use_cudnn: + raise ValueError("adaptive pool currently not supported in cudnn.") + if pool_type == "max": l_type = 'max_pool2d_with_index' else: @@ -2590,7 +2594,7 @@ def adaptive_pool2d(input, "adaptive": True, }) - return pool_out + return (pool_out, mask) if require_index else pool_out @templatedoc(op_type="pool3d") @@ -2598,7 +2602,7 @@ def adaptive_pool3d(input, pool_size, pool_type="max", require_index=False, - use_cudnn=True, + use_cudnn=False, name=None): """ ${comment} @@ -2613,7 +2617,7 @@ def adaptive_pool3d(input, pool_type: ${pooling_type_comment} require_index (bool): If true, the index of max pooling point along with outputs. it cannot be set in average pooling type. - use_cudnn (bool): ${use_cudnn_comment} + use_cudnn (bool, default False): adaptive pool currently not supported in cudnn. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2623,6 +2627,7 @@ def adaptive_pool3d(input, Raises: ValueError: 'pool_type' is not 'max' nor 'avg'. ValueError: 'use_cudnn' is not a bool value. + ValueError: adaptive pool currently not supported in cudnn. ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. ValueError: 'pool_size' should be a list or tuple with length as 2. @@ -2632,7 +2637,7 @@ def adaptive_pool3d(input, data = fluid.layers.data( name='data', shape=[3, 32, 32], dtype='float32') - conv2d = fluid.layers.pool2d( + pool_out, mask = fluid.layers.adaptive_pool3d( input=data, pool_size=[3, 3], pool_type='max', @@ -2657,6 +2662,9 @@ def adaptive_pool3d(input, if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False.") + if use_cudnn: + raise ValueError("adaptive pool currently not supported in cudnn.") + if pool_type == "max": l_type = 'max_pool3d_with_index' else: @@ -2682,7 +2690,7 @@ def adaptive_pool3d(input, "adaptive": True, }) - return pool_out + return (pool_out, mask) if require_index else pool_out def batch_norm(input, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 9785b5063c..030bf012fa 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -237,23 +237,24 @@ class TestBook(unittest.TestCase): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') - self.assertIsNotNone( - layers.adaptive_pool2d( - x, [3, 3], require_index=True)) self.assertIsNotNone( layers.adaptive_pool2d( x, [3, 3], pool_type='avg')) + pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True) + self.assertIsNotNone(pool) + self.assertIsNotNone(mask) def test_adaptive_pool3d(self): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 244, 224, 224], dtype='float32') - self.assertIsNotNone( - layers.adaptive_pool3d( - x, [3, 3, 3], require_index=True)) self.assertIsNotNone( layers.adaptive_pool3d( x, [3, 3, 3], pool_type='avg')) + pool, mask = layers.adaptive_pool3d( + x, [3, 3, 3], require_index=True) + self.assertIsNotNone(pool) + self.assertIsNotNone(mask) def test_lstm_unit(self): program = Program() From 1870262ba9a1f951e5181b7bbee1286d5bfa9dd3 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 11 Dec 2018 14:03:04 +0800 Subject: [PATCH 228/719] pserver should crash early whe has problem test=develop --- paddle/fluid/operators/distributed/brpc_client.cc | 2 +- paddle/fluid/operators/distributed/grpc_client.cc | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc index b394c678fb..350969f74b 100644 --- a/paddle/fluid/operators/distributed/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc_client.cc @@ -158,7 +158,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { std::shared_ptr c(new ChannelContext()); if (c->channel.Init(ep.c_str(), &options) != 0) { - LOG(ERROR) << "Fail to initialize channel"; + LOG(FATAL) << "Fail to initialize channel"; return nullptr; } diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 857214aa21..f14dfcdb23 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -390,8 +390,7 @@ void GRPCClient::Proceed() { VLOG(3) << c->GetVarHandlePtr()->String() << " process"; c->Process(); } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { - // FIXME(gongwb): parse error_details? - LOG(ERROR) << c->GetVarHandlePtr()->String() + LOG(FATAL) << c->GetVarHandlePtr()->String() << " meets grpc error, error_code:" << c->status_.error_code() << " error_message:" << c->status_.error_message() << " error_details:" << c->status_.error_details(); From 5e609069966e21e9db89d1dea80bf22b0d766e6a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 11 Dec 2018 14:22:22 +0800 Subject: [PATCH 229/719] Fix compile error test=develop --- paddle/fluid/platform/cuda_helper_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu index ee45afab93..466bf90c63 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -93,7 +93,7 @@ TEST(CudaAtomic, float16) { // unalignment of uint8 void TestUnalign(size_t num, const int shift_bit) { - PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2"); + ASSERT_EQ(num % 2, 0); float16 *in1, *in2, *out; float16 *d_in1, *d_in2; size_t size = sizeof(uint8_t) * (num + shift_bit); From 729684007d70cad38e9d34317748e3fedd477886 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Tue, 11 Dec 2018 14:44:02 +0800 Subject: [PATCH 230/719] stop server out of run from file --- python/paddle/fluid/async_executor.py | 14 +++++++------- python/paddle/fluid/distributed/ps_instance.py | 3 +-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 76fdb5b0e2..787a6a6b9e 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -86,7 +86,7 @@ class AsyncExecutor(object): scope = global_scope() self.executor = core.AsyncExecutor(scope, p) - self.instance = ps_instance.PaddlePSInstance("init_param", 1, 2) + self.instance = ps_instance.PaddlePSInstance(1, 2) def run(self, program, data_feed, filelist, thread_num, fetch, debug=False): """ @@ -151,10 +151,7 @@ class AsyncExecutor(object): self.executor.run_from_files(program_desc, data_feed.desc(), filelist, thread_num, fetch_var_names, debug) - self.instance.barrier_all() #worker do all things - if self.instance.is_first_worker(): - self.executor.stop_server() - self.instance.barrier_all() #sync + def config_distributed_nodes(self, dist_opt): @@ -167,8 +164,11 @@ class AsyncExecutor(object): def get_instance(self): return self.instance - #def stop_server(self): - # self.executor.stop_server() + def stop_server(self): + self.instance.barrier_all() #worker do all things + if self.instance.is_first_worker(): + self.executor.stop_server() + self.instance.barrier_all() #sync def init_server(self, dist_desc): self.executor.init_server(dist_desc, self.instance._rankid) diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index b4045327e1..94e123c2ce 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -5,9 +5,8 @@ import sys class PaddlePSInstance(object): - def __init__(self, init_param, server_worker_mode, proc_per_node): + def __init__(self, server_worker_mode, proc_per_node): self.dh = dist_helper.MPIHelper() - self._config = init_param self._rankid = self.dh.get_rank() self._server_worker_mode = server_worker_mode self._proc_per_node = proc_per_node From 900c789a35798cf73b67a2bb7b7944f3110c7bda Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 10 Dec 2018 12:00:46 +0000 Subject: [PATCH 231/719] use jitcode and use vmul --- paddle/fluid/operators/jit/gen/blas.cc | 26 ++++++++---- paddle/fluid/operators/jit/gen/blas.h | 22 +++++----- paddle/fluid/operators/jit/gen/jitcode.cc | 19 +-------- paddle/fluid/operators/jit/gen/jitcode.h | 10 ++--- paddle/fluid/operators/jit/gen_base.cc | 5 +++ paddle/fluid/operators/jit/gen_base.h | 51 +++++++++++++---------- paddle/fluid/operators/jit/kernel_pool.cc | 5 +++ paddle/fluid/operators/jit/kernel_pool.h | 49 ++++++++++++++++++---- paddle/fluid/operators/jit/registry.h | 25 ++++++++++- paddle/fluid/operators/jit/test.cc | 1 + 10 files changed, 137 insertions(+), 76 deletions(-) diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index 4a8b4554c8..3e5ce54064 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/blas.h" #include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -103,17 +104,24 @@ void VXXJitCode::genCode() { ret(); } -} // namespace gen - -template <> -std::unique_ptr CreateJitCode(int attr) { - if (UseJitCode(attr)) { - return make_unique( - attr, CodeSize(attr)); +class VMulCreator : public JitCodeCreator { + public: + bool UseMe(const int& attr) const override { + return platform::MayIUse(platform::avx); } - return nullptr; -} + size_t CodeSize(const int& d) const override { + return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; + } + std::unique_ptr CreateJitCode(const int& attr) const override { + return make_unique(attr, CodeSize(attr)); + } +}; +} // namespace gen } // namespace jit } // namespace operators } // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(vmul, gen::VMulCreator); diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h index edc05f86a0..60f3280567 100644 --- a/paddle/fluid/operators/jit/gen/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -25,7 +25,18 @@ namespace gen { // function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) class VXXJitCode : public JitCode { public: - const char* name() const override { + explicit VXXJitCode(int d, operand_type type, int scalar_index, + bool with_relu, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), + num_(d), + type_(type), + scalar_index_(scalar_index), + with_relu_(with_relu) { + this->genCode(); + } + + virtual const char* name() const { std::string base = "VXXJitCode"; if (scalar_index_ == 1) { base += "_Scalar"; @@ -45,15 +56,6 @@ class VXXJitCode : public JitCode { base += (with_relu_ ? "_Relu" : ""); return base.c_str(); } - explicit VXXJitCode(int d, operand_type type, int scalar_index, - bool with_relu, size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), - num_(d), - type_(type), - scalar_index_(scalar_index), - with_relu_(with_relu) {} - // static bool init(int d, int scalar_index = 0); void genCode() override; private: diff --git a/paddle/fluid/operators/jit/gen/jitcode.cc b/paddle/fluid/operators/jit/gen/jitcode.cc index 93204d340e..7aaf6a2ff6 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.cc +++ b/paddle/fluid/operators/jit/gen/jitcode.cc @@ -16,23 +16,6 @@ namespace paddle { namespace operators { -namespace jit { - -template <> -size_t GetKey(int d) { - return d; -} - -// template <> -// std::shared_ptr CreateJitCode(int attr) -// { -// if (UseJitCode(attr)) { -// return std::make_shared>(attr, -// CodeSize(attr))); -// } -// return nullptr; -// } - -} // namespace jit +namespace jit {} // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index 52b8da9a82..caa3ef9dda 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -70,9 +70,10 @@ typedef enum { class JitCode : public GenBase, public Xbyak::CodeGenerator { public: explicit JitCode(size_t code_size, void* code_ptr = nullptr) - : Xbyak::CodeGenerator(code_size, code_ptr) { - this->genCode(); - } + : Xbyak::CodeGenerator(code_size, code_ptr) {} + + virtual const char* name() const = 0; + virtual void genCode() = 0; size_t getSize() const override { return CodeGenerator::getSize(); } const unsigned char* getCodeInternal() override { @@ -80,9 +81,6 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator { return code; } - virtual const char* name() const = 0; - virtual void genCode() = 0; - protected: Xbyak::Reg64 param1{abi_param1}; const int EVEX_max_8b_offt = 0x200; diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index 310da0c76f..a8bf902963 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -23,6 +23,11 @@ namespace paddle { namespace operators { namespace jit { +template <> +size_t JitCodeKey(int d) { + return d; +} + // refer do not need useme, it would be the last one. void GenBase::dumpCode(const unsigned char* code) const { if (code) { diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index 4a136534dc..3b874cf2b0 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -15,9 +15,8 @@ #pragma once #include -#include // for shared_ptr +#include // for unique_ptr #include "paddle/fluid/operators/jit/kernel_base.h" -#include "paddle/fluid/platform/macros.h" DECLARE_bool(dump_jitcode); @@ -25,29 +24,12 @@ namespace paddle { namespace operators { namespace jit { -// TODO(TJ): make these functions as virtual of a class - -// Every JitCode should estimate the code size itself -template -size_t CodeSize(Attr attr) { - return 4096; -} - -// Every JitCode should have a condition when to use this JitCode -template -bool UseJitCode(Attr attr) { - return false; -} - -// Every JitCode should have a method to get the key from attribution -template -size_t GetKey(Attr attr); - class GenBase : public Kernel { public: + virtual ~GenBase() = default; virtual const char* name() const = 0; - virtual const unsigned char* getCodeInternal() = 0; virtual size_t getSize() const = 0; + virtual const unsigned char* getCodeInternal() = 0; template const FUNC getCode() { const unsigned char* code = this->getCodeInternal(); @@ -61,8 +43,31 @@ class GenBase : public Kernel { void dumpCode(const unsigned char* code) const; }; -template -std::unique_ptr CreateJitCode(Attr attr); +// Every JitCode should have a method to get the key from attribution +template +size_t JitCodeKey(Attr attr); + +// Creator is used to creat the jitcode and save in pool. +// Every JitCode should have one creator. +class GenCreator { + public: + virtual ~GenCreator() = default; +}; + +template +class JitCodeCreator : public GenCreator { + public: + virtual ~JitCodeCreator() = default; + + // condition when this jit code can be used. + virtual bool UseMe(const Attr& attr) const = 0; + + // estimate this code size + virtual size_t CodeSize(const Attr& attr) const = 0; + + // create this code + virtual std::unique_ptr CreateJitCode(const Attr& attr) const = 0; +}; } // namespace jit } // namespace operators diff --git a/paddle/fluid/operators/jit/kernel_pool.cc b/paddle/fluid/operators/jit/kernel_pool.cc index f300d28a6f..bc98c644fb 100644 --- a/paddle/fluid/operators/jit/kernel_pool.cc +++ b/paddle/fluid/operators/jit/kernel_pool.cc @@ -21,6 +21,11 @@ namespace paddle { namespace operators { namespace jit { +JitCodeCreatorPool& JitCodeCreatorPool::Instance() { + static JitCodeCreatorPool g_creator_pool; + return g_creator_pool; +} + KernelPool& KernelPool::Instance() { static KernelPool g_kernel_pool; return g_kernel_pool; diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h index 737b7f60e3..c9e7fc84e5 100644 --- a/paddle/fluid/operators/jit/kernel_pool.h +++ b/paddle/fluid/operators/jit/kernel_pool.h @@ -14,7 +14,7 @@ #pragma once -#include // for shared_ptr +#include // for unique_ptr #include #include #include @@ -52,6 +52,28 @@ class JitCodePool { DISABLE_COPY_AND_ASSIGN(JitCodePool); }; +class JitCodeCreatorPool { + typedef std::unique_ptr GenCreatorPtr; + typedef std::unordered_map, + KernelKey::Hash> + GenCreatorPtrMap; + + public: + JitCodeCreatorPool() = default; + static JitCodeCreatorPool& Instance(); + GenCreatorPtrMap& AllCreators() { return creators_; } + void Insert(const KernelKey& key, GenCreatorPtr value) { + if (creators_.find(key) == creators_.end()) { + creators_.emplace(key, std::vector()); + } + creators_.at(key).emplace_back(std::move(value)); + } + + private: + GenCreatorPtrMap creators_; + DISABLE_COPY_AND_ASSIGN(JitCodeCreatorPool); +}; + typedef std::unique_ptr KernelPtr; typedef std::unordered_map, KernelKey::Hash> KernelMap; @@ -113,24 +135,33 @@ inline Func GetRefer() { template const Func Get(Attr attr) { - size_t key = GetKey(attr); + size_t key = JitCodeKey(attr); auto& codes = JitCodePool().Instance(); if (codes.Has(key)) { return codes.AllKernels().at(key)->template getCode(); } + KernelKey kkey(KT, PlaceType()); if (std::is_same::value) { - auto p = CreateJitCode(attr); - if (p) { - auto f = p->template getCode(); - codes.Insert(key, std::move(p)); - return f; + // pool: (KernelKey(type, place), vector) + auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); + auto iter = creator_map.find(kkey); + auto& creators = iter->second; + for (auto& cur : creators) { + auto i = dynamic_cast*>(cur.get()); + if (i && i->UseMe(attr)) { + auto p = i->CreateJitCode(attr); + if (p) { + auto f = p->template getCode(); + codes.Insert(key, std::move(p)); + return f; + } + } } } - // pool: (KernelKey(type, place), vector) + // pool: (KernelKey(type, place), vector) auto& pool = KernelPool().Instance().AllKernels(); - KernelKey kkey(KT, PlaceType()); auto iter = pool.find(kkey); if (iter != pool.end()) { auto& impls = iter->second; diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h index c1f02d9cd5..cb32c48720 100644 --- a/paddle/fluid/operators/jit/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -116,7 +116,30 @@ class JitKernelRegistrar { #define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \ REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) -// REGISTER_JITKERNEL_JITCODE(vmul, JitKernelCode); +#define REGISTER_JITKERNEL_GEN(kernel_type, ...) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ + "REGISTER_JITKERNEL_GEN must be called in global namespace"); \ + extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \ + TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static ::paddle::operators::jit::JitKernelRegistrar< \ + ::paddle::operators::jit::JitCodeCreatorPool, \ + ::paddle::platform::CPUPlace, __VA_ARGS__> \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \ + ::paddle::operators::jit::KernelType::kernel_type); \ + int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \ + return 0; \ + } + +#define USE_JITKERNEL_GEN(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ + "USE_JITKERNEL_GEN must be called in global namespace"); \ + extern int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \ + static int use_jitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \ + TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() #define USE_JITKERNEL_REFER(kernel_type) \ STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 836b6eee80..5af9ed697d 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -61,6 +61,7 @@ void ExpectEQ(const T* target, const T* refer, int n) { // TODO(TJ): remove me USE_JITKERNEL_MORE(vmul, mkl); USE_JITKERNEL_REFER(vmul); +USE_JITKERNEL_GEN(vmul); TEST(JitKernel, vmul) { using T = float; From d4cab7d94890c3bf43d20243ea9f21722b2738a3 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 11 Dec 2018 07:30:53 +0000 Subject: [PATCH 232/719] use jitkernel in one file --- paddle/fluid/operators/jit/CMakeLists.txt | 13 +-- paddle/fluid/operators/jit/gen/CMakeLists.txt | 7 ++ paddle/fluid/operators/jit/gen/jitcode.cc | 21 ---- paddle/fluid/operators/jit/helper.h | 96 +++++++++++++++++++ paddle/fluid/operators/jit/kernel_base.h | 2 +- paddle/fluid/operators/jit/kernel_pool.h | 63 ------------ .../fluid/operators/jit/more/CMakeLists.txt | 3 + .../operators/jit/more/mkl/CMakeLists.txt | 3 + paddle/fluid/operators/jit/more/mkl/mkl.h | 4 +- .../fluid/operators/jit/refer/CMakeLists.txt | 7 ++ paddle/fluid/operators/jit/refer/refer.h | 4 +- paddle/fluid/operators/jit/test.cc | 18 +--- 12 files changed, 133 insertions(+), 108 deletions(-) delete mode 100644 paddle/fluid/operators/jit/gen/jitcode.cc create mode 100644 paddle/fluid/operators/jit/helper.h diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 77fd27666f..26903e0e44 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -1,16 +1,17 @@ -# set(use_jit_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/jit/kernels.h) -# file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n") -# file(APPEND ${pass_file} "\#pragma once\n") -# file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") - +set(jit_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/jit/kernels.h) +file(WRITE ${jit_file} "// Generated by the paddle/fluid/operators/jit/CMakeLists.txt. DO NOT EDIT!\n\n") +file(APPEND ${jit_file} "\#pragma once\n") +file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n") +file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n") set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -list(REMOVE_ITEM jit_kernel_cc_srcs jit_test.cc) +list(REMOVE_ITEM jit_kernel_cc_srcs test.cc) cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) +# refer must go first add_subdirectory(refer) add_subdirectory(more) if(WITH_XBYAK) diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index c678ea33b8..98d9231faa 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -3,3 +3,10 @@ file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE) + +function(USE_JITKERNEL_GEN TARGET) + file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n") +endfunction() + +# use gen jitcode kernel by name +USE_JITKERNEL_GEN(vmul) diff --git a/paddle/fluid/operators/jit/gen/jitcode.cc b/paddle/fluid/operators/jit/gen/jitcode.cc deleted file mode 100644 index 7aaf6a2ff6..0000000000 --- a/paddle/fluid/operators/jit/gen/jitcode.cc +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "paddle/fluid/operators/jit/gen/jitcode.h" - -namespace paddle { -namespace operators { -namespace jit {} // namespace jit -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h new file mode 100644 index 0000000000..c8da960a1e --- /dev/null +++ b/paddle/fluid/operators/jit/helper.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include // for unique_ptr +#include +#include +#include +#include "paddle/fluid/operators/jit/gen_base.h" +#include "paddle/fluid/operators/jit/kernel_base.h" +#include "paddle/fluid/operators/jit/kernel_key.h" +#include "paddle/fluid/operators/jit/kernel_pool.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace jit { + +// Refer code do not related with attr, and always on CPUPlace +template +inline Func GetRefer() { + auto& ref_pool = ReferKernelPool().Instance().AllKernels(); + KernelKey kkey(KT, platform::CPUPlace()); + auto ref_iter = ref_pool.find(kkey); + PADDLE_ENFORCE(ref_iter != ref_pool.end(), + "Every Kernel should have reference function."); + auto& ref_impls = ref_iter->second; + for (auto& impl : ref_impls) { + auto i = dynamic_cast*>(impl.get()); + if (i) { + return i->GetFunc(); + } + } + return nullptr; +} + +template +const Func Get(Attr attr) { + size_t key = JitCodeKey(attr); + auto& codes = JitCodePool().Instance(); + if (codes.Has(key)) { + return codes.AllKernels().at(key)->template getCode(); + } + + KernelKey kkey(KT, PlaceType()); + if (std::is_same::value) { + // pool: (KernelKey(type, place), vector) + auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); + auto iter = creator_map.find(kkey); + auto& creators = iter->second; + for (auto& cur : creators) { + auto i = dynamic_cast*>(cur.get()); + if (i && i->UseMe(attr)) { + auto p = i->CreateJitCode(attr); + if (p) { + auto f = p->template getCode(); + codes.Insert(key, std::move(p)); + return f; + } + } + } + } + + // pool: (KernelKey(type, place), vector) + auto& pool = KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast*>(impl.get()); + if (i && i->UseMe(attr)) { + return i->GetFunc(); + } + } + } + + // The last implementation should be reference function on CPUPlace. + return GetRefer(); +} + +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 6a789c52c3..df7be6ab8e 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -22,7 +22,7 @@ namespace jit { typedef enum { vmul = 0, vadd = 1, vsub, vexp } KernelType; template -struct VMulTypes { +struct VMulTuples { typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, const T*, T*, int); diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h index c9e7fc84e5..3e15242af2 100644 --- a/paddle/fluid/operators/jit/kernel_pool.h +++ b/paddle/fluid/operators/jit/kernel_pool.h @@ -114,69 +114,6 @@ class ReferKernelPool { DISABLE_COPY_AND_ASSIGN(ReferKernelPool); }; -// Refer code do not related with attr, and always on CPUPlace -template -inline Func GetRefer() { - auto& ref_pool = ReferKernelPool().Instance().AllKernels(); - KernelKey kkey(KT, platform::CPUPlace()); - auto ref_iter = ref_pool.find(kkey); - PADDLE_ENFORCE(ref_iter != ref_pool.end(), - "Every Kernel should have reference function."); - auto& ref_impls = ref_iter->second; - for (auto& impl : ref_impls) { - auto i = dynamic_cast*>(impl.get()); - if (i) { - return i->GetFunc(); - } - } - return nullptr; -} - -template -const Func Get(Attr attr) { - size_t key = JitCodeKey(attr); - auto& codes = JitCodePool().Instance(); - if (codes.Has(key)) { - return codes.AllKernels().at(key)->template getCode(); - } - - KernelKey kkey(KT, PlaceType()); - if (std::is_same::value) { - // pool: (KernelKey(type, place), vector) - auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); - auto iter = creator_map.find(kkey); - auto& creators = iter->second; - for (auto& cur : creators) { - auto i = dynamic_cast*>(cur.get()); - if (i && i->UseMe(attr)) { - auto p = i->CreateJitCode(attr); - if (p) { - auto f = p->template getCode(); - codes.Insert(key, std::move(p)); - return f; - } - } - } - } - - // pool: (KernelKey(type, place), vector) - auto& pool = KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); - if (i && i->UseMe(attr)) { - return i->GetFunc(); - } - } - } - - // The last implementation should be reference function on CPUPlace. - return GetRefer(); -} - } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/more/CMakeLists.txt b/paddle/fluid/operators/jit/more/CMakeLists.txt index 84f1811ced..5bb78b9304 100644 --- a/paddle/fluid/operators/jit/more/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/CMakeLists.txt @@ -1,4 +1,7 @@ +function(USE_JITKERNEL_MORE TARGET TYPE) + file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n") +endfunction() if(WITH_MKLML) add_subdirectory(mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index 94d2487866..0c15c7060d 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -1,3 +1,6 @@ cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE) + +# use mkl kernels by name and type +USE_JITKERNEL_MORE(vmul, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 45cfec1c47..c0f738cceb 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -28,8 +28,8 @@ template void VMul(const T* x, const T* y, T* z, int n); template -class VMulKernel : public KernelImpl::func_type, - typename VMulTypes::attr_type> { +class VMulKernel : public KernelImpl::func_type, + typename VMulTuples::attr_type> { public: VMulKernel() { this->func = VMul; } bool UseMe(int d) const override { diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index 8c116e42dc..b6ff80d03d 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -1,3 +1,10 @@ cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE) + +function(USE_JITKERNEL_REFER TARGET) + file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n") +endfunction() + +# use refer kernel by name +USE_JITKERNEL_REFER(vmul) diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 76a663633d..97aa5de8fc 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -29,8 +29,8 @@ void VMul(const T* x, const T* y, T* z, int n) { } template -class VMulKernel : public ReferKernel::func_type, - typename VMulTypes::attr_type> { +class VMulKernel : public ReferKernel::func_type, + typename VMulTuples::attr_type> { public: VMulKernel() { this->func = VMul; } }; diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 5af9ed697d..e531ba1a2c 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -19,10 +19,7 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/operators/jit/kernel_pool.h" -// TODO(TJ): remove me -#include "paddle/fluid/operators/jit/registry.h" - +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/port.h" @@ -58,11 +55,6 @@ void ExpectEQ(const T* target, const T* refer, int n) { } } -// TODO(TJ): remove me -USE_JITKERNEL_MORE(vmul, mkl); -USE_JITKERNEL_REFER(vmul); -USE_JITKERNEL_GEN(vmul); - TEST(JitKernel, vmul) { using T = float; using PlaceType = paddle::platform::CPUPlace; @@ -70,10 +62,10 @@ TEST(JitKernel, vmul) { namespace jit = paddle::operators::jit; // TODO(TJ): test more vector size for (int d = 1; d < 30; ++d) { - auto ref = jit::GetRefer::func_type, - jit::VMulTypes::attr_type>(); - auto tgt = jit::Get::func_type, - jit::VMulTypes::attr_type, PlaceType>(d); + auto ref = jit::GetRefer::func_type, + jit::VMulTuples::attr_type>(); + auto tgt = jit::Get::func_type, + jit::VMulTuples::attr_type, PlaceType>(d); EXPECT_TRUE(ref != nullptr); EXPECT_TRUE(tgt != nullptr); From 9f8d86858ea6156422c1cf2c59fdb516cb3f7868 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 11 Dec 2018 10:43:50 +0800 Subject: [PATCH 233/719] Revert data_type test=develop --- paddle/fluid/framework/data_type.cc | 15 +++++++-------- paddle/fluid/framework/parallel_executor.cc | 3 +++ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 1c29a89bff..28f3da88fa 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -14,7 +14,6 @@ #include "paddle/fluid/framework/data_type.h" #include -#include #include #include @@ -24,10 +23,10 @@ namespace paddle { namespace framework { struct DataTypeMap { - std::map cpp_to_proto_; + std::unordered_map cpp_to_proto_; std::unordered_map proto_to_cpp_; std::unordered_map proto_to_str_; - std::map cpp_to_size_; + std::unordered_map cpp_to_size_; }; static DataTypeMap* InitDataTypeMap(); @@ -44,9 +43,9 @@ static inline void RegisterType(DataTypeMap* map, proto::VarType::Type proto_type, const std::string& name) { map->proto_to_cpp_.emplace(static_cast(proto_type), typeid(T)); - map->cpp_to_proto_.emplace(typeid(T).name(), proto_type); + map->cpp_to_proto_.emplace(typeid(T), proto_type); map->proto_to_str_.emplace(static_cast(proto_type), name); - map->cpp_to_size_.emplace(typeid(T).name(), sizeof(T)); + map->cpp_to_size_.emplace(typeid(T), sizeof(T)); } static DataTypeMap* InitDataTypeMap() { @@ -72,7 +71,7 @@ static DataTypeMap* InitDataTypeMap() { } proto::VarType::Type ToDataType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_proto_.find(type.name()); + auto it = gDataTypeMap().cpp_to_proto_.find(type); if (it != gDataTypeMap().cpp_to_proto_.end()) { return it->second; } @@ -98,8 +97,8 @@ std::string DataTypeToString(const proto::VarType::Type type) { } size_t SizeOfType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_size_.find(type.name()); - if (LIKELY(it != gDataTypeMap().cpp_to_size_.end())) { + auto it = gDataTypeMap().cpp_to_size_.find(type); + if (it != gDataTypeMap().cpp_to_size_.end()) { return it->second; } PADDLE_THROW("Not support %s as tensor type", type.name()); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 0636b89048..28a4b14b27 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -39,8 +39,11 @@ DEFINE_string(pe_profile_fname, "", namespace paddle { namespace framework { + static std::once_flag gProfileOnce; +#ifdef WITH_GPERFTOOLS static bool gProfileStarted = false; +#endif class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) From 92daace55ca25aa9091d402560c90654c0cf183d Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Mon, 10 Dec 2018 15:03:04 +0100 Subject: [PATCH 234/719] MKL-DNN Concat: Fix segfault related to referencing deleter memory primitive test=develop --- paddle/fluid/operators/concat_mkldnn_op.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/concat_mkldnn_op.cc b/paddle/fluid/operators/concat_mkldnn_op.cc index b8456aac9d..7ad674056f 100644 --- a/paddle/fluid/operators/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/concat_mkldnn_op.cc @@ -80,8 +80,8 @@ class ConcatPrimitiveFactory { concat CreateConcatPrimitive(const concat::primitive_desc& concat_pd, Tensor* output, platform::CPUPlace place) { CreateSourcePrimitiveAts(); - auto dst_mem = CreateDstMemory(concat_pd, output, place); - return concat(concat_pd, inputs, dst_mem); + dst_mem = CreateDstMemory(concat_pd, output, place); + return concat(concat_pd, inputs, dst_mem.get()); } private: @@ -118,7 +118,8 @@ class ConcatPrimitiveFactory { std::vector srcs_pd; std::vector srcs; std::vector inputs; -}; + boost::optional dst_mem; // TODO(mgallus): change to std::optional +}; // upon introduction of C++17 to paddle template class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { From 57ac412b98990ac1d946ad32de30b07a15d0a18f Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Tue, 11 Dec 2018 17:48:25 +0800 Subject: [PATCH 235/719] download data --- python/paddle/fluid/async_executor.py | 22 ++++++++++++++++++- python/paddle/fluid/contrib/utils/__init__.py | 4 ++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 787a6a6b9e..cce7ec5cca 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -25,6 +25,7 @@ from google.protobuf import text_format from . import io from .data_feed_desc import DataFeedDesc from .distributed import ps_instance +from .contrib.utils import hdfs_utils as hdfs __all__ = ['AsyncExecutor'] @@ -152,6 +153,22 @@ class AsyncExecutor(object): data_feed.desc(), filelist, thread_num, fetch_var_names, debug) + def download_data(self, afs_path, local_path, fs_default_name, ugi, process_num=12): + hadoop_home = "$HADOOP_HOME" + + configs = { + "fs.default.name": fs_default_name, + "hadoop.job.ugi": ugi + } + + client = hdfs.HDFSClient(hadoop_home, configs) + downloads = hdfs.multi_download( + client, + afs_path, + local_path, + self.instance.get_worker_index(), + self.instance.get_node_cnt() / 2, + multi_processes=process_num) def config_distributed_nodes(self, dist_opt): @@ -179,10 +196,11 @@ class AsyncExecutor(object): self.executor.gather_servers(ips, self.instance.get_node_cnt()) self.instance.barrier_all() #wait all worker start self.instance.barrier_all() #wait init model + self.instance.barrier_all() #wait for download_data self.instance.barrier_all() #wait worker do all things self.instance.barrier_all() #sync - def init_worker(self, dist_desc): + def init_worker(self, dist_desc, afs_path, local_path, fs_default_name, ugi): self.instance.barrier_all() #wait all server start ips = self.instance.gather_ips() self.executor.init_worker(dist_desc, ips, self.instance.get_node_cnt(), self.instance._rankid) @@ -190,6 +208,8 @@ class AsyncExecutor(object): if self.instance.is_first_worker(): self.executor.init_model() self.instance.barrier_all() #wait init model + self.download_data(afs_path, local_path, fs_default_name, ugi, process_num=12) + self.instance.barrier_all() #wait for download_data def init_model(self): self.executor.init_model() diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/fluid/contrib/utils/__init__.py index 6e479bdc2b..2fe9f702f3 100644 --- a/python/paddle/fluid/contrib/utils/__init__.py +++ b/python/paddle/fluid/contrib/utils/__init__.py @@ -13,8 +13,8 @@ # limitations under the License. from __future__ import print_function -from . import lookup_table_utils -from .lookup_table_utils import * +#from . import lookup_table_utils +#from .lookup_table_utils import * from . import hdfs_utils from .hdfs_utils import * From 570338699b2038b802e9d49ea80efc916416477a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 11 Dec 2018 18:29:16 +0800 Subject: [PATCH 236/719] Add debug info --- .../details/computation_op_handle.cc | 45 ++++- .../fast_threaded_ssa_graph_executor.cc | 1 + .../fluid/framework/details/op_handle_base.cc | 2 +- paddle/fluid/framework/operator.cc | 160 +++++++++++------- paddle/fluid/framework/scope.cc | 37 ++-- .../operators/elementwise/elementwise_op.h | 69 ++++---- paddle/fluid/operators/optimizers/adam_op.cc | 79 ++++----- python/paddle/fluid/profiler.py | 3 +- 8 files changed, 239 insertions(+), 157 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7ad1e40c60..9003033438 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -26,17 +26,46 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, scope_(scope), place_(place) {} +struct RecordTime { + RecordTime(const std::string &name, const std::string &type) + : name_(name), type_(type), start_(std::chrono::system_clock::now()) {} + + ~RecordTime() { + if (type_ == "elementsize_add") { + end_ = std::chrono::system_clock::now(); + std::chrono::duration diff = end_ - start_; + VLOG(1) << name_ << " " << type_ << " time record: " << diff.count(); + } + } + + std::string name_; + std::string type_; + std::chrono::system_clock::time_point start_; + std::chrono::system_clock::time_point end_; +}; + void ComputationOpHandle::RunImpl() { - WaitInputVarGenerated(place_); + { + RecordTime rt("ComputationOpHandle::RunImpl", "Wait"); + WaitInputVarGenerated(place_); + } + + Scope *scope = nullptr; + { + RecordTime rt("ComputationOpHandle::RunImpl", "PrepareScope"); + scope = scope_->FindVar(kLocalExecScopeName)->Get(); + } + + { + RecordTime rt("ComputationOpHandle::RunImpl", "ReallyRun " + op_->Type()); - auto run_func = [this]() { - op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); - }; + auto run_func = [this, scope]() { op_->Run(*scope, place_); }; - if (is_lock_and_record_event_free_) { - run_func(); - } else { - this->RunAndRecordEvent(run_func); + if (is_lock_and_record_event_free_) { + run_func(); + } else { + this->RunAndRecordEvent(run_func); + } } } diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 949510e037..872bc5d654 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( ClearFetchOp(graph_.get(), &fetch_ops); return fetches; } + void FastThreadedSSAGraphExecutor::RunOpAsync( std::unordered_map> *op_deps, OpHandleBase *op, diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4822627ac3..5997f12ffa 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -41,7 +41,7 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA - if (events_.empty() && use_cuda) { + if (events_.empty() && use_cuda && !dev_ctxes_.empty()) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; PADDLE_ENFORCE(cudaSetDevice(dev_id)); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c6f3254e9f..b8adce4edf 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -701,85 +701,125 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, this->InferShape(&infer_shape_ctx); } +struct RecordTime { + RecordTime(const std::string& name, const std::string& type) + : name_(name), type_(type), start_(std::chrono::system_clock::now()) {} + + void inline stop() { + end_ = std::chrono::system_clock::now(); + std::chrono::duration diff = end_ - start_; + VLOG(1) << name_ << " " << type_ << " time record: " << diff.count(); + } + + ~RecordTime() { + if (type_ == "elementwise_add") { + stop(); + } + // stop(); + } + + std::string name_; + std::string type_; + std::chrono::system_clock::time_point start_; + std::chrono::system_clock::time_point end_; +}; + void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - RuntimeInferShapeContext infer_shape_ctx(*this, scope); - this->InferShape(&infer_shape_ctx); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - - // check if op[type] has kernel registered. - auto& all_op_kernels = AllOpKernels(); - auto kernels_iter = all_op_kernels.find(type_); - if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW( - "There are no kernels which are registered in the %s operator.", type_); + RecordTime rt("OperatorWithKernel::All", type_); + { + RecordTime rt("OperatorWithKernel::InferShape", type_); + RuntimeInferShapeContext infer_shape_ctx(*this, scope); + this->InferShape(&infer_shape_ctx); } - OpKernelMap& kernels = kernels_iter->second; + { + RecordTime* rt_1 = new RecordTime("OperatorWithKernel::Compute1", type_); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); - // TODO(dzhwinter) : kernel fallback mechanism will be added when all the - // transform functions are ready. + // check if op[type] has kernel registered. + auto& all_op_kernels = AllOpKernels(); + auto kernels_iter = all_op_kernels.find(type_); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", + type_); + } - // for (auto& candidate : kKernelPriority) { - // Do selection - // } + OpKernelMap& kernels = kernels_iter->second; - auto expected_kernel_key = - this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + // TODO(dzhwinter) : kernel fallback mechanism will be added when all the + // transform functions are ready. - auto kernel_iter = kernels.find(expected_kernel_key); + // for (auto& candidate : kKernelPriority) { + // Do selection + // } + + auto expected_kernel_key = + this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_MKLDNN - // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set - if (kernel_iter == kernels.end() && - expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { - VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; - expected_kernel_key.library_type_ = LibraryType::kPlain; - expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; - kernel_iter = kernels.find(expected_kernel_key); - } + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = LibraryType::kPlain; + expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } #endif - if (kernel_iter == kernels.end()) { - PADDLE_THROW("op %s does not have kernel for %s", type_, - KernelTypeToString(expected_kernel_key)); - } + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", type_, + KernelTypeToString(expected_kernel_key)); + } - // do data transformScope &transfer_scope; - std::vector transfered_inplace_vars; - auto* transfer_scope = - TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); + // do data transformScope &transfer_scope; + std::vector transfered_inplace_vars; + Scope* transfer_scope = nullptr; + // auto* transfer_scope = + // TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); - // exec scope is the scope that kernel actually executed on. - const Scope& exec_scope = - (transfer_scope == nullptr ? scope : *transfer_scope); + // exec scope is the scope that kernel actually executed on. + const Scope& exec_scope = scope; + // const Scope& exec_scope = + // (transfer_scope == nullptr ? scope : *transfer_scope); - if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { - dev_ctx = pool.Get(expected_kernel_key.place_); - } + if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { + dev_ctx = pool.Get(expected_kernel_key.place_); + } + delete rt_1; - kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); + RecordTime* rt_2 = new RecordTime("OperatorWithKernel::Compute2", type_); + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); + delete rt_2; - if (!transfered_inplace_vars.empty()) { - // there is inplace variable has been transfered. - TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); - } + RecordTime* rt_3 = new RecordTime("OperatorWithKernel::Compute3", type_); + if (!transfered_inplace_vars.empty()) { + // there is inplace variable has been transfered. + TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); + } - /*For profiling/benchmark only*/ - if (FLAGS_benchmark) { - dev_ctx->Wait(); - } + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + dev_ctx->Wait(); + } - if (FLAGS_check_nan_inf) { - for (auto& vname : OutputVars(true)) { - auto* var = exec_scope.FindVar(vname); - if (var == nullptr) continue; - if (var->IsType()) { - CheckTensorNANOrInf(vname, var->Get()); - } else if (var->IsType()) { - CheckTensorNANOrInf(vname, var->Get().value()); + if (FLAGS_check_nan_inf) { + for (auto& vname : OutputVars(true)) { + auto* var = exec_scope.FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } else if (var->IsType()) { + CheckTensorNANOrInf(vname, + var->Get().value()); + } } } + delete rt_3; } } void OperatorWithKernel::TransferInplaceVarsBack( diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0d261dd7cc..61416676d6 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -43,9 +43,16 @@ DEFINE_double( // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. #ifdef PADDLE_ON_INFERENCE -#define SCOPE_LOCK_GUARD +#define SCOPE_READER_LOCK +#define SCOPE_WRITER_LOCK #else -#define SCOPE_LOCK_GUARD std::lock_guard lock(mutex_); +// TODO(minqiyang): use reader lock and writer lock in all platforms +#define SCOPE_READER_LOCK +#define SCOPE_WRITER_LOCK +// #define SCOPE_READER_LOCK boost::shared_lock +// lock(mutex_); +// #define SCOPE_WRITER_LOCK boost::unique_lock +// lock(mutex_); #endif namespace paddle { @@ -61,18 +68,18 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK return VarInternal(name); } Variable* Scope::Var(std::string* name) { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -81,34 +88,34 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK return FindScopeInternal(var); } void Scope::DropKids() { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -118,7 +125,7 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) const { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", this, scope); @@ -132,7 +139,7 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -145,12 +152,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 87bf7c6b15..181baac870 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -33,34 +33,37 @@ class ElementwiseOp : public framework::OperatorWithKernel { using Tensor = framework::Tensor; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), - "Input(Y) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of elementwise op should not be null."); - - PADDLE_ENFORCE( - ctx->GetInputsVarType("Y").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s [%s]", - ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front()); - - if (ctx->GetInputsVarType("X").front() == - framework::proto::VarType::LOD_TENSOR) { - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input."); - } else if (ctx->GetInputsVarType("X").front() == - framework::proto::VarType::SELECTED_ROWS) { - PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && - (ctx->GetInputDim("Y")[0] == 1), - "For elementwise_op, if X is Sparse, " - "Y must be scalar."); - } else { - PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", - ctx->GetInputsVarType("X").front()); + if (!ctx->IsRuntime()) { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of elementwise op should not be null."); + + PADDLE_ENFORCE(ctx->GetInputsVarType("Y").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the " + "received is %s [%s]", + ctx->GetInputsVarType("Y").front(), + ctx->Inputs("Y").front()); + + if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::LOD_TENSOR) { + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input."); + } else if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::SELECTED_ROWS) { + PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && + (ctx->GetInputDim("Y")[0] == 1), + "For elementwise_op, if X is Sparse, " + "Y must be scalar."); + } else { + PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", + ctx->GetInputsVarType("X").front()); + } } ctx->ShareDim("X", /*->*/ "Out"); @@ -125,7 +128,7 @@ The equation is: $$%s$$ -- $X$: a tensor of any dimension. +- $X$: a tensor of any dimension. - $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$. There are two cases for this operator: @@ -135,10 +138,10 @@ There are two cases for this operator: For case 2: -1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index - for broadcasting $Y$ onto $X$. +1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index + for broadcasting $Y$ onto $X$. 2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$. -3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of +3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of subsequence, such as shape(Y) = (2, 1) => (2). For example: @@ -152,7 +155,7 @@ For example: shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 -The inputs $X$ and $Y$ can carry the different LoD information. +The inputs $X$ and $Y$ can carry the different LoD information. But the output only shares the LoD information with the input $X$. )DOC", diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 5710cda39a..bc1b20321f 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -23,56 +23,57 @@ class AdamOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(Param) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(Grad) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Moment1"), - "Input(Moment1) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Moment2"), - "Input(Moment2) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - "Input(LearningRate) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), - "Input(Beta1Pow) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), - "Input(Beta2Pow) of AdamOp should not be null."); - - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(ParamOut) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), - "Output(Moment1Out) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), - "Output(Moment2Out) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Param"), + // "Input(Param) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Grad"), + // "Input(Grad) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Moment1"), + // "Input(Moment1) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Moment2"), + // "Input(Moment2) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + // "Input(LearningRate) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + // "Input(Beta1Pow) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), + // "Input(Beta2Pow) of AdamOp should not be null."); + + // PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + // "Output(ParamOut) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), + // "Output(Moment1Out) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), + // "Output(Moment2Out) of AdamOp should not be null."); auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "Learning rate should have 1 dimension"); + // PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + // "Learning rate should have 1 dimension"); auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, - "Beta1 power accumulator should have 1 dimension"); + // PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + // "Beta1 power accumulator should have 1 dimension"); auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); - PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, - "Beta2 power accumulator should have 1 dimension"); + // PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, + // "Beta2 power accumulator should have 1 dimension"); auto param_dims = ctx->GetInputDim("Param"); - if (ctx->GetInputsVarType("Grad")[0] == - framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - "Param and Grad input of AdamOp should have same dimension"); - } - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment1"), - "Param and Moment1 input of AdamOp should have same dimension"); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment2"), - "Param and Moment2 input of AdamOp should have same dimension"); + // if (ctx->GetInputsVarType("Grad")[0] == + // framework::proto::VarType::LOD_TENSOR) { + // PADDLE_ENFORCE_EQ( + // param_dims, ctx->GetInputDim("Grad"), + // "Param and Grad input of AdamOp should have same dimension"); + // } + // PADDLE_ENFORCE_EQ( + // param_dims, ctx->GetInputDim("Moment1"), + // "Param and Moment1 input of AdamOp should have same dimension"); + // PADDLE_ENFORCE_EQ( + // param_dims, ctx->GetInputDim("Moment2"), + // "Param and Moment2 input of AdamOp should have same dimension"); ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("Moment1Out", param_dims); ctx->SetOutputDim("Moment2Out", param_dims); } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index e05885f5f5..8df2e01b03 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -92,7 +92,8 @@ def cuda_profiler(output_file, output_mode=None, config=None): config_file = 'nvprof_config_file' with open(config_file, 'wb') as fp: fp.writelines([six.b("%s\n" % item) for item in config]) - core.nvprof_init(output_file, output_mode, config_file) + #Comment this for nvprof + #core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. core.nvprof_start() yield From 82726402be966ede1e15486d88f9a17c1d1b52b9 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 11 Dec 2018 19:27:49 +0800 Subject: [PATCH 237/719] exception safe --- .../details/parallel_ssa_graph_executor.cc | 51 +++++++++++++++---- .../details/parallel_ssa_graph_executor.h | 1 + paddle/fluid/framework/parallel_executor.cc | 15 ------ paddle/fluid/framework/threadpool.h | 1 - 4 files changed, 42 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index dfb40721d8..f1a07edf08 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -34,32 +34,63 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( executors_.emplace_back(new details::ThreadedSSAGraphExecutor( strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); } - VLOG(1) << "pool size: " << places_.size(); } FeedFetchList ParallelSSAGraphExecutor::Run( const std::vector &fetch_tensors) { - std::vector> run_futures; - FeedFetchList fetch_data; + std::vector> run_futures; + + std::vector fetch_datas; + FeedFetchList ret; + + fetch_datas.reserve(places_.size()); + ret.reserve(fetch_tensors.size()); + exception_holder_.Clear(); for (size_t i = 0; i < places_.size(); ++i) { - auto call = [this, i] { - // FIXME(Yancey1989): need to fix fetch data failed. - std::vector empty; - executors_[i]->Run(empty); + auto call = [this, i, &fetch_tensors]() -> FeedFetchList { + return executors_[i]->Run(fetch_tensors); }; + if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); } else { - call(); + try { + fetch_datas.emplace_back(std::move(call())); + } catch (...) { + exception_holder_.Catch(std::current_exception()); + break; + } } } + if (pool_) { for (auto &f : run_futures) { - f.wait(); + if (exception_holder_.IsCaught()) { + f.wait(); + } else { + try { + fetch_datas.emplace_back(std::move(f.get())); + } catch (...) { + exception_holder_.Catch(std::current_exception()); + } + } + } + } + if (exception_holder_.IsCaught()) { + exception_holder_.ReThrow(); + } + + for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { + std::vector lodtensor_ptrs; + lodtensor_ptrs.reserve(local_scopes_.size()); + for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { + lodtensor_ptrs.push_back(&fetch_datas.at(scope_idx).at(fetch_idx)); } + ret.emplace_back(); + ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); } - return fetch_data; + return ret; } } // namespace details diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index 37784775f0..bd777e41f8 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -44,6 +44,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { std::vector> graphs_; std::vector> executors_; + ExceptionHolder exception_holder_; }; } // namespace details diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2a9ca3e815..82a7bd2185 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -202,21 +202,6 @@ ParallelExecutor::ParallelExecutor( } } } - /** - std::vector> var_infos_list; - for (size_t i = 0; i < graphs.size(); ++i) { - std::vector var_infos; - for (auto &node : graphs[i]->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - } - } - var_infos_list.push_back(std::move(var_infos)); - } - **/ // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 5177b7ee02..8fd834be9a 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include #include // NOLINT #include #include // NOLINT From 5cc83f79bfa9516ef9c5f7f688f665deb47e0d07 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 11 Dec 2018 21:45:45 +0800 Subject: [PATCH 238/719] update by comment --- paddle/fluid/framework/parallel_executor.cc | 29 +++++++++++++-------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 82a7bd2185..b0cd1e8e90 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -110,23 +110,30 @@ ParallelExecutor::ParallelExecutor( // Bcast Parameters to all GPUs #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); - std::unique_ptr nccl_id = nullptr; + ncclUniqueId *nccl_id = nullptr; bool need_group_call = true; - if (nccl_id_var != nullptr) { - nccl_id.reset(nccl_id_var->GetMutable()); - } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { - nccl_id.reset(new ncclUniqueId()); - PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id.get())); - *member_->global_scope_->Var(NCCL_ID_VARNAME) - ->GetMutable() = *nccl_id.get(); + if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + // parallel graph mode should initialize nccl by ncclCommInitRank since + // it call nccl operator per device per thread. + if (nccl_id_var == nullptr) { + nccl_id = new ncclUniqueId(); + PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id)); + *member_->global_scope_->Var(NCCL_ID_VARNAME) + ->GetMutable() = *nccl_id; + } else { + nccl_id = nccl_id_var->GetMutable(); + } need_group_call = false; + } else if (nccl_id_var != nullptr) { // the other executor type. + // the distributed training with nccl mode would initialize the nccl id in + // startup_program. + nccl_id = nccl_id_var->GetMutable(); } else { - // init nccl_id in NCCLContextMap + // initlize NCCL by ncclCommInitAll, do not need nccl_id. } member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id.get(), num_trainers, trainer_id, - need_group_call)); + member_->places_, nccl_id, num_trainers, trainer_id, need_group_call)); #else PADDLE_THROW("Not compiled with CUDA"); #endif From 10ed9e0a6e3ab06e0b42172126bb8872828cbe60 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Tue, 11 Dec 2018 22:03:33 +0800 Subject: [PATCH 239/719] download & run & instance --- paddle/fluid/framework/async_executor.cc | 38 ++++++++++++++---------- paddle/fluid/framework/async_executor.h | 3 +- python/paddle/fluid/async_executor.py | 23 ++++++++------ 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 45a914b70e..f0ca375f95 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -191,18 +191,19 @@ void AsyncExecutor::SaveModel(const std::string& path) { } } -void AsyncExecutor::PrepareDenseThread() { - DensePullThreadParam param; - param.ps_client = _pslib_ptr->_worker_ptr;; - param.threshold = 1;//GlobalConfig::instance().pull_dense_per_batch; //TODO - param.training_thread_num = actual_thread_num; - param.root_scope = root_scope_; - //param.dense_params = &GlobalConfig::instance().dense_variable_name; //TODO - param.dense_params = &_param_config.dense_variable_name; - - _pull_dense_thread = std::shared_ptr(new DensePullThread(param)); - _pull_dense_thread->start(); - +void AsyncExecutor::PrepareDenseThread(const std::string& mode) { + if (mode == "mpi") { + DensePullThreadParam param; + param.ps_client = _pslib_ptr->_worker_ptr;; + param.threshold = 1;//GlobalConfig::instance().pull_dense_per_batch; //TODO + param.training_thread_num = actual_thread_num; + param.root_scope = root_scope_; + //param.dense_params = &GlobalConfig::instance().dense_variable_name; //TODO + param.dense_params = &_param_config.dense_variable_name; + + _pull_dense_thread = std::shared_ptr(new DensePullThread(param)); + _pull_dense_thread->start(); + } } void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, @@ -210,6 +211,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, const std::vector& filelist, const int thread_num, const std::vector& fetch_var_names, + const std::string& mode, const bool debug) { std::vector threads; @@ -251,11 +253,15 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, // todo: should be factory method for creating datafeed std::vector> readers; PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist); - PrepareDenseThread(); + PrepareDenseThread(mode); std::vector> workers; workers.resize(actual_thread_num); for (auto& worker : workers) { - worker.reset(new AsyncExecutorThreadWorker); + if (mode == "mpi") { + worker.reset(new AsyncExecutorThreadWorker); + } else { + worker.reset(new ExecutorThreadWorker); + } } // prepare thread resource here @@ -274,7 +280,9 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, for (auto& th : threads) { th.join(); } - _pull_dense_thread->stop(); + if (mode == "mpi") { + _pull_dense_thread->stop(); + } root_scope_->DropKids(); return; diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 4b46126217..93010f8a9b 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -61,6 +61,7 @@ class AsyncExecutor { const std::vector& filelist, const int thread_num, const std::vector& fetch_names, + const std::string& mode, const bool debug = false); //void ConfigPslib(const char* dist_desc, uint64_t* host_sign_list, int node_num, int index); void InitServer(const std::string& dist_desc, int index); @@ -79,7 +80,7 @@ class AsyncExecutor { const std::vector& fetch_var_names, Scope* root_scope, const int thread_index, const bool debug); - void PrepareDenseThread(); + void PrepareDenseThread(const std::string& mode); public: std::shared_ptr _pslib_ptr; std::shared_ptr _pull_dense_thread; diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index cce7ec5cca..e760d58fd2 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -87,9 +87,8 @@ class AsyncExecutor(object): scope = global_scope() self.executor = core.AsyncExecutor(scope, p) - self.instance = ps_instance.PaddlePSInstance(1, 2) - def run(self, program, data_feed, filelist, thread_num, fetch, debug=False): + def run(self, program, data_feed, filelist, thread_num, fetch, mode="", debug=False): """ Run program by this AsyncExecutor. Training dataset will be in filelist. Users can also inspect certain variables by naming them in parameter @@ -151,10 +150,11 @@ class AsyncExecutor(object): self.executor.run_from_files(program_desc, data_feed.desc(), filelist, thread_num, - fetch_var_names, debug) + fetch_var_names, mode, debug) def download_data(self, afs_path, local_path, fs_default_name, ugi, process_num=12): - hadoop_home = "$HADOOP_HOME" + #hadoop_home = "$HADOOP_HOME" + hadoop_home = "~/tools/hadoop-xingtian/hadoop/" configs = { "fs.default.name": fs_default_name, @@ -169,8 +169,11 @@ class AsyncExecutor(object): self.instance.get_worker_index(), self.instance.get_node_cnt() / 2, multi_processes=process_num) + self.instance.barrier_all() #wait for download_data #TODO only barriere worker - def config_distributed_nodes(self, dist_opt): + def config_distributed_nodes(self): + self.instance = ps_instance.PaddlePSInstance(1, 2) + return self.instance # get total rank # get rank index @@ -196,11 +199,15 @@ class AsyncExecutor(object): self.executor.gather_servers(ips, self.instance.get_node_cnt()) self.instance.barrier_all() #wait all worker start self.instance.barrier_all() #wait init model - self.instance.barrier_all() #wait for download_data + self.instance.barrier_all() #wait for download_data #TODO remove this after only barrier worker self.instance.barrier_all() #wait worker do all things self.instance.barrier_all() #sync - def init_worker(self, dist_desc, afs_path, local_path, fs_default_name, ugi): + def init_worker(self, dist_desc, startup_program): + place = core.CPUPlace() + executor = Executor(place) + executor.run(startup_program) + self.instance.barrier_all() #wait all server start ips = self.instance.gather_ips() self.executor.init_worker(dist_desc, ips, self.instance.get_node_cnt(), self.instance._rankid) @@ -208,8 +215,6 @@ class AsyncExecutor(object): if self.instance.is_first_worker(): self.executor.init_model() self.instance.barrier_all() #wait init model - self.download_data(afs_path, local_path, fs_default_name, ugi, process_num=12) - self.instance.barrier_all() #wait for download_data def init_model(self): self.executor.init_model() From acc6ae49b18cb55db4dd84cd09069ebe01a1b54a Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Wed, 12 Dec 2018 00:31:59 +0800 Subject: [PATCH 240/719] Fix the issue to run on AVX2 and AVX512F machines (#14851) test=develop --- .../fluid/operators/math/jit_kernel_layer_norm.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc index fead13ebad..cb49e66488 100644 --- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc +++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc @@ -79,16 +79,16 @@ class LayerNormKernelImpl : public LayerNormKernel { } }; -#define INTRIAVX_FLOAT(isa, block) \ +#define INTRIAVX_FLOAT(isa, jit_block) \ template <> \ - LayerNormKernelImpl::LayerNormKernelImpl(int right) \ + LayerNormKernelImpl::LayerNormKernelImpl(int right) \ : LayerNormKernel() { \ this->num_ = right; \ this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \ this->end_ = this->num_ - this->rest_; \ } \ template <> \ - void LayerNormKernelImpl::Compute( \ + void LayerNormKernelImpl::Compute( \ float* x, float* out, float* mean, float* var, const float* scale, \ const float* bias, int height, const float epsilon) const { \ __m256 sum; \ @@ -97,6 +97,7 @@ class LayerNormKernelImpl : public LayerNormKernel { __m256 tmp; \ size_t offset; \ size_t j; \ + size_t block = YMM_FLOAT_BLOCK; \ __m256 reverse_num_vec = \ _mm256_div_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(this->num_)); \ __m256 epsilon_vec = _mm256_set1_ps(epsilon); \ @@ -221,12 +222,14 @@ INTRIAVX_FLOAT(platform::avx, kEQ8); INTRIAVX_FLOAT(platform::avx, kGT8LT16); INTRIAVX_FLOAT(platform::avx, kEQ16); INTRIAVX_FLOAT(platform::avx, kGT16); -#endif -#ifdef __AVX2__ INTRIAVX_FLOAT(platform::avx2, kEQ8); INTRIAVX_FLOAT(platform::avx2, kGT8LT16); INTRIAVX_FLOAT(platform::avx2, kEQ16); INTRIAVX_FLOAT(platform::avx2, kGT16); +INTRIAVX_FLOAT(platform::avx512f, kEQ8); +INTRIAVX_FLOAT(platform::avx512f, kGT8LT16); +INTRIAVX_FLOAT(platform::avx512f, kEQ16); +INTRIAVX_FLOAT(platform::avx512f, kGT16); #endif #undef INTRIAVX_FLOAT From 28eb7d840c9baaf49303cada7fcc71f557abb78a Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 11 Dec 2018 09:29:15 +0000 Subject: [PATCH 241/719] test all impls and all inplace cases --- paddle/fluid/operators/jit/helper.h | 53 ++++++++----- paddle/fluid/operators/jit/test.cc | 117 +++++++++++++++++++++------- 2 files changed, 121 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index c8da960a1e..09a6bc3d9d 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -28,33 +28,16 @@ namespace paddle { namespace operators { namespace jit { -// Refer code do not related with attr, and always on CPUPlace -template -inline Func GetRefer() { - auto& ref_pool = ReferKernelPool().Instance().AllKernels(); - KernelKey kkey(KT, platform::CPUPlace()); - auto ref_iter = ref_pool.find(kkey); - PADDLE_ENFORCE(ref_iter != ref_pool.end(), - "Every Kernel should have reference function."); - auto& ref_impls = ref_iter->second; - for (auto& impl : ref_impls) { - auto i = dynamic_cast*>(impl.get()); - if (i) { - return i->GetFunc(); - } - } - return nullptr; -} - template -const Func Get(Attr attr) { + typename PlaceType> +inline const Func GetJitCode(Attr attr) { size_t key = JitCodeKey(attr); auto& codes = JitCodePool().Instance(); if (codes.Has(key)) { return codes.AllKernels().at(key)->template getCode(); } + // creator is not related with attr, so can use KernelKey as key KernelKey kkey(KT, PlaceType()); if (std::is_same::value) { // pool: (KernelKey(type, place), vector) @@ -73,8 +56,38 @@ const Func Get(Attr attr) { } } } + return nullptr; +} + +// Refer code do not related with attr, which is just for cast +// Refer is always on CPUPlace +template +inline Func GetRefer() { + auto& ref_pool = ReferKernelPool().Instance().AllKernels(); + KernelKey kkey(KT, platform::CPUPlace()); + auto ref_iter = ref_pool.find(kkey); + PADDLE_ENFORCE(ref_iter != ref_pool.end(), + "Every Kernel should have reference function."); + auto& ref_impls = ref_iter->second; + for (auto& impl : ref_impls) { + auto i = dynamic_cast*>(impl.get()); + if (i) { + return i->GetFunc(); + } + } + return nullptr; +} + +template +const Func Get(Attr attr) { + auto jitfunc = GetJitCode(attr); + if (jitfunc) { + return jitfunc; + } // pool: (KernelKey(type, place), vector) + KernelKey kkey(KT, PlaceType()); auto& pool = KernelPool().Instance().AllKernels(); auto iter = pool.find(kkey); if (iter != pool.end()) { diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index e531ba1a2c..e523089101 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -55,46 +55,105 @@ void ExpectEQ(const T* target, const T* refer, int n) { } } +std::vector TestSizes() { + std::vector s; + for (int i = 1; i < 30; ++i) { + s.push_back(i); + } + // test some large size + s.push_back(100); + s.push_back(1000); + return s; +} + +template +void TestTartgetFunc(const Func tgt, const std::vector& x, + const std::vector& y, const std::vector& zref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(zref.size(), x.size()); + EXPECT_EQ(zref.size(), y.size()); + const T* x_data = x.data(); + const T* y_data = y.data(); + const T* zref_data = zref.data(); + const int d = zref.size(); + + std::vector ztgt(d); + T* ztgt_data = ztgt.data(); + // test normal + tgt(x_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ztgt.begin()); + tgt(ztgt_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace y + std::copy(y.begin(), y.end(), ztgt.begin()); + tgt(x_data, ztgt_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); +} + TEST(JitKernel, vmul) { using T = float; using PlaceType = paddle::platform::CPUPlace; - namespace jit = paddle::operators::jit; - // TODO(TJ): test more vector size - for (int d = 1; d < 30; ++d) { - auto ref = jit::GetRefer::func_type, + const auto KT = jit::vmul; + for (int d : TestSizes()) { + auto ref = jit::GetRefer::func_type, jit::VMulTuples::attr_type>(); - auto tgt = jit::Get::func_type, - jit::VMulTuples::attr_type, PlaceType>(d); EXPECT_TRUE(ref != nullptr); - EXPECT_TRUE(tgt != nullptr); - std::vector x(d), y(d); - std::vector zref(d), ztgt(d); + std::vector x(d), y(d), zref(d); RandomVec(d, x.data()); RandomVec(d, y.data()); - const float* x_data = x.data(); - const float* y_data = y.data(); - float* ztgt_data = ztgt.data(); - float* zref_data = zref.data(); - tgt(x_data, y_data, ztgt_data, d); + std::vector xinp(d), yinp(d); // inplace test + std::copy(x.begin(), x.end(), xinp.begin()); + std::copy(y.begin(), y.end(), yinp.begin()); + + const T* x_data = x.data(); + const T* y_data = y.data(); + T* zref_data = zref.data(); + T* xinp_data = xinp.data(); + T* yinp_data = yinp.data(); + + // test refer code inplace ref(x_data, y_data, zref_data, d); - ExpectEQ(ztgt_data, zref_data, d); - - // test inplace x - std::copy(x.begin(), x.end(), zref.begin()); - std::copy(x.begin(), x.end(), ztgt.begin()); - tgt(ztgt_data, y_data, ztgt_data, d); - ref(zref_data, y_data, zref_data, d); - ExpectEQ(ztgt_data, zref_data, d); - - // test inplace y - std::copy(y.begin(), y.end(), zref.begin()); - std::copy(y.begin(), y.end(), ztgt.begin()); - tgt(x_data, ztgt_data, ztgt_data, d); - ref(x_data, zref_data, zref_data, d); - ExpectEQ(ztgt_data, zref_data, d); + ref(x_data, yinp_data, yinp_data, d); + ref(xinp_data, y_data, xinp_data, d); + ExpectEQ(xinp_data, zref_data, d); + ExpectEQ(yinp_data, zref_data, d); + + // test jitcode + auto jitcode = jit::GetJitCode::func_type, + jit::VMulTuples::attr_type, PlaceType>(d); + if (jitcode) { + VLOG(10) << "Test jitcode, size: " << d; + TestTartgetFunc::func_type>(jitcode, x, y, zref); + } + + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = + dynamic_cast::func_type, + jit::VMulTuples::attr_type>*>( + impl.get()); + if (i && i->UseMe(d)) { + auto more = i->GetFunc(); + VLOG(10) << "Test More Kernel, size: " << d; + TestTartgetFunc::func_type>(more, x, y, zref); + } + } + } + // Test result from Get function + VLOG(10) << "Test Get function, size: " << d; + auto tgt = jit::Get::func_type, + jit::VMulTuples::attr_type, PlaceType>(d); + TestTartgetFunc::func_type>(tgt, x, y, zref); } } From d538513fce27c9442553b39a1cff9e482c922196 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 11 Dec 2018 18:07:07 +0000 Subject: [PATCH 242/719] fix the compile error on mac --- paddle/fluid/operators/jit/gen_base.h | 6 +++--- paddle/fluid/operators/jit/helper.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index 3b874cf2b0..586f4389c0 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -30,13 +30,13 @@ class GenBase : public Kernel { virtual const char* name() const = 0; virtual size_t getSize() const = 0; virtual const unsigned char* getCodeInternal() = 0; - template - const FUNC getCode() { + template + Func getCode() { const unsigned char* code = this->getCodeInternal(); if (FLAGS_dump_jitcode) { this->dumpCode(code); } - return reinterpret_cast(code); + return reinterpret_cast(const_cast(code)); } protected: diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 09a6bc3d9d..b7580f6efb 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -30,7 +30,7 @@ namespace jit { template -inline const Func GetJitCode(Attr attr) { +inline Func GetJitCode(Attr attr) { size_t key = JitCodeKey(attr); auto& codes = JitCodePool().Instance(); if (codes.Has(key)) { @@ -80,7 +80,7 @@ inline Func GetRefer() { template -const Func Get(Attr attr) { +Func Get(Attr attr) { auto jitfunc = GetJitCode(attr); if (jitfunc) { return jitfunc; From 1500c8e621d4d356d27c1483f7068839f8fb66f6 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 12 Dec 2018 09:58:04 +0800 Subject: [PATCH 243/719] is instance is None --- python/paddle/fluid/async_executor.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index e760d58fd2..2a6a11805e 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -87,6 +87,7 @@ class AsyncExecutor(object): scope = global_scope() self.executor = core.AsyncExecutor(scope, p) + self.instance = None def run(self, program, data_feed, filelist, thread_num, fetch, mode="", debug=False): """ @@ -154,6 +155,9 @@ class AsyncExecutor(object): def download_data(self, afs_path, local_path, fs_default_name, ugi, process_num=12): #hadoop_home = "$HADOOP_HOME" + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') + hadoop_home = "~/tools/hadoop-xingtian/hadoop/" configs = { @@ -182,15 +186,21 @@ class AsyncExecutor(object): pass def get_instance(self): + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') return self.instance def stop_server(self): + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') self.instance.barrier_all() #worker do all things if self.instance.is_first_worker(): self.executor.stop_server() self.instance.barrier_all() #sync def init_server(self, dist_desc): + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') self.executor.init_server(dist_desc, self.instance._rankid) ip = self.executor.start_server() self.instance.set_ip(ip) @@ -204,6 +214,8 @@ class AsyncExecutor(object): self.instance.barrier_all() #sync def init_worker(self, dist_desc, startup_program): + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') place = core.CPUPlace() executor = Executor(place) executor.run(startup_program) @@ -217,8 +229,12 @@ class AsyncExecutor(object): self.instance.barrier_all() #wait init model def init_model(self): + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') self.executor.init_model() def save_model(self, save_path): + if self.instance is None: + raise ValueError('instance is None, please run config_distributed_nodes init instance') self.executor.save_model(save_path) From 7ec3264b513270fa7a70c2b5fec2166630568a2c Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 12 Dec 2018 10:56:00 +0800 Subject: [PATCH 244/719] fix API spec. test=develop --- paddle/fluid/API.spec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 87ed586aad..845abe7d5b 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -77,8 +77,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, True, None)) -paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, True, None)) +paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, False, None)) +paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, False, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) From 9bd70a1e0433b5a930c43b1d7d2af67bc72d38a6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 11 Dec 2018 16:32:42 +0800 Subject: [PATCH 245/719] Change tensor uses proto::VarType::type test=develop --- .../fluid/framework/data_layout_transform.cc | 6 +- .../fluid/framework/data_layout_transform.h | 16 ++-- paddle/fluid/framework/data_type.cc | 24 ++---- paddle/fluid/framework/data_type.h | 77 +++++++++++-------- paddle/fluid/framework/data_type_test.cc | 6 +- .../framework/details/all_reduce_op_handle.cc | 2 +- .../framework/details/fuse_vars_op_handle.h | 4 +- .../framework/details/reduce_op_handle.cc | 4 +- paddle/fluid/framework/dlpack_tensor.cc | 37 ++++----- paddle/fluid/framework/dlpack_tensor_test.cc | 20 +---- .../fluid/framework/executor_thread_worker.cc | 46 ++++------- paddle/fluid/framework/lod_tensor.cc | 6 +- paddle/fluid/framework/operator.cc | 14 ++-- paddle/fluid/framework/selected_rows.cc | 4 +- paddle/fluid/framework/tensor.cc | 4 +- paddle/fluid/framework/tensor.h | 9 ++- paddle/fluid/framework/tensor_impl.h | 12 ++- paddle/fluid/framework/tensor_util.cc | 10 +-- .../fluid/inference/api/analysis_predictor.cc | 4 +- paddle/fluid/inference/api/api_impl.cc | 4 +- paddle/fluid/inference/api/api_impl_tester.cc | 4 +- paddle/fluid/operators/affine_grid_op.cc | 8 +- paddle/fluid/operators/arg_max_op.cc | 1 - paddle/fluid/operators/arg_max_op.cu | 2 - paddle/fluid/operators/arg_min_op.cc | 1 - paddle/fluid/operators/arg_min_op.cu | 2 - .../fluid/operators/array_to_lod_tensor_op.cc | 4 +- paddle/fluid/operators/attention_lstm_op.cc | 5 +- .../fluid/operators/average_accumulates_op.cc | 5 +- paddle/fluid/operators/batch_norm_op.cc | 20 ++--- .../fluid/operators/beam_search_decode_op.cc | 2 +- paddle/fluid/operators/beam_search_op.cc | 3 +- paddle/fluid/operators/bpr_loss_op.cc | 10 +-- .../controlflow/conditional_block_op.cc | 13 ++-- .../fluid/operators/controlflow/while_op.cc | 2 +- paddle/fluid/operators/conv_op.cc | 12 ++- paddle/fluid/operators/conv_transpose_op.cc | 10 +-- paddle/fluid/operators/crf_decoding_op.cc | 5 +- paddle/fluid/operators/crop_op.cc | 9 +-- paddle/fluid/operators/cross_entropy_op.cc | 10 +-- paddle/fluid/operators/ctc_align_op.cc | 5 +- .../detection/anchor_generator_op.cc | 3 +- .../operators/detection/bipartite_match_op.cc | 5 +- .../detection/density_prior_box_op.cc | 3 +- .../detection/generate_proposals_op.cc | 5 +- .../detection/mine_hard_examples_op.cc | 3 +- .../operators/detection/multiclass_nms_op.cc | 3 +- .../fluid/operators/detection/prior_box_op.cc | 3 +- .../detection/roi_perspective_transform_op.cc | 10 +-- .../detection/rpn_target_assign_op.cc | 3 +- .../operators/detection/target_assign_op.cc | 5 +- paddle/fluid/operators/detection_map_op.cc | 3 +- .../operators/elementwise/elementwise_op.h | 4 +- paddle/fluid/operators/fake_quantize_op.cc | 10 +-- paddle/fluid/operators/fc_op.cc | 10 +-- paddle/fluid/operators/fill_constant_op.cc | 4 +- paddle/fluid/operators/fill_op.cc | 4 +- .../fused/fused_elemwise_activation_op.cc | 10 +-- .../fused/fused_embedding_fc_lstm_op.cc | 3 +- paddle/fluid/operators/fused/fusion_gru_op.cc | 5 +- .../fluid/operators/fused/fusion_lstm_op.cc | 5 +- .../fused/fusion_seqconv_eltadd_relu_op.cc | 5 +- .../fused/fusion_seqexpand_concat_fc_op.cc | 5 +- paddle/fluid/operators/gather_op.cc | 10 +-- paddle/fluid/operators/grid_sampler_op.cc | 12 +-- paddle/fluid/operators/group_norm_op.cc | 3 +- .../operators/hierarchical_sigmoid_op.cc | 10 +-- paddle/fluid/operators/interpolate_op.cc | 8 +- paddle/fluid/operators/is_empty_op.cc | 3 +- paddle/fluid/operators/isfinite_op.cc | 5 +- paddle/fluid/operators/layer_norm_op.cc | 3 +- paddle/fluid/operators/linear_chain_crf_op.cc | 9 +-- paddle/fluid/operators/load_combine_op.cc | 2 +- paddle/fluid/operators/load_op.cc | 2 +- paddle/fluid/operators/lod_reset_op.cc | 10 +-- .../fluid/operators/lod_tensor_to_array_op.cc | 2 +- .../fluid/operators/lookup_sparse_table_op.cc | 3 +- paddle/fluid/operators/lrn_op.cc | 5 +- paddle/fluid/operators/lstm_op.cc | 6 +- paddle/fluid/operators/lstmp_op.cc | 6 +- paddle/fluid/operators/math/math_function.cc | 6 +- paddle/fluid/operators/math/math_function.cu | 2 +- paddle/fluid/operators/mean_iou_op.cc | 5 +- paddle/fluid/operators/mean_op.cc | 4 +- paddle/fluid/operators/merge_lod_tensor_op.cc | 4 +- paddle/fluid/operators/metrics/accuracy_op.cc | 5 +- paddle/fluid/operators/metrics/auc_op.cc | 5 +- .../operators/metrics/precision_recall_op.cc | 5 +- paddle/fluid/operators/multiplex_op.cc | 10 +-- paddle/fluid/operators/nce_op.cc | 10 +-- .../fluid/operators/optimizers/adadelta_op.cc | 5 +- .../fluid/operators/optimizers/adagrad_op.cc | 5 +- paddle/fluid/operators/optimizers/adam_op.cc | 3 +- .../fluid/operators/optimizers/adamax_op.cc | 5 +- .../optimizers/decayed_adagrad_op.cc | 5 +- paddle/fluid/operators/optimizers/ftrl_op.cc | 3 +- .../optimizers/proximal_adagrad_op.cc | 5 +- .../operators/optimizers/proximal_gd_op.cc | 5 +- paddle/fluid/operators/pad2d_op.cc | 8 +- .../fluid/operators/pad_constant_like_op.cc | 10 +-- paddle/fluid/operators/pool_op.cc | 7 +- paddle/fluid/operators/pool_with_index_op.cc | 10 +-- .../operators/positive_negative_pair_op.cc | 5 +- paddle/fluid/operators/prelu_op.cc | 10 +-- paddle/fluid/operators/print_op.cc | 2 +- paddle/fluid/operators/random_crop_op.cc | 5 +- .../reader/create_batch_reader_op.cc | 4 +- paddle/fluid/operators/recurrent_op.cc | 2 +- paddle/fluid/operators/reshape_op.cc | 14 ++-- .../fluid/operators/rnn_memory_helper_op.cc | 2 +- paddle/fluid/operators/roi_align_op.cc | 10 +-- paddle/fluid/operators/roi_pool_op.cc | 10 +-- paddle/fluid/operators/save_combine_op.cc | 2 +- paddle/fluid/operators/save_op.cc | 2 +- paddle/fluid/operators/scatter_op.cc | 10 +-- .../sequence_ops/sequence_pool_op.cc | 5 +- .../sequence_ops/sequence_scatter_op.cc | 10 +-- .../sequence_ops/sequence_slice_op.cc | 10 +-- .../sequence_ops/sequence_softmax_op.cc | 4 +- paddle/fluid/operators/similarity_focus_op.cc | 5 +- paddle/fluid/operators/slice_op.cc | 5 +- paddle/fluid/operators/softmax_op.cc | 7 +- .../softmax_with_cross_entropy_op.cc | 8 +- paddle/fluid/operators/sum_op.cc | 13 ++-- .../operators/tensorrt/tensorrt_engine_op.h | 5 +- paddle/fluid/operators/transpose_op.cc | 9 +-- paddle/fluid/operators/unpool_op.cc | 10 +-- paddle/fluid/operators/warpctc_op.cc | 10 +-- paddle/fluid/operators/yolov3_loss_op.cc | 10 +-- paddle/fluid/platform/nccl_helper.h | 11 +-- paddle/fluid/pybind/pybind.cc | 2 +- paddle/fluid/pybind/tensor_py.h | 2 +- 132 files changed, 407 insertions(+), 576 deletions(-) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 5467f6d1b2..72c50518af 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -85,7 +85,7 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var, out->mutable_data(expected_kernel_type.place_, in.type()); framework::VisitDataType( - framework::ToDataType(in.type()), + in.type(), CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out)); out->set_layout(expected_kernel_type.data_layout_); @@ -101,7 +101,7 @@ void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { case mkldnn::memory::data_type::f32: return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::s8: - return platform::to_void_cast(tensor.data()); + return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::u8: return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::s16: @@ -144,7 +144,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, memory::data_type in_type = ToMKLDNNDataType(in.type()); PADDLE_ENFORCE(in_type != memory::data_type::data_undef, - "Input tensor type is not supported: ", in.type().name()); + "Input tensor type is not supported: %s", in.type()); memory::data_type out_type = in_type; auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 90bb206ec6..2479de4fd4 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -50,14 +50,14 @@ inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) { } } -inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { - static const std::map dict{ - {std::type_index(typeid(float)), MKLDNNDataType::f32}, // NOLINT - {std::type_index(typeid(char)), MKLDNNDataType::s8}, // NOLINT - {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8}, - {std::type_index(typeid(int16_t)), MKLDNNDataType::s16}, - {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}}; - auto iter = dict.find(type); +inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { + static std::unordered_map dict{ + {DataTypeTrait::DataType, MKLDNNDataType::f32}, + {DataTypeTrait::DataType, MKLDNNDataType::s8}, + {DataTypeTrait::DataType, MKLDNNDataType::u8}, + {DataTypeTrait::DataType, MKLDNNDataType::s16}, + {DataTypeTrait::DataType, MKLDNNDataType::s32}}; + auto iter = dict.find(static_cast(type)); if (iter != dict.end()) return iter->second; return MKLDNNDataType::data_undef; } diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 28f3da88fa..a0248cf3c7 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -26,7 +26,7 @@ struct DataTypeMap { std::unordered_map cpp_to_proto_; std::unordered_map proto_to_cpp_; std::unordered_map proto_to_str_; - std::unordered_map cpp_to_size_; + std::unordered_map proto_to_size_; }; static DataTypeMap* InitDataTypeMap(); @@ -45,7 +45,7 @@ static inline void RegisterType(DataTypeMap* map, map->proto_to_cpp_.emplace(static_cast(proto_type), typeid(T)); map->cpp_to_proto_.emplace(typeid(T), proto_type); map->proto_to_str_.emplace(static_cast(proto_type), name); - map->cpp_to_size_.emplace(typeid(T), sizeof(T)); + map->proto_to_size_.emplace(static_cast(proto_type), sizeof(T)); } static DataTypeMap* InitDataTypeMap() { @@ -54,17 +54,7 @@ static DataTypeMap* InitDataTypeMap() { #define RegType(cc_type, proto_type) \ RegisterType(retv, proto_type, #cc_type) - // NOTE: Add your customize type here. - RegType(float16, proto::VarType::FP16); - RegType(float, proto::VarType::FP32); - RegType(double, proto::VarType::FP64); - RegType(int, proto::VarType::INT32); - RegType(int64_t, proto::VarType::INT64); - RegType(bool, proto::VarType::BOOL); - RegType(size_t, proto::VarType::SIZE_T); - RegType(int16_t, proto::VarType::INT16); - RegType(uint8_t, proto::VarType::UINT8); - RegType(int8_t, proto::VarType::INT8); + _ForEachDataType_(RegType); #undef RegType return retv; @@ -96,12 +86,12 @@ std::string DataTypeToString(const proto::VarType::Type type) { static_cast(type)); } -size_t SizeOfType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_size_.find(type); - if (it != gDataTypeMap().cpp_to_size_.end()) { +size_t SizeOfType(proto::VarType::Type type) { + auto it = gDataTypeMap().proto_to_size_.find(static_cast(type)); + if (it != gDataTypeMap().proto_to_size_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", type.name()); + PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type)); } } // namespace framework diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index d5be43b33e..76df78ea5e 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -22,46 +22,59 @@ limitations under the License. */ namespace paddle { namespace framework { +template +struct DataTypeTrait {}; + +// Stub handle for void +template <> +struct DataTypeTrait { + constexpr static auto DataType = proto::VarType::RAW; +}; + +#define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ + callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); + +#define _ForEachDataType_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8) + +#define DefineDataTypeTrait(cpp_type, proto_type) \ + template <> \ + struct DataTypeTrait { \ + constexpr static auto DataType = proto_type; \ + } + +_ForEachDataType_(DefineDataTypeTrait); + +#undef DefineDataTypeTrait + extern proto::VarType::Type ToDataType(std::type_index type); extern std::type_index ToTypeIndex(proto::VarType::Type type); template inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { - switch (type) { - case proto::VarType::FP16: - visitor.template apply(); - break; - case proto::VarType::FP32: - visitor.template apply(); - break; - case proto::VarType::FP64: - visitor.template apply(); - break; - case proto::VarType::INT32: - visitor.template apply(); - break; - case proto::VarType::INT64: - visitor.template apply(); - break; - case proto::VarType::BOOL: - visitor.template apply(); - break; - case proto::VarType::UINT8: - visitor.template apply(); - break; - case proto::VarType::INT16: - visitor.template apply(); - break; - case proto::VarType::INT8: - visitor.template apply(); - break; - default: - PADDLE_THROW("Not supported %d", type); - } +#define VisitDataTypeCallback(cpp_type, proto_type) \ + do { \ + if (type == proto_type) { \ + visitor.template apply(); \ + return; \ + } \ + } while (0) + + _ForEachDataType_(VisitDataTypeCallback); +#undef VisitDataTypeCallback + PADDLE_THROW("Not supported %d", type); } extern std::string DataTypeToString(const proto::VarType::Type type); -extern size_t SizeOfType(std::type_index type); +extern size_t SizeOfType(proto::VarType::Type type); inline std::ostream& operator<<(std::ostream& out, const proto::VarType::Type& type) { out << DataTypeToString(type); diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 54c41c55ba..92639dfc61 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -26,13 +26,13 @@ TEST(DataType, float16) { Tensor tensor; CPUPlace cpu; - tensor.mutable_data(cpu, f::ToTypeIndex(dtype)); + tensor.mutable_data(cpu, dtype); // test fp16 tensor - EXPECT_EQ(tensor.type(), std::type_index(typeid(float16))); + EXPECT_EQ(tensor.type(), f::ToDataType(typeid(float16))); // test fp16 size - EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u); + EXPECT_EQ(f::SizeOfType(dtype), 2u); // test debug info std::string type = "float16"; diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index e8bf53e160..9eaff1f560 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() { // Reduce All Tensor to trg in CPU ReduceLoDTensor func(lod_tensors, &trg); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + VisitDataType(lod_tensors[0]->type(), func); for (size_t i = 1; i < local_scopes_.size(); ++i) { auto &scope = diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h index 3f360c510a..b40b01df36 100644 --- a/paddle/fluid/framework/details/fuse_vars_op_handle.h +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h @@ -33,7 +33,7 @@ struct FuseVarsOpHandle : public OpHandleBase { FuseVarsOpHandle(ir::Node *node, Scope *local_scope, const platform::Place &place, const std::unordered_map &inputs_numel, - const std::type_index &var_type) + const proto::VarType::Type var_type) : OpHandleBase(node), local_scope_(local_scope), place_(place), @@ -57,7 +57,7 @@ struct FuseVarsOpHandle : public OpHandleBase { Scope *local_scope_; const platform::Place place_; const std::unordered_map inputs_numel_; - const std::type_index type_; + const proto::VarType::Type type_; int64_t total_numel_; }; } // namespace details diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index cb864848b9..85d8abc910 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -246,7 +246,7 @@ void ReduceOpHandle::RunImpl() { if (!FLAGS_cpu_deterministic) { ReduceLoDTensor func(lod_tensors, out_var->GetMutable()); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + VisitDataType(lod_tensors[0]->type(), func); } else { // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0 // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0. @@ -256,7 +256,7 @@ void ReduceOpHandle::RunImpl() { ->FindVar(out_var_handle->name_) ->GetMutable(); ReduceLoDTensor func(lod_tensors, &reduce_sum_trg); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + VisitDataType(lod_tensors[0]->type(), func); auto trg = out_var->GetMutable(); if (reduce_sum_trg.data() != trg->data()) { diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 04e3f78afe..eaef093ed3 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/dlpack_tensor.h" - +#include "paddle/fluid/framework/data_type.h" namespace paddle { namespace framework { @@ -36,26 +36,23 @@ static ::DLDataType GetDLDataTypeCode() { return dtype; } -static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) { -#define REG_DL_DATA_TYPE(type) \ - { std::type_index(typeid(type)), GetDLDataTypeCode() } - static const std::unordered_map - type_to_dtype_map({ - REG_DL_DATA_TYPE(platform::float16), // NOLINT - REG_DL_DATA_TYPE(float), // NOLINT - REG_DL_DATA_TYPE(double), // NOLINT - REG_DL_DATA_TYPE(int), // NOLINT - REG_DL_DATA_TYPE(int64_t), // NOLINT - REG_DL_DATA_TYPE(bool), // NOLINT - REG_DL_DATA_TYPE(size_t), // NOLINT - REG_DL_DATA_TYPE(int16_t), // NOLINT - REG_DL_DATA_TYPE(uint8_t), // NOLINT - REG_DL_DATA_TYPE(int8_t) // NOLINT - }); +static std::unordered_map CreateDLDataTypeMap() { + static std::unordered_map result; + +#define REG_DL_DATA_TYPE(cpp_type, proto_type) \ + result[static_cast(proto_type)] = GetDLDataTypeCode() + + _ForEachDataType_(REG_DL_DATA_TYPE); +#undef REG_DL_DATA_TYPE + return result; +} + +static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { + static auto type_to_dtype_map = CreateDLDataTypeMap(); static auto type_to_dtype_map_end_it = type_to_dtype_map.end(); - auto it = type_to_dtype_map.find(type); - PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %s", - type.name()); + auto it = type_to_dtype_map.find(static_cast(type)); + PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %d", + type); return it->second; #undef REG_DL_DATA_TYPE } diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 938b056350..c0a8e1bcdf 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -91,23 +91,11 @@ void TestMainLoop() { } } } +TEST(dlpack, test_all) { +#define TestCallback(cpp_type, proto_type) TestMainLoop() -#define PADDLE_DLPACK_TEST(type) \ - TEST(dlpack, test_##type) { TestMainLoop(); } - -using float16 = platform::float16; -PADDLE_DLPACK_TEST(float16); -PADDLE_DLPACK_TEST(float); -PADDLE_DLPACK_TEST(double); -PADDLE_DLPACK_TEST(int); -PADDLE_DLPACK_TEST(int64_t); -PADDLE_DLPACK_TEST(bool); -PADDLE_DLPACK_TEST(size_t); -PADDLE_DLPACK_TEST(int16_t); -PADDLE_DLPACK_TEST(uint8_t); -PADDLE_DLPACK_TEST(int8_t); - -#undef PADDLE_DLPACK_TEST + _ForEachDataType_(TestCallback); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 3d53511615..f03f39dfc6 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -138,39 +138,19 @@ void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) { std::cout << sstream.str() << std::endl; } -void print_fetch_var(Scope* scope, std::string var_name) { - const LoDTensor& tensor = scope->FindVar(var_name)->Get(); - - if (std::type_index(tensor.type()) == - std::type_index(typeid(platform::float16))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == std::type_index(typeid(float))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(double))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == std::type_index(typeid(int))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(int64_t))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == std::type_index(typeid(bool))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(uint8_t))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(int16_t))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(int8_t))) { - print_lod_tensor(var_name, tensor); - } else { - VLOG(1) << "print_fetch_var: unrecognized data type:" - << tensor.type().name(); - } - - return; +static void print_fetch_var(Scope* scope, const std::string& var_name) { + auto& tensor = scope->FindVar(var_name)->Get(); + +#define PrintLoDTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor.type() == proto_type) { \ + print_lod_tensor(var_name, tensor); \ + return; \ + } \ + } while (0) + + _ForEachDataType_(PrintLoDTensorCallback); + VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type(); } void ExecutorThreadWorker::TrainFiles() { diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 9b2eeaf59a..6c8bec32de 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -70,9 +70,9 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { // only print first ten elements int64_t size = t.numel() < 10 ? t.numel() : 10; for (int64_t i = 0; i < size; ++i) { - if (IsType(t.type())) { + if (t.type() == proto::VarType::FP32) { os << t.data()[i] << " "; - } else if (IsType(t.type())) { + } else if (t.type() == proto::VarType::INT64) { os << t.data()[i] << " "; } else { PADDLE_THROW("LoDTensor data type not in [float, int64_t]"); @@ -387,7 +387,7 @@ void LoDTensor::MergeLoDTensor( PADDLE_ENFORCE(!lod_tensors.empty()); framework::DDim new_dim = lod_tensors[0]->dims(); - std::type_index new_type = lod_tensors[0]->type(); + auto new_type = lod_tensors[0]->type(); framework::DataLayout new_layout = lod_tensors[0]->layout(); LoD new_lod = lod_tensors[0]->lod(); for (size_t i = 1; i < lod_tensors.size(); ++i) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c6f3254e9f..05ab48412a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -43,10 +43,9 @@ std::vector> kKernelPriority = { proto::VarType::Type GetDataTypeOfVar(const Variable* var) { if (var->IsType()) { - return framework::ToDataType(var->Get().type()); + return var->Get().type(); } else if (var->IsType()) { - return framework::ToDataType( - var->Get().value().type()); + return var->Get().value().type(); } else { PADDLE_THROW("Var should be LoDTensor or SelectedRows"); } @@ -93,13 +92,13 @@ static std::string GetDtype(const Scope& scope, const std::string& name) { if (UNLIKELY(!tensor.IsInitialized())) { return ""; } - return DataTypeToString(ToDataType(tensor.type())); + return DataTypeToString(tensor.type()); } else if (var->IsType()) { auto tensor = var->Get().value(); if (UNLIKELY(!tensor.IsInitialized())) { return "uninited"; } else { - return DataTypeToString(ToDataType(tensor.type())); + return DataTypeToString(tensor.type()); } } else { return ""; @@ -686,7 +685,8 @@ static void CheckTensorNANOrInf(const std::string& name, if (tensor.memory_size() == 0) { return; } - if (!IsType(tensor.type()) && !IsType(tensor.type())) { + if (tensor.type() != proto::VarType::FP32 && + tensor.type() != proto::VarType::FP64) { return; } PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), @@ -879,7 +879,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { - int tmp = static_cast(ToDataType(t->type())); + int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 62a30815d4..54a818250b 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -218,11 +218,11 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, if (index < 0) { VLOG(5) << "id " << id << " not in the table, return 0"; framework::VisitDataType( - framework::ToDataType(value_->type()), + value_->type(), TensorFillVisitor(value, i * value_width, value_width, 0.0)); } else { framework::VisitDataType( - framework::ToDataType(value_->type()), + value_->type(), TensorCopyVisitor(value, i * value_width, *value_.get(), index * value_width, value_width)); } diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 41566800e5..57335847a1 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace framework { -extern size_t SizeOfType(std::type_index type); +extern size_t SizeOfType(proto::VarType::Type type); void Tensor::check_memory_size() const { PADDLE_ENFORCE_NOT_NULL( holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); @@ -31,7 +31,7 @@ size_t Tensor::memory_size() const { return holder_ == nullptr ? 0UL : holder_->size() - offset_; } -void* Tensor::mutable_data(platform::Place place, std::type_index type, +void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type, memory::Allocator::Attr attr, size_t requested_size) { type_ = type; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 71e8badd4b..057fe1f98c 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include @@ -67,7 +68,7 @@ class Tensor { friend struct EigenVector; public: - Tensor() : type_(typeid(float)), offset_(0) {} + Tensor() : type_(proto::VarType::FP32), offset_(0) {} /*! Return a pointer to mutable memory block. */ template @@ -88,7 +89,7 @@ class Tensor { memory::Allocator::Attr attr = memory::Allocator::kDefault, size_t requested_size = 0); - void* mutable_data(platform::Place place, std::type_index type, + void* mutable_data(platform::Place place, proto::VarType::Type type, memory::Allocator::Attr attr = memory::Allocator::kDefault, size_t requested_size = 0); @@ -138,7 +139,7 @@ class Tensor { return holder_->place(); } - std::type_index type() const { + proto::VarType::Type type() const { PADDLE_ENFORCE_NOT_NULL( holder_, "Tensor not initialized yet when Tensor::type() is called."); return type_; @@ -161,7 +162,7 @@ class Tensor { private: /*! holds the memory block if allocated. */ std::shared_ptr holder_; - std::type_index type_; + proto::VarType::Type type_; /** * @brief points to elements dimensions. * diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 0c9c0d782f..ce3ad18b1f 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -24,9 +24,8 @@ template inline const T* Tensor::data() const { check_memory_size(); bool valid = - std::is_same::value || type_ == std::type_index(typeid(T)); - PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", - type_.name()); + std::is_same::value || type_ == DataTypeTrait::DataType; + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); @@ -38,9 +37,8 @@ template inline T* Tensor::data() { check_memory_size(); bool valid = - std::is_same::value || type_ == std::type_index(typeid(T)); - PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", - type_.name()); + std::is_same::value || type_ == DataTypeTrait::DataType; + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", type_); return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } @@ -60,7 +58,7 @@ inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) { static_assert(std::is_pod::value, "T must be POD"); return reinterpret_cast( - mutable_data(place, typeid(T), attr, requested_size)); + mutable_data(place, DataTypeTrait::DataType, attr, requested_size)); } inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index ca1e01c89f..85d15c5d3f 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -186,8 +186,8 @@ struct AnyDTypeVisitor { template inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, const DevCtx& ctx, framework::Tensor* out) { - VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( - predicate, tensor, ctx, out)); + VisitDataType(tensor.type(), AnyDTypeVisitor( + predicate, tensor, ctx, out)); } template @@ -379,7 +379,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, // int32_t size // void* protobuf message proto::VarType::TensorDesc desc; - desc.set_data_type(framework::ToDataType(tensor.type())); + desc.set_data_type(tensor.type()); auto dims = framework::vectorize(tensor.dims()); auto* pb_dims = desc.mutable_dims(); pb_dims->Resize(static_cast(dims.size()), 0); @@ -461,9 +461,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, tensor->Resize(framework::make_ddim(dims)); void* buf; auto ctx = platform::CPUDeviceContext(); - size_t size = - tensor->numel() * - framework::SizeOfType(framework::ToTypeIndex(desc.data_type())); + size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace())) { #ifdef PADDLE_WITH_CUDA Tensor cpu_tensor; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index be51e7fc1f..c751e85158 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -289,10 +289,10 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, auto type = fetch.type(); auto output = &(outputs->at(i)); output->name = fetchs_[idx]->Input("X")[0]; - if (type == typeid(float)) { + if (type == framework::proto::VarType::FP32) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; - } else if (type == typeid(int64_t)) { + } else if (type == framework::proto::VarType::INT64) { GetFetchOne(fetch, output); output->dtype = PaddleDType::INT64; } else { diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 4c5b412a2c..3d121e0460 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -266,10 +266,10 @@ bool NativePaddlePredictor::GetFetch(std::vector *outputs, auto type = fetch.type(); auto output = &(outputs->at(i)); output->name = fetchs_[idx]->Input("X")[0]; - if (type == typeid(float)) { + if (type == framework::DataTypeTrait::DataType) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; - } else if (type == typeid(int64_t)) { + } else if (type == framework::DataTypeTrait::DataType) { GetFetchOne(fetch, output); output->dtype = PaddleDType::INT64; } else { diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 014bdc6a37..191225493c 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -36,10 +36,10 @@ namespace paddle { PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { PaddleTensor pt; - if (t->type() == typeid(int64_t)) { + if (t->type() == framework::proto::VarType::INT64) { pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); pt.dtype = PaddleDType::INT64; - } else if (t->type() == typeid(float)) { + } else if (t->type() == framework::proto::VarType::INT32) { pt.data.Reset(t->data(), t->numel() * sizeof(float)); pt.dtype = PaddleDType::FLOAT32; } else { diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 6f7da445fc..1de59a5165 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -78,7 +78,7 @@ class AffineGridOp : public framework::OperatorWithKernel { library = framework::LibraryType::kCUDNN; } #endif - auto data_type = framework::ToDataType(ctx.Input("Theta")->type()); + auto data_type = ctx.Input("Theta")->type(); return framework::OpKernelType(data_type, ctx.GetPlace(), framework::DataLayout::kAnyLayout, library); } @@ -188,9 +188,9 @@ class AffineGridOpGrad : public framework::OperatorWithKernel { library_ = framework::LibraryType::kCUDNN; } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Theta")->type()), - ctx.GetPlace(), framework::DataLayout::kAnyLayout, library_); + return framework::OpKernelType(ctx.Input("Theta")->type(), + ctx.GetPlace(), + framework::DataLayout::kAnyLayout, library_); } }; diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 8174d37358..7fe9a0df74 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -28,6 +28,5 @@ REGISTER_OP_CPU_KERNEL( int32_t>, paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu index a147d77a9e..85e4f98173 100644 --- a/paddle/fluid/operators/arg_max_op.cu +++ b/paddle/fluid/operators/arg_max_op.cu @@ -25,7 +25,5 @@ REGISTER_OP_CUDA_KERNEL( int32_t>, paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 41f188029f..23b24735cd 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -28,6 +28,5 @@ REGISTER_OP_CPU_KERNEL( int32_t>, paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu index 4d02050850..47d7c8b122 100644 --- a/paddle/fluid/operators/arg_min_op.cu +++ b/paddle/fluid/operators/arg_min_op.cu @@ -25,7 +25,5 @@ REGISTER_OP_CUDA_KERNEL( int32_t>, paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 6257e04b01..d942391b86 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -58,7 +58,7 @@ struct ArrayToLoDFunctor : public boost::static_visitor { ArrayToLoDFunctorImpl functor; functor.dev_ctx_ = dev_ctx; functor.prev_functor_ = this; - framework::VisitDataType(framework::ToDataType(out->type()), functor); + framework::VisitDataType(out->type(), functor); } }; @@ -91,7 +91,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { PADDLE_ENFORCE(!x.empty(), "There's no element in the input array."); int rank = x[0].dims().size(); platform::Place place = x[0].place(); - std::type_index data_type = x[0].type(); + auto data_type = x[0].type(); int64_t batch_size = x[0].dims()[0]; framework::DDim ins_dims = rank > 1 ? framework::slice_ddim(x[0].dims(), 1, rank) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 75fc59125f..b6996be4b0 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -121,9 +121,8 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } void AttentionLSTMOpMaker::Make() { diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index f389eab605..0922b03b5f 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -103,9 +103,8 @@ class AverageAccumulatesOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("param")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index f66813989c..8b672e09b2 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -72,8 +72,7 @@ class BatchNormOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = ctx.Input("X")->type(); // By default, the type of the scale, bias, mean, // and var tensors should both be float. (For float or float16 input tensor) // or double (For double input tensor). @@ -81,17 +80,13 @@ class BatchNormOp : public framework::OperatorWithKernel { if (input_data_type == framework::proto::VarType::FP64) { bn_param_type = framework::proto::VarType::FP64; } - PADDLE_ENFORCE_EQ(bn_param_type, - framework::ToDataType(ctx.Input("Scale")->type()), + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Scale")->type(), "Scale input should be of float type"); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::ToDataType(ctx.Input("Bias")->type()), + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Bias")->type(), "Bias input should be of float type"); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::ToDataType(ctx.Input("Mean")->type()), + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Mean")->type(), "Mean input should be of float type"); - PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType( - ctx.Input("Variance")->type()), + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Variance")->type(), "Variance input should be of float type"); // TODO(pzelazko-intel): enable MKLDNN layout when it's ready @@ -413,9 +408,8 @@ class BatchNormGradOp : public framework::OperatorWithKernel { } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout, library); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), layout, library); } }; diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 0d32cae0e1..ae9765b761 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -145,7 +145,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase { LoDTensor* sentenceScores = ctx.Output("SentenceScores"); framework::VisitDataType( - framework::ToDataType(scores->at(0).type()), + scores->at(0).type(), BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores, beam_size, end_id)); } diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 62771d09f1..30f700f1d9 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -282,8 +282,7 @@ class BeamSearchOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = framework::OpKernelType( - framework::ToDataType( - ctx.Input("pre_ids")->type()), + ctx.Input("pre_ids")->type(), platform::CPUPlace()); return kt; } diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index 9258d7c7e8..f349c51d8a 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -47,9 +47,8 @@ class BprLossOp : public framework::OperatorWithKernel { // is determined by its input "X". framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; @@ -94,9 +93,8 @@ class BprLossGradientOp : public framework::OperatorWithKernel { // is determined by its input "X". framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 135254ce6b..dd28f82b65 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -48,13 +48,12 @@ class ConditionalOp : public framework::OperatorBase { if (!(ips.size() == 1UL && ips[0]->IsInitialized())) { PADDLE_THROW("should have one initialized input as condition"); } - if (!(framework::IsType(ips[0]->type()) && // NOLINT - ips[0]->numel() == 1)) { - PADDLE_THROW( - "condition input's data type should be bool, " - "numel should be 1, actual numel is %d", - ips[0]->numel()); - } + + PADDLE_ENFORCE(ips[0]->type() == framework::proto::VarType::BOOL && + ips[0]->numel() == 1, + "condition input's data type should be bool, " + "numel should be 1, actual numel is %d", + ips[0]->numel()); bool res = false; if (platform::is_gpu_place(ips[0]->place())) { #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 6c1b2f329a..66f8508f02 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -237,7 +237,7 @@ class WhileGradOp : public framework::OperatorBase { if (var->IsType()) { auto &inside_tensor = var->Get(); framework::AttributeMap attrs; - attrs["dtype"] = framework::ToDataType(inside_tensor.type()); + attrs["dtype"] = inside_tensor.type(); attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); attrs["value"] = 0.0f; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index d7b8766288..183850db18 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -95,10 +95,8 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( } #endif - auto input_data_type = - framework::ToDataType(ctx.Input("Input")->type()); - auto filter_data_type = - framework::ToDataType(ctx.Input("Filter")->type()); + auto input_data_type = ctx.Input("Input")->type(); + auto filter_data_type = ctx.Input("Filter")->type(); PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, "input and filter data type should be consistent"); @@ -382,9 +380,9 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout_, library_, customized_type_value); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_, + customized_type_value); } } // namespace operators diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 2fdfc40d19..86a140f152 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -104,9 +104,8 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout_, library_); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_); } void Conv2DTransposeOpMaker::Make() { @@ -335,9 +334,8 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout_, library_); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_); } } // namespace operators diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index c27befe114..81c9e9e543 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -118,9 +118,8 @@ class CRFDecodingOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Emission")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("Emission")->type(), + platform::CPUPlace()); } }; } // namespace operators diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index a2a871efa8..97d20681b8 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -51,9 +51,8 @@ class CropOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -174,9 +173,7 @@ class CropOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input(framework::GradVarName("Out")) - ->type()), + ctx.Input(framework::GradVarName("Out"))->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index a904dd9130..1968e54b00 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -57,9 +57,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel { // is determined by its input "X". framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -111,9 +110,8 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { // is determined by its input "X". framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc index d2b440d9d2..e7c472f8c0 100644 --- a/paddle/fluid/operators/ctc_align_op.cc +++ b/paddle/fluid/operators/ctc_align_op.cc @@ -36,9 +36,8 @@ class CTCAlignOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc index 0c0155a0a9..f2984d1af2 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cc +++ b/paddle/fluid/operators/detection/anchor_generator_op.cc @@ -53,8 +53,7 @@ class AnchorGeneratorOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc index c23b65fe4d..b7da1261a8 100644 --- a/paddle/fluid/operators/detection/bipartite_match_op.cc +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -45,9 +45,8 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("DistMat")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("DistMat")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc index 1012ba3652..cacd47ed4a 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cc +++ b/paddle/fluid/operators/detection/density_prior_box_op.cc @@ -66,8 +66,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.GetPlace()); + ctx.Input("Input")->type(), ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 709c2dfc4b..2c46803fd0 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -66,9 +66,8 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Anchors")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Anchors")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc index 54a4b87ec8..f70e6adb5b 100644 --- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc +++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc @@ -249,8 +249,7 @@ class MineHardExamplesOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("ClsLoss")->type()), - platform::CPUPlace()); + ctx.Input("ClsLoss")->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index f0f8851be0..2395b18148 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -65,8 +65,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input("Scores")->type()), + ctx.Input("Scores")->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc index b5cb6a724c..3e75c0394f 100644 --- a/paddle/fluid/operators/detection/prior_box_op.cc +++ b/paddle/fluid/operators/detection/prior_box_op.cc @@ -72,8 +72,7 @@ class PriorBoxOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index 42c720e701..3796854fe6 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -498,9 +498,8 @@ class ROIPerspectiveTransformOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -519,9 +518,8 @@ class ROIPerspectiveTransformGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 46fff9d338..dc6c3d5a66 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -78,8 +78,7 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input("Anchor")->type()), + ctx.Input("Anchor")->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc index 3670019392..c057c82ce0 100644 --- a/paddle/fluid/operators/detection/target_assign_op.cc +++ b/paddle/fluid/operators/detection/target_assign_op.cc @@ -57,9 +57,8 @@ class TargetAssignOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index d7f49a9590..e1d113f854 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -71,8 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input("DetectRes")->type()), + ctx.Input("DetectRes")->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 87bf7c6b15..41644d8cc1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -197,8 +197,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = framework::ToDataType( - ctx.Input(framework::GradVarName("Out"))->type()); + auto input_data_type = + ctx.Input(framework::GradVarName("Out"))->type(); #ifdef PADDLE_WITH_MKLDNN if (platform::CanMKLDNNBeUsed(ctx)) { diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index 43af83fd69..8aff911141 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -115,9 +115,8 @@ class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -175,9 +174,8 @@ class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index e80249fc87..1ed8a2ddd1 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -79,9 +79,8 @@ framework::OpKernelType FCOp::GetExpectedKernelType( library = framework::LibraryType::kMKLDNN; layout = framework::DataLayout::kMKLDNN; } - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout, library); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout, library); } void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { @@ -111,9 +110,8 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType( library = framework::LibraryType::kMKLDNN; layout = framework::DataLayout::kMKLDNN; } - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout, library); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout, library); } void FCOpMaker::Make() { diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 252f313440..38cb33e790 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -59,9 +59,9 @@ class FillConstantOp : public framework::OperatorBase { if (force_cpu) { auto cpu = platform::CPUPlace(); - tensor->mutable_data(cpu, framework::ToTypeIndex(data_type)); + tensor->mutable_data(cpu, data_type); } else { - tensor->mutable_data(dev_place, framework::ToTypeIndex(data_type)); + tensor->mutable_data(dev_place, data_type); } platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc index adc7cb1f9e..a885b301e7 100644 --- a/paddle/fluid/operators/fill_op.cc +++ b/paddle/fluid/operators/fill_op.cc @@ -55,7 +55,7 @@ class FillOp : public framework::OperatorBase { static_cast(Attr("dtype")); platform::CPUPlace cpu; auto force_cpu = Attr("force_cpu"); - out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype)); + out.mutable_data(force_cpu ? cpu : place, dtype); framework::LoDTensor tensor; @@ -64,7 +64,7 @@ class FillOp : public framework::OperatorBase { } else { // Always make tensor in CPU memory. tensor.Resize(out.dims()); - tensor.mutable_data(cpu, framework::ToTypeIndex(dtype)); + tensor.mutable_data(cpu, dtype); } framework::VisitDataType( diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc index 3771aac0df..0fbf564b7e 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc @@ -135,9 +135,8 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx.Input("X")->type(), ctx.Input("Y")->type(), "The element's type of input should be the same."); - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -324,9 +323,8 @@ class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type_index = ctx.Input("Y")->type(); - auto input_data_type = framework::ToDataType(input_data_type_index); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Y")->type(), + ctx.GetPlace()); } }; } // namespace operators diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 1eb6523a2d..f1466f17fe 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -115,8 +115,7 @@ void FusedEmbeddingFCLSTMOp::InferShape( framework::OpKernelType FusedEmbeddingFCLSTMOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { return framework::OpKernelType( - framework::ToDataType( - ctx.Input("Embeddings")->type()), + ctx.Input("Embeddings")->type(), ctx.device_context()); } diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 25b7ae7c28..4ce67e16dd 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -93,9 +93,8 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FusionGRUOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } void FusionGRUOpMaker::Make() { diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 8021a896ce..c4e752e3f0 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -117,9 +117,8 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } void FusionLSTMOpMaker::Make() { diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc index 40bba09f3e..b05329cfd0 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc @@ -61,9 +61,8 @@ void FusionSeqConvEltAddReluOp::InferShape( framework::OpKernelType FusionSeqConvEltAddReluOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } void FusionSeqConvEltAddReluOpMaker::Make() { diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 17ed9771d0..aaef46de0d 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -67,9 +67,8 @@ void FusionSeqExpandConcatFCOp::InferShape( framework::OpKernelType FusionSeqExpandConcatFCOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.MultiInput("X")[0]->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.MultiInput("X")[0]->type(), + ctx.device_context()); } void FusionSeqExpandConcatFCOpMaker::Make() { diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 95aa9b573c..0a8c0814a7 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -42,9 +42,8 @@ class GatherOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -60,9 +59,8 @@ class GatherGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index e76eb6893b..14a2524bd8 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -63,9 +63,9 @@ class GridSampleOp : public framework::OperatorWithKernel { library_ = framework::LibraryType::kCUDNN; } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - framework::DataLayout::kAnyLayout, library_); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), + framework::DataLayout::kAnyLayout, library_); } }; @@ -159,9 +159,9 @@ class GridSampleOpGrad : public framework::OperatorWithKernel { library_ = framework::LibraryType::kCUDNN; } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - framework::DataLayout::kAnyLayout, library_); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), + framework::DataLayout::kAnyLayout, library_); } }; diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 6322659b67..4fa15058f8 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -141,8 +141,7 @@ class GroupNormGradOp : public framework::OperatorWithKernel { if (t == nullptr) { PADDLE_THROW("can't find Y@GRAD"); } - return framework::OpKernelType(framework::ToDataType(t->type()), - ctx.GetPlace()); + return framework::OpKernelType(t->type(), ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 0dbcc442df..a807117115 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -76,9 +76,8 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -163,9 +162,8 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 4d25822259..93dd3f794f 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -55,8 +55,8 @@ class InterpolateOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -124,8 +124,8 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc index 29b73951bb..ba50bdf34b 100644 --- a/paddle/fluid/operators/is_empty_op.cc +++ b/paddle/fluid/operators/is_empty_op.cc @@ -35,8 +35,7 @@ class IsEmptyOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + ctx.Input("X")->type(), platform::CPUPlace()); return kt; } }; diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 7b42efd623..1312eecfa4 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -40,10 +40,9 @@ class OverflowOp : public framework::OperatorWithKernel { int dtype = -1; auto *x_var = ctx.InputVar("X"); if (x_var->IsType()) { - dtype = framework::ToDataType(x_var->Get().type()); + dtype = x_var->Get().type(); } else if (x_var->IsType()) { - dtype = framework::ToDataType( - x_var->Get().value().type()); + dtype = x_var->Get().value().type(); } else { PADDLE_THROW("Cannot find the input data type by all input data"); } diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index 14ce1da2e9..f83fe355b8 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -153,8 +153,7 @@ class LayerNormGradOp : public framework::OperatorWithKernel { if (t == nullptr) { PADDLE_THROW("can't find Y@GRAD"); } - return framework::OpKernelType(framework::ToDataType(t->type()), - ctx.GetPlace()); + return framework::OpKernelType(t->type(), ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index ea1ca7f59d..998b7f09c3 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -184,9 +184,8 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { // is determined by its input "Emission". framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Emission")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("Emission")->type(), + platform::CPUPlace()); } }; @@ -244,9 +243,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input(framework::GradVarName("LogLikelihood")) - ->type()), + ctx.Input(framework::GradVarName("LogLikelihood"))->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index 9d1423915a..e28d199eeb 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -69,7 +69,7 @@ class LoadCombineOp : public framework::OperatorBase { // Get data from fin to tensor DeserializeFromStream(*buffer, tensor, dev_ctx); - auto in_dtype = framework::ToDataType(tensor->type()); + auto in_dtype = tensor->type(); auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index df1edc5c2e..06773d1d0e 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -65,7 +65,7 @@ class LoadOp : public framework::OperatorBase { DeserializeFromStream(fin, tensor, dev_ctx); auto load_as_fp16 = Attr("load_as_fp16"); - auto in_dtype = framework::ToDataType(tensor->type()); + auto in_dtype = tensor->type(); auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; if (in_dtype != out_dtype) { diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index 0d4e84e850..7c8fe5fbd7 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -39,9 +39,8 @@ class LoDResetOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -144,9 +143,8 @@ class LoDResetGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 145d2db118..9b91cf5260 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -72,7 +72,7 @@ struct LoDTensorToArrayFunctor : public boost::static_visitor { LoDTensorToArrayFunctorImpl func; func.prev_functor_ = this; func.dev_ctx_ = dev_ctx; - framework::VisitDataType(framework::ToDataType(input_.type()), func); + framework::VisitDataType(input_.type(), func); } }; diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc index 1b55527fd3..4840a7ac1e 100644 --- a/paddle/fluid/operators/lookup_sparse_table_op.cc +++ b/paddle/fluid/operators/lookup_sparse_table_op.cc @@ -63,8 +63,7 @@ class LookupSparseTableOp : public framework::OperatorBase { out_shape[0] = ids_t.numel(); out_t->Resize(out_shape); out_t->mutable_data(cpu, w_t->value().type()); - PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()), - framework::proto::VarType::FP32, + PADDLE_ENFORCE_EQ(w_t->value().type(), framework::proto::VarType::FP32, "The sparse table only support FP32"); w_t->Get(ids_t, out_t, true, is_test); out_t->set_lod(ids_t.lod()); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index a3bb2be5c7..06ac31b5f1 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -145,9 +145,8 @@ framework::OpKernelType GetExpectedLRNKernel( } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); + return framework::OpKernelType(ctx.Input("X")->type(), ctx.GetPlace(), + layout_, library_); } } // namespace diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 3225bf9bb6..4a199d681f 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -96,8 +96,7 @@ class LSTMOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; @@ -261,8 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index e398b51480..7a62bc9f82 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -113,8 +113,7 @@ class LSTMPOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; @@ -312,8 +311,7 @@ class LSTMPGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 854c8653ff..e1491a8156 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -77,16 +77,14 @@ template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, float value) { - framework::VisitDataType(framework::ToDataType(tensor->type()), - TensorSetConstantCPU(tensor, value)); + framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); } template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, float value) { - framework::VisitDataType(framework::ToDataType(tensor->type()), - TensorSetConstantCPU(tensor, value)); + framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); } struct TensorSetConstantWithPlace : public boost::static_visitor { diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 9372d63f0b..4645b3ae6e 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -65,7 +65,7 @@ template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, float value) { - framework::VisitDataType(framework::ToDataType(tensor->type()), + framework::VisitDataType(tensor->type(), TensorSetConstantGPU(context, tensor, value)); } diff --git a/paddle/fluid/operators/mean_iou_op.cc b/paddle/fluid/operators/mean_iou_op.cc index a60f245f53..bb290046f3 100644 --- a/paddle/fluid/operators/mean_iou_op.cc +++ b/paddle/fluid/operators/mean_iou_op.cc @@ -44,9 +44,8 @@ class MeanIoUOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Predictions")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Predictions")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 820636defa..35b6d7b5e3 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -61,9 +61,7 @@ class MeanGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); - + auto input_data_type = ctx.Input("X")->type(); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 2dc1467b0d..da7fa1b81d 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -63,9 +63,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { platform::Place place = dev_place; int64_t batch_size = in_true.dims()[0] + in_false.dims()[0]; - - std::type_index data_type = - in_true.IsInitialized() ? in_true.type() : in_false.type(); + auto data_type = in_true.IsInitialized() ? in_true.type() : in_false.type(); int rank; framework::DDim in_dims; if (in_true.IsInitialized()) { diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 95aa76bc69..7db6dff297 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -55,9 +55,8 @@ class AccuracyOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Out")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Out")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 335d4fded4..5e33dd9606 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -51,9 +51,8 @@ class AucOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Predict")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("Predict")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/metrics/precision_recall_op.cc b/paddle/fluid/operators/metrics/precision_recall_op.cc index 0d733c47dd..1a67b13491 100644 --- a/paddle/fluid/operators/metrics/precision_recall_op.cc +++ b/paddle/fluid/operators/metrics/precision_recall_op.cc @@ -82,9 +82,8 @@ class PrecisionRecallOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("MaxProbs")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("MaxProbs")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 18ad46cb5e..1801f2915e 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -53,9 +53,8 @@ class MultiplexOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.MultiInput("X")[0]->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.MultiInput("X")[0]->type(), + ctx.device_context()); } }; @@ -123,9 +122,8 @@ class MultiplexGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.MultiInput("X")[0]->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.MultiInput("X")[0]->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 9f97f7821d..06c35c789f 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -69,9 +69,8 @@ class NCEOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("Input")->type(), + platform::CPUPlace()); } }; @@ -214,9 +213,8 @@ class NCEOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("Input")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index 9039d02b67..dd365629fc 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -70,9 +70,8 @@ class AdadeltaOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc index e8d5a9e2c8..bd1bb98e63 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -59,9 +59,8 @@ class AdagradOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 5710cda39a..5eae503461 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -75,8 +75,7 @@ class AdamOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); + auto input_data_type = ctx.Input("Param")->type(); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index 4b244a76dc..aef1fc972c 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc @@ -76,9 +76,8 @@ class AdamaxOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc index 80278441c0..07899278f9 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc @@ -64,9 +64,8 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc index 1c9e91d9b6..c1a4f5790b 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cc +++ b/paddle/fluid/operators/optimizers/ftrl_op.cc @@ -66,8 +66,7 @@ class FTRLOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); + auto input_data_type = ctx.Input("Param")->type(); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc index 7b07b3b707..9dd9b8afbd 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc @@ -58,9 +58,8 @@ class ProximalAdagradOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc index dcef4f7be2..fccfc2b458 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc @@ -46,9 +46,8 @@ class ProximalGDOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("Param")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index a9da21f479..6ef2dacb38 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -511,8 +511,8 @@ class Pad2dOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -612,8 +612,8 @@ class Pad2dOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 685ebc3937..3f827c26fd 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -47,9 +47,8 @@ class PadConstantLikeOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Y")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Y")->type(), + ctx.device_context()); } }; @@ -171,9 +170,8 @@ class PadConstantLikeOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Y")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Y")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 52b607df74..6259954849 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -99,9 +99,8 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); + return framework::OpKernelType(ctx.Input("X")->type(), ctx.GetPlace(), + layout_, library_); } void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const { @@ -130,7 +129,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( } #endif - auto input_data_type = framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = ctx.Input("X")->type(); if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, "float16 can only be used when CUDNN is used"); diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index 873706593e..179ee96e01 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -71,9 +71,8 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -92,9 +91,8 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index 4d865b7f17..99256e408d 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -87,9 +87,8 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Score")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Score")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 64d94ab604..62c55c4f55 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -56,9 +56,8 @@ class PReluOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -113,9 +112,8 @@ class PReluGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index e7f1caf4d3..6a5bf17060 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -172,7 +172,7 @@ class TensorPrintOp : public framework::OperatorBase { formater.name = printed_var_name; } if (Attr("print_tensor_type")) { - formater.dtype = printed_tensor.type(); + formater.dtype = framework::ToTypeIndex(printed_tensor.type()); } if (Attr("print_tensor_shape")) { auto &dims = printed_tensor.dims(); diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index 123fa44fa3..cd3bd32adb 100644 --- a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -22,9 +22,8 @@ class RandomCropOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc index e17c2ffd39..f771cebd0c 100644 --- a/paddle/fluid/operators/reader/create_batch_reader_op.cc +++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc @@ -99,10 +99,10 @@ void BatchReader::ReadNextImpl(std::vector* out) { out->reserve(out_num); for (size_t j = 0; j < out_num; ++j) { // Merge shape and check date type - std::type_index batch_type = buffer_[0][j].type(); + auto batch_type = buffer_[0][j].type(); framework::DDim batch_shape = buffer_[0][j].dims(); for (size_t i = 1; i < buffer_.size(); ++i) { - std::type_index ins_type = buffer_[i][j].type(); + auto ins_type = buffer_[i][j].type(); framework::DDim ins_shape = buffer_[i][j].dims(); PADDLE_ENFORCE_EQ(batch_type, ins_type); PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()), diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 162bfcbb08..a1e02a3fd0 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -414,7 +414,7 @@ class RecurrentGradOp : public RecurrentBase { auto &inside_tensor = cur_scope.FindVar(inside_grad_name) ->Get(); framework::AttributeMap attrs; - attrs["dtype"] = framework::ToDataType(inside_tensor.type()); + attrs["dtype"] = inside_tensor.type(); attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); attrs["value"] = 0.0f; diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 500d86fec3..289d848ea1 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -108,9 +108,8 @@ class ReshapeOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -189,9 +188,8 @@ class ReshapeGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -322,9 +320,7 @@ class Reshape2GradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input(framework::GradVarName("Out")) - ->type()), + ctx.Input(framework::GradVarName("Out"))->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc index 0fb7776fd9..834dd1eabd 100644 --- a/paddle/fluid/operators/rnn_memory_helper_op.cc +++ b/paddle/fluid/operators/rnn_memory_helper_op.cc @@ -99,7 +99,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { auto &in_var_tensor = in_var->Get(); framework::AttributeMap attrs; - attrs["dtype"] = framework::ToDataType(in_var_tensor.type()); + attrs["dtype"] = in_var_tensor.type(); attrs["shape"] = framework::vectorize2int(in_var_tensor.dims()); attrs["value"] = 0.0f; diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 79f189222e..6857b5ed9d 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -62,9 +62,8 @@ class ROIAlignOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -83,9 +82,8 @@ class ROIAlignGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 3f6b2e46c7..e46d92d6fc 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -69,9 +69,8 @@ class ROIPoolOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -90,9 +89,8 @@ class ROIPoolGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index 5b05f757c0..a0b9fa305d 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -75,7 +75,7 @@ class SaveCombineOp : public framework::OperatorBase { // Serialize tensors one by one // Check types to see if a fp16 transformation is required - auto in_dtype = framework::ToDataType(tensor.type()); + auto in_dtype = tensor.type(); auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index e79cffcf49..e1c9fd8ff1 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -85,7 +85,7 @@ class SaveOp : public framework::OperatorBase { filename); auto save_as_fp16 = Attr("save_as_fp16"); - auto in_dtype = framework::ToDataType(tensor.type()); + auto in_dtype = tensor.type(); auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; if (in_dtype != out_dtype) { diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index c32d2603cf..ad418d51bc 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -51,9 +51,8 @@ class ScatterOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -70,9 +69,8 @@ class ScatterGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index 44b09bf7c2..1754221e77 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -114,9 +114,8 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index c49d1ccb18..8267c04f9f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -112,9 +112,8 @@ class SequenceScatterOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; @@ -131,9 +130,8 @@ class SequenceScatterGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc index 6f84023e26..35f49f78ce 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc @@ -50,9 +50,8 @@ class SequenceSliceOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -71,9 +70,8 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index 644a5bebc1..027073e5d7 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -51,7 +51,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel { } std::string data_format = ctx.Attr("data_format"); return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + ctx.Input("X")->type(), ctx.GetPlace(), framework::StringToDataLayout(data_format), library_); } }; @@ -146,7 +146,7 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { } std::string data_format = ctx.Attr("data_format"); return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + ctx.Input("X")->type(), ctx.GetPlace(), framework::StringToDataLayout(data_format), library_); } }; diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc index 9612f82b6d..21871d7656 100644 --- a/paddle/fluid/operators/similarity_focus_op.cc +++ b/paddle/fluid/operators/similarity_focus_op.cc @@ -70,9 +70,8 @@ class SimilarityFocusOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index e55462d6cf..789e61b2d3 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -59,9 +59,8 @@ class SliceOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 091ce4e6e8..bc889a5a04 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -62,8 +62,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { } #endif - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = ctx.Input("X")->type(); if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "float16 can only be used on GPU place"); @@ -169,8 +168,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { layout_ = framework::DataLayout::kMKLDNN; } #endif - auto input_data_type = framework::ToDataType( - ctx.Input(framework::GradVarName("Out"))->type()); + auto input_data_type = + ctx.Input(framework::GradVarName("Out"))->type(); if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "float16 can only be used on GPU place"); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 2900221485..0397c7791e 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -131,9 +131,8 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Logits")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Logits")->type(), + ctx.device_context()); } }; @@ -173,8 +172,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input(framework::GradVarName("Loss"))->type()), + ctx.Input(framework::GradVarName("Loss"))->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 7df14158f3..4f717a4355 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -91,9 +91,9 @@ class SumOp : public framework::OperatorWithKernel { continue; } if (dtype == -1) { - dtype = framework::ToDataType(tensor->type()); + dtype = tensor->type(); } else { - PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(tensor->type())); + PADDLE_ENFORCE_EQ(dtype, tensor->type()); } } PADDLE_ENFORCE_NE(dtype, -1, @@ -106,8 +106,8 @@ class SumOp : public framework::OperatorWithKernel { for (auto& var : x_vars) { auto& value = var->Get().value(); if (value.IsInitialized()) { - return framework::OpKernelType(framework::ToDataType(value.type()), - ctx.device_context(), layout, library); + return framework::OpKernelType(value.type(), ctx.device_context(), + layout, library); } } // if input sparse vars are not initialized, use an default kernel type. @@ -118,9 +118,8 @@ class SumOp : public framework::OperatorWithKernel { auto& array = x_var->Get(); for (auto& each : array) { if (each.numel() != 0) { - return framework::OpKernelType(framework::ToDataType(each.type()), - ctx.device_context(), layout, - library); + return framework::OpKernelType(each.type(), ctx.device_context(), + layout, library); } } } diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 6eef4c98c4..5b2aad55a4 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -76,10 +76,7 @@ class TensorRTEngineOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input0 = ctx.Inputs("Xs").front(); framework::OpKernelType kt = framework::OpKernelType( - framework::ToDataType(ctx.scope() - .FindVar(input0) - ->GetMutable() - ->type()), + ctx.scope().FindVar(input0)->GetMutable()->type(), ctx.GetPlace()); return kt; } diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index bbd71db606..bc1f59bc1a 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -144,9 +144,8 @@ class Transpose2Op : public TransposeOp { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -194,9 +193,7 @@ class Transpose2OpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input(framework::GradVarName("Out")) - ->type()), + ctx.Input(framework::GradVarName("Out"))->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 6d2ccb38f6..11e505d6df 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -74,9 +74,8 @@ class UnpoolOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } public: @@ -113,9 +112,8 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } public: diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index 6a257cebf5..e2ae7caae1 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -56,9 +56,8 @@ class WarpCTCOp : public framework::OperatorWithKernel { } #endif framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Logits")->type()), - ctx.device_context(), layout_, library_); + return framework::OpKernelType(ctx.Input("Logits")->type(), + ctx.device_context(), layout_, library_); } }; @@ -136,9 +135,8 @@ class WarpCTCGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Logits")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Logits")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index e7597f7324..60508f7ab8 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -64,9 +64,8 @@ class Yolov3LossOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; @@ -180,9 +179,8 @@ class Yolov3LossOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 7c539d25f6..cbb090adef 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -20,6 +20,7 @@ #include // NOLINT #include #include +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/enforce.h" @@ -28,14 +29,14 @@ namespace paddle { namespace platform { -inline ncclDataType_t ToNCCLDataType(std::type_index type) { - if (type == typeid(float)) { // NOLINT +inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { + if (type == framework::proto::VarType::FP32) { return ncclFloat; - } else if (type == typeid(double)) { // NOLINT + } else if (type == framework::proto::VarType::FP64) { return ncclDouble; - } else if (type == typeid(int)) { // NOLINT + } else if (type == framework::proto::VarType::INT32) { return ncclInt; - } else if (type == typeid(int64_t)) { // NOLINT + } else if (type == framework::proto::VarType::INT64) { return ncclInt64; } else { PADDLE_THROW("Not supported"); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index dca0c01ab2..314ab98625 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -206,7 +206,7 @@ PYBIND11_MODULE(core, m) { .def("_get_float_element", TensorGetElement) .def("_set_double_element", TensorSetElement) .def("_get_double_element", TensorGetElement) - .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); }); + .def("_dtype", [](Tensor &self) { return self.type(); }); py::class_(m, "LoDTensor", R"DOC( LoDTensor is a Tensor with optional LoD information. diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index f67f40f19f..5e91f5b301 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -43,7 +43,7 @@ template struct CastToPyBufferImpl { using CUR_TYPE = typename std::tuple_element>::type; pybind11::buffer_info operator()(const framework::Tensor &tensor) { - if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) { + if (framework::DataTypeTrait::DataType == tensor.type()) { auto dim_vec = framework::vectorize(tensor.dims()); std::vector dims_outside; std::vector strides; From 06f8aa5b97be564b878848acd216069e23081300 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 03:08:21 +0000 Subject: [PATCH 246/719] remove while_op support temporarily test=develop --- paddle/fluid/framework/executor.cc | 3 +- .../fluid/operators/controlflow/while_op.cc | 46 +------------------ 2 files changed, 3 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 767bbb524f..7eab876015 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -419,7 +419,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; - if (max_memory_size >= 0) { + // skip while_op and while_grad_op temporarily + if (max_memory_size >= 0 && !keep_kids) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 06920a47ee..5ab0918c48 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -365,51 +365,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ - std::unordered_set bwd_skip_vars; - if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set fwd_skip_vars; - for (auto *op_desc : grad_block->AllOps()) { - auto skippable = [&](const std::string &name) { - return !grad_block->HasVar(name) && - (fwd_block->HasVarRecursive(name) || - parent_block->HasVarRecursive(name)); - }; - for (auto &in_arg_name : op_desc->InputArgumentNames()) { - if (skippable(in_arg_name)) { - fwd_skip_vars.insert(in_arg_name); - } - } - - for (auto &out_arg_name : op_desc->OutputArgumentNames()) { - if (skippable(out_arg_name)) { - fwd_skip_vars.insert(out_arg_name); - } - } - } - - if (!fwd_skip_vars.empty()) { - // FIXME(zjl): ugly const_cast here, maybe we should find a better way - // to modify forward while_op - auto &fwd_while_op = const_cast(ForwardOp()); - fwd_while_op.SetAttr(kSkipEagerDeletionVars, - std::vector(fwd_skip_vars.begin(), - fwd_skip_vars.end())); - } - - // Find backward skip vars - auto fwd_input = Input(kX); - for (size_t i = 0; i < igs.size(); ++i) { - if (igs[i] == framework::kEmptyVarName) { - continue; - } - bwd_skip_vars.insert(igs[i]); - bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); - } - } - while_grad->SetAttr( - kSkipEagerDeletionVars, - std::vector(bwd_skip_vars.begin(), bwd_skip_vars.end())); + while_grad->SetAttr(kSkipEagerDeletionVars, std::vector()); return std::unique_ptr(while_grad); } From e240ba291853856d29790ecd3b6c5493c5ab2cd3 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 03:16:34 +0000 Subject: [PATCH 247/719] implement backward test=develop --- paddle/fluid/framework/op_desc.cc | 2 + paddle/fluid/framework/op_desc.h | 2 + paddle/fluid/framework/operator.cc | 5 + paddle/fluid/framework/shape_inference.h | 5 + paddle/fluid/operators/py_func_op.cc | 127 ++++++++++++++++++++--- paddle/fluid/pybind/protobuf.cc | 2 +- python/paddle/fluid/layers/nn.py | 39 ++++--- 7 files changed, 154 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index e8ecd90502..f8a9340df5 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -34,6 +34,8 @@ class CompileTimeInferShapeContext : public InferShapeContext { public: CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block); + InferShapeOpPtr GetOp() const override { return &op_; } + bool HasInput(const std::string &name) const override; bool HasOutput(const std::string &name) const override; diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 30c8a26c3d..3b3f50bfa7 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -121,6 +121,8 @@ class OpDesc { BlockDesc *Block() { return this->block_; } + const BlockDesc *Block() const { return this->block_; } + private: template static std::vector MapKeys(const MapType &map) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c6f3254e9f..188ab120be 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -481,6 +481,8 @@ class RuntimeInferShapeContext : public InferShapeContext { RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) : op_(op), scope_(scope) {} + InferShapeOpPtr GetOp() const override { return &op_; } + bool HasInput(const std::string& name) const override { // has only one input const auto& ins = op_.Inputs(); @@ -879,6 +881,9 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { + PADDLE_ENFORCE(t->IsInitialized(), + "Input %s(%s) does not exist in Operator %s", + input.first, ipt_name, DebugString()); int tmp = static_cast(ToDataType(t->type())); PADDLE_ENFORCE( tmp == data_type || data_type == -1, diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index d73cca121e..2f95ab353e 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -25,7 +25,10 @@ limitations under the License. */ namespace paddle { namespace framework { +class OperatorBase; + using InferShapeVarPtr = boost::variant; +using InferShapeOpPtr = boost::variant; class InferShapeContext { public: @@ -38,6 +41,8 @@ class InferShapeContext { std::vector GetOutputsVarType( const std::string &name) const; + virtual InferShapeOpPtr GetOp() const = 0; + virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0; diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 86914f3060..46a6125f97 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -24,34 +24,34 @@ namespace operators { namespace py = pybind11; -static std::mutex g_py_callables_mtx; static std::vector g_py_callables; size_t AppendPythonCallableObjectAndReturnId(py::object py_obj) { - std::lock_guard guard(g_py_callables_mtx); g_py_callables.emplace_back(py_obj); return g_py_callables.size() - 1; } static py::object *GetPythonCallableObject(size_t i) { - std::lock_guard guard(g_py_callables_mtx); PADDLE_ENFORCE_LT(i, g_py_callables.size()); return &g_py_callables[i]; } -void DoCallPythonFunc(py::object *callable, const std::string &func_token, - const std::vector &ins, - std::vector *out) { +void CallPythonFunc(py::object *callable, const std::string &func_token, + const std::vector &ins, + std::vector *out) { py::gil_scoped_acquire guard{}; py::tuple in_args(ins.size()); for (size_t i = 0; i < ins.size(); ++i) { - in_args[i] = py::cast(ins[i]); + in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr); } auto ret = (*callable)(func_token, *in_args); auto ret_tuple = py::cast(ret); PADDLE_ENFORCE_EQ(py::len(ret_tuple), out->size(), "Output number not match"); for (size_t i = 0; i < out->size(); ++i) { + if ((*out)[i] == nullptr) { + continue; + } try { auto *out_tensor = py::cast(ret_tuple[i]); PADDLE_ENFORCE_NOT_NULL(out_tensor, @@ -67,8 +67,43 @@ void DoCallPythonFunc(py::object *callable, const std::string &func_token, class PyFuncOpShapeInference : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(!ctx->IsRuntime(), + "Infer shape cannot be called in runtime."); PADDLE_ENFORCE(ctx->HasInputs("X"), "Input(X) must exist"); PADDLE_ENFORCE(ctx->HasOutputs("Out"), "Output(Out) must exist"); + + auto *op = boost::get(ctx->GetOp()); + auto *block = op->Block(); + // No need to infer shape in forward part + if (block->ForwardBlockID() < 0) { + return; + } + + PADDLE_ENFORCE(!ctx->Attrs().Get("token").empty(), + "Function token cannot be empty"); + + const std::string kGradVarSuffix = framework::kGradVarSuffix; + auto out_vars = ctx->GetOutputVarPtrs("Out"); + for (auto &out_var : out_vars) { + auto *out_var_desc = boost::get(out_var); + auto out_name = out_var_desc->Name(); + if (out_name == framework::kEmptyVarName || + out_name.size() < kGradVarSuffix.size()) { + continue; + } + + size_t len = out_name.size() - kGradVarSuffix.size(); + if (out_name.substr(len) == kGradVarSuffix) { + auto fwd_var_name = out_name.substr(0, len); + auto *in_var_desc = block->FindVarRecursive(fwd_var_name); + PADDLE_ENFORCE_NOT_NULL(in_var_desc, "Forward variable %s not found", + fwd_var_name); + out_var_desc->SetShape(in_var_desc->GetShape()); + out_var_desc->SetDataType(in_var_desc->GetDataType()); + out_var_desc->SetLoDLevel(in_var_desc->GetLoDLevel()); + out_var_desc->SetType(in_var_desc->GetType()); + } + } } }; @@ -77,12 +112,68 @@ class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "Inputs of py_func op.").AsDuplicable(); AddOutput("Out", "Outputs of py_func op").AsDuplicable(); - AddAttr("token", "function token"); - AddAttr("handle_idx", "handle index").SetDefault(0); + AddAttr("handle_idx", "Index of the registered py_func handle") + .SetDefault(0); + AddAttr("token", "Token of function token to be called") + .SetDefault(""); + AddAttr("backward_token", + "Token of backward function to be called") + .SetDefault(""); AddComment(R"DOC("PyFunc Op")DOC"); } }; +class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase { + public: + using framework::GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const override { + auto &fwd_attrs = Attrs(); + if (fwd_attrs.at("backward_token").empty()) { + return {}; + } + + std::unique_ptr grad_op(new framework::OpDesc()); + grad_op->SetType("py_func"); + + framework::AttributeMap bwd_attrs; + bwd_attrs["token"] = fwd_attrs.at("backward_token"); + bwd_attrs["backward_token"] = std::string(""); + grad_op->SetAttrMap(bwd_attrs); + + auto bwd_in = Input("X"); + auto fwd_out = Output("Out"); + auto fwd_out_grad = OutputGrad("Out"); + bwd_in.insert(bwd_in.end(), fwd_out.begin(), fwd_out.end()); + bwd_in.insert(bwd_in.end(), fwd_out_grad.begin(), fwd_out_grad.end()); + + auto bwd_out = InputGrad("X", false); + + if (VLOG_IS_ON(10)) { + std::string in_str = "PyFunc Grad Input: "; + for (auto &in : bwd_in) { + in_str += in; + in_str += " "; + } + VLOG(10) << in_str; + + std::string out_str = "PyFunc Grad Output: "; + for (auto &out : bwd_out) { + out_str += out; + out += " "; + } + VLOG(10) << out_str; + } + + grad_op->SetInput("X", bwd_in); + grad_op->SetOutput("Out", InputGrad("X", false)); + + std::vector> ret(1); + ret[0] = std::move(grad_op); + return ret; + } +}; + class PyFuncOp : public framework::OperatorBase { public: using framework::OperatorBase::OperatorBase; @@ -95,8 +186,14 @@ class PyFuncOp : public framework::OperatorBase { std::vector inputs(in_arg_names.size()); for (size_t i = 0; i < in_arg_names.size(); ++i) { - auto &in_tensor = - scope.FindVar(in_arg_names[i])->Get(); + auto in_var = scope.FindVar(in_arg_names[i]); + if (in_var == nullptr) { + continue; + } + auto &in_tensor = in_var->Get(); + if (!in_tensor.IsInitialized()) { + continue; + } if (platform::is_gpu_place(in_tensor.place())) { framework::TensorCopySync(in_tensor, platform::CPUPlace(), &inputs[i]); } else { @@ -107,8 +204,9 @@ class PyFuncOp : public framework::OperatorBase { std::vector outputs(out_arg_names.size()); for (size_t i = 0; i < out_arg_names.size(); ++i) { + auto *out_var = scope.FindVar(out_arg_names[i]); auto *out_tensor = - scope.FindVar(out_arg_names[i])->GetMutable(); + out_var ? out_var->GetMutable() : nullptr; outputs[i] = out_tensor; } @@ -117,7 +215,7 @@ class PyFuncOp : public framework::OperatorBase { auto *py_callable = GetPythonCallableObject(handle_idx); VLOG(10) << "Call py_func_op with token " << token << ", and handle_idx " << handle_idx; - DoCallPythonFunc(py_callable, token, inputs, &outputs); + CallPythonFunc(py_callable, token, inputs, &outputs); } }; @@ -127,5 +225,4 @@ class PyFuncOp : public framework::OperatorBase { namespace ops = paddle::operators; REGISTER_OPERATOR(py_func, ops::PyFuncOp, ops::PyFuncOpMaker, - ops::PyFuncOpShapeInference, - paddle::framework::EmptyGradOpMaker); + ops::PyFuncOpShapeInference, ops::PyFuncOpGradDescMaker); diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index ac406b27b5..4b218fb3a2 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -328,7 +328,7 @@ void BindOpDesc(pybind11::module *m) { .def("infer_var_type", &pd::OpDesc::InferVarType) .def("set_is_target", &pd::OpDesc::SetIsTarget) .def("serialize_to_string", SerializeMessage) - .def("block", &pd::OpDesc::Block, + .def("block", [](pd::OpDesc &self) { return self.Block(); }, pybind11::return_value_policy::reference); } diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 92cd53a6c3..66c98c935d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9096,12 +9096,9 @@ def py_func(func, x, out, backward_func=None): _main_program_to_register = dict() @classmethod - def get_instance(cls, prog=None): - if prog is None: - prog = fluid.default_main_program() - + def get_instance(cls, prog): if not isinstance(prog, Program): - raise ValueError("prog must be None or type of Program") + raise TypeError("prog must be type of Program") ret = cls._main_program_to_register.get(prog, None) if ret is None: @@ -9155,6 +9152,10 @@ def py_func(func, x, out, backward_func=None): ret = [] for i in six.moves.range(len(ret0)): + if ret0[i] is None: + ret.append(None) + continue + if isinstance(ret0[i], core.LoDTensor): ret.append(ret0[i]) continue @@ -9175,20 +9176,34 @@ def py_func(func, x, out, backward_func=None): x = [x] if isinstance(out, Variable): - out = [out] + out_list = [out] + else: + out_list = out + + if func is None or not hasattr(func, '__call__'): + raise TypeError('Input func must be a function') - for each_out in out: + if backward_func is not None and not hasattr(backward_func, '__call__'): + raise TypeError('Input backward_func must be a function') + + for each_out in out_list: if len(each_out.shape) == 0: raise ValueError( - 'users should infer shapes of outputs of py_func op manually') + 'Output shapes of py_func op should be provided by users manually' + ) py_func_reg = PyFuncRegister.get_instance(helper.main_program) - token = py_func_reg.unique_token(func) + forward_token = py_func_reg.unique_token(func) + backward_token = py_func_reg.unique_token( + backward_func) if backward_func is not None else '' helper.append_op( type='py_func', inputs={'X': x}, - outputs={'Out': out}, - attrs={'handle_idx': py_func_reg.handle_idx, - 'token': token}) + outputs={'Out': out_list}, + attrs={ + 'handle_idx': py_func_reg.handle_idx, + 'token': forward_token, + 'backward_token': backward_token + }) return out From c00e07cda02ce611f0c10ed9cbc64f9a59f42f73 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 12 Dec 2018 14:58:51 +0800 Subject: [PATCH 248/719] Fix distribute compile test=develop --- .../fluid/framework/details/reduce_op_handle.cc | 10 +++++----- paddle/fluid/operators/distributed/grpc_serde.cc | 3 +-- .../operators/distributed/sendrecvop_utils.cc | 6 ++---- .../operators/distributed/sendrecvop_utils.h | 13 +++++++------ .../operators/distributed/variable_response.cc | 15 +++++++-------- .../operators/distributed_ops/merge_ids_op.cc | 4 +--- .../distributed_ops/ref_by_trainer_id_op.cc | 4 +--- 7 files changed, 24 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 85d8abc910..7a5f7de57e 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -218,18 +218,18 @@ void ReduceOpHandle::RunImpl() { } #if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE - if (framework::IsType(in_selected_rows[0]->value().type())) { + if (in_selected_rows[0]->value().type() == + framework::proto::VarType::FP32) { GatherSelectedRows( in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, out_var->GetMutable()); - } else if (framework::IsType( - in_selected_rows[0]->value().type())) { + } else if (in_selected_rows[0]->value().type() == + framework::proto::VarType::FP64) { GatherSelectedRows( in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, out_var->GetMutable()); } else { - PADDLE_ENFORCE(false, - "only support double or float when gahter SelectedRows"); + PADDLE_THROW("only support double or float when gather SelectedRows"); } #endif }); diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index 31fac2133c..94bf0a113b 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -122,8 +122,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, if (var->IsType()) { auto* slr = var->GetMutable(); ProtoEncodeHelper e2(static_cast(buf), 128); - size_t rows_memory_size = - slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); slices[2] = ::grpc::Slice(e2.size()); memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 6ba883ba01..5fd42e884a 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -61,8 +61,7 @@ TensorPayload GetTensorPayload(framework::Variable* var, auto tensor = var->Get(); // FIXME(wuyi): data types in send_recv.proto is copied from // framework.proto - request->set_data_type( - static_cast(framework::ToDataType(tensor.type()))); + request->set_data_type(static_cast(tensor.type())); for (auto& dim : framework::vectorize(tensor.dims())) { request->add_dims(dim); } @@ -83,8 +82,7 @@ TensorPayload GetSelectedRowsPayload(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* request) { auto* slr = var->GetMutable(); - request->set_data_type( - static_cast(framework::ToDataType(slr->value().type()))); + request->set_data_type(static_cast(slr->value().type())); request->set_lod_level(0); request->set_slr_height(slr->height()); diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 523e56fe3e..710c839166 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -58,18 +58,19 @@ TensorPayload GetSelectedRowsPayload(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* request); -inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { +inline framework::proto::VarType::Type ToVarType( + sendrecv::VariableMessage::Type type) { switch (type) { case sendrecv::VariableMessage::FP32: - return typeid(float); // NOLINT + return framework::proto::VarType::FP32; // NOLINT case sendrecv::VariableMessage::FP64: - return typeid(double); // NOLINT + return framework::proto::VarType::FP64; // NOLINT case sendrecv::VariableMessage::INT32: - return typeid(int); // NOLINT + return framework::proto::VarType::INT32; // NOLINT case sendrecv::VariableMessage::INT64: - return typeid(int64_t); // NOLINT + return framework::proto::VarType::INT64; // NOLINT case sendrecv::VariableMessage::BOOL: - return typeid(bool); // NOLINT + return framework::proto::VarType::BOOL; // NOLINT default: PADDLE_THROW("Not support type %d", type); } diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index 5b2be04e6a..921c96b583 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -114,7 +114,7 @@ bool VariableResponse::CopyLodTensorData( tensor->set_lod(lod); void* tensor_data = - tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type())); + tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type())); VLOG(6) << "Tensor.memory_size = " << tensor->memory_size() << ", Buffer Size = " << length; @@ -139,13 +139,13 @@ bool VariableResponse::CopySelectRowsTensorData( slr->set_height(meta_.slr_height()); auto* tensor = slr->mutable_value(); tensor->Resize(dims); - PADDLE_ENFORCE_EQ(static_cast(tensor->numel()), - length / framework::SizeOfType( - paddle::operators::distributed::ToTypeIndex( - meta_.data_type()))); + PADDLE_ENFORCE_EQ( + static_cast(tensor->numel()), + length / framework::SizeOfType(paddle::operators::distributed::ToVarType( + meta_.data_type()))); void* tensor_data = tensor->mutable_data( ctx.GetPlace(), - paddle::operators::distributed::ToTypeIndex(meta_.data_type())); + paddle::operators::distributed::ToVarType(meta_.data_type())); if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { return false; @@ -159,8 +159,7 @@ bool VariableResponse::CopySelectRowsData( const platform::DeviceContext& ctx, int length) { auto* slr = GetVar()->GetMutable(); slr->mutable_rows()->clear(); - slr->mutable_rows()->resize(length / - framework::SizeOfType(typeid(int64_t))); // int64 + slr->mutable_rows()->resize(length / sizeof(int64_t)); // int64 int64_t* rows_data = slr->mutable_rows()->data(); // copy rows CPU data, GPU data will be copied lazily. diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc index 252a63cb60..da0185b8c4 100644 --- a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc +++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc @@ -108,9 +108,7 @@ class MergeIdsOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.MultiInput("X").front()->type()), - ctx.GetPlace()); + ctx.MultiInput("X").front()->type(), ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc index 98b0af7688..7e16e6ff66 100644 --- a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc +++ b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc @@ -42,9 +42,7 @@ class RefByTrainerIdOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.MultiInput("X")[0]->type()), - ctx.GetPlace()); + ctx.MultiInput("X")[0]->type(), ctx.GetPlace()); } }; From 6951ef9a55768c8e923623431247825a40bd522a Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 12 Dec 2018 15:25:07 +0800 Subject: [PATCH 249/719] Fix the gelu backward to avoid nan (#14857) * Fix the gelu backward to avoid nan test=develop * Remove unnecessary calls test=develop --- paddle/fluid/operators/activation_op.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 87d549678a..c7df3ea58a 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -301,23 +301,22 @@ template struct GeluFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { - auto temp = - ((x * static_cast(M_SQRT1_2)).erf()).template cast().eval(); + auto temp = (x * static_cast(M_SQRT1_2)).erf(); out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); } }; template struct GeluGradFunctor : BaseActivationFunctor { - bool Inplace() const { return IsInplace("gelu"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp = (static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * - ((-static_cast(0.5) * x.square()).exp())) - .template cast() - .eval(); - dx.device(d) = dout * (out / x + temp); + auto first = static_cast(0.5) * + (static_cast(1) + ((x * static_cast(M_SQRT1_2)).erf())); + + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * + (-static_cast(0.5) * x.square()).exp(); + dx.device(d) = dout * (first + second); } }; From faffc25c19cdc9504214a4c0c85aa131a44079de Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 12 Dec 2018 15:28:27 +0800 Subject: [PATCH 250/719] fix hadoop home bug & refine setup.py --- python/paddle/fluid/async_executor.py | 5 +---- python/setup.py.in | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 2a6a11805e..b077e1be7e 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -153,13 +153,10 @@ class AsyncExecutor(object): data_feed.desc(), filelist, thread_num, fetch_var_names, mode, debug) - def download_data(self, afs_path, local_path, fs_default_name, ugi, process_num=12): - #hadoop_home = "$HADOOP_HOME" + def download_data(self, afs_path, local_path, fs_default_name, ugi, hadoop_home="$HADOOP_HOME", process_num=12): if self.instance is None: raise ValueError('instance is None, please run config_distributed_nodes init instance') - hadoop_home = "~/tools/hadoop-xingtian/hadoop/" - configs = { "fs.default.name": fs_default_name, "hadoop.job.ugi": ugi diff --git a/python/setup.py.in b/python/setup.py.in index 200b96ec54..9418804be2 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -103,8 +103,10 @@ packages=['paddle', 'paddle.fluid', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', + 'paddle.fluid.distributed', 'paddle.fluid.layers', 'paddle.fluid.contrib', + 'paddle.fluid.contrib.utils', 'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.quantize', 'paddle.fluid.transpiler', From 194ce2e92cf2e77c0f3e544c2b61204044d0af86 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Dec 2018 05:48:02 +0000 Subject: [PATCH 251/719] add benchmark --- paddle/fluid/operators/jit/CMakeLists.txt | 3 +- paddle/fluid/operators/jit/benchmark.cc | 152 ++++++++++++++++++++++ paddle/fluid/operators/jit/gen/jitcode.h | 4 - paddle/fluid/operators/jit/helper.h | 4 + paddle/fluid/operators/jit/test.cc | 9 -- 5 files changed, 158 insertions(+), 14 deletions(-) create mode 100644 paddle/fluid/operators/jit/benchmark.cc diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 26903e0e44..0f213c5898 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -8,7 +8,7 @@ file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -list(REMOVE_ITEM jit_kernel_cc_srcs test.cc) +list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc) cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) # refer must go first @@ -20,3 +20,4 @@ endif() cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) +cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc new file mode 100644 index 0000000000..5a276172c3 --- /dev/null +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +// #include // for memcpy +// #include +#include +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/port.h" + +DEFINE_int32(burning, 10, "Burning times."); +DEFINE_int32(repeat, 3000, "Repeat times."); +DEFINE_int32(max_size, 1000, "The Max size would be tested."); + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +template +void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), + const T upper = static_cast(20.f)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + +std::vector TestSizes() { + std::vector s; + for (int i = 1; i <= FLAGS_max_size; ++i) { + s.push_back(i); + } + return s; +} + +// return this function avg time +template +double BenchTartgetFunc(const Func tgt, const std::vector& x, + const std::vector& y, std::vector& z) { // NOLINT + const T* x_data = x.data(); + const T* y_data = y.data(); + const int d = z.size(); + T* z_data = z.data(); + + for (int i = 0; i < FLAGS_burning; ++i) { + tgt(x_data, y_data, z_data, d); + } + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeat; ++i) { + tgt(x_data, y_data, z_data, d); + } + auto end = GetCurrentUS(); + return (end - start) / FLAGS_repeat; +} + +// Benchmark all jit kernels including jitcode, mkl and refer. +// To use this tool, run command: ./benchmark [options...] +// Options: +// --burning: the burning time before count +// --repeat: the repeat times +// --max_size: the max size would be tested +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + google::InitGoogleLogging(argv[0]); + using T = float; + using PlaceType = paddle::platform::CPUPlace; + namespace jit = paddle::operators::jit; + const auto KT = jit::vmul; + LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat + << " times."; + for (int d : TestSizes()) { + // for (kernels type) { // TODO(TJ): more jit::KernelType + std::vector> infos; + std::vector x(d), y(d), z(d); + RandomVec(d, x.data()); + RandomVec(d, y.data()); + // refer + auto refer = jit::GetRefer::func_type, + jit::VMulTuples::attr_type>(); + if (refer) { + auto res = + BenchTartgetFunc::func_type>(refer, x, y, z); + infos.push_back(std::make_pair("Refer", res)); + } + + // test jitcode + auto jitcode = jit::GetJitCode::func_type, + jit::VMulTuples::attr_type, PlaceType>(d); + if (jitcode) { + auto res = + BenchTartgetFunc::func_type>(jitcode, x, y, z); + infos.push_back(std::make_pair("JitCode", res)); + } + + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = + dynamic_cast::func_type, + jit::VMulTuples::attr_type>*>( + impl.get()); + if (i && i->UseMe(d)) { + auto more = i->GetFunc(); + auto res = + BenchTartgetFunc::func_type>(more, x, y, z); + infos.push_back(std::make_pair("More", res)); + } + } + } + + // Test result from Get function + auto tgt = jit::Get::func_type, + jit::VMulTuples::attr_type, PlaceType>(d); + if (!tgt) { + LOG(ERROR) << "Target can not be empty!"; + } + auto res = BenchTartgetFunc::func_type>(tgt, x, y, z); + infos.push_back(std::make_pair("Target", res)); + + // print + std::ostringstream loginfos; + loginfos << "Kernel Type: " << KT << ", size " << d << ": "; + for (auto pair : infos) { + loginfos << pair.first << " takes " << pair.second << " us; "; + } + LOG(INFO) << loginfos.str(); + // } + } +} diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index caa3ef9dda..765952fc35 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -60,10 +60,6 @@ typedef enum { #define YMM_FLOAT_BLOCK 8 #define ZMM_FLOAT_BLOCK 16 -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 - #define DECLARE_JIT_CODE(codename) \ const char* name() const override { return #codename; } diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index b7580f6efb..16cd18e2cc 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -28,6 +28,10 @@ namespace paddle { namespace operators { namespace jit { +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + template inline Func GetJitCode(Attr attr) { diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index e523089101..1ee6ce6b13 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -21,15 +21,6 @@ #include "gtest/gtest.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/port.h" - -constexpr int repeat = 20000; - -inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} template void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), From bc0df6a9487f0aa877647b2a15789fe3eb206616 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Dec 2018 07:54:10 +0000 Subject: [PATCH 252/719] make typename tuples --- paddle/fluid/operators/jit/benchmark.cc | 15 +++++--------- paddle/fluid/operators/jit/helper.h | 24 ++++++++++++----------- paddle/fluid/operators/jit/kernel_base.h | 15 +++++++++----- paddle/fluid/operators/jit/more/mkl/mkl.h | 3 +-- paddle/fluid/operators/jit/refer/refer.h | 3 +-- paddle/fluid/operators/jit/test.cc | 15 +++++--------- 6 files changed, 35 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 5a276172c3..ef7ccc64ad 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -94,8 +94,7 @@ int main(int argc, char* argv[]) { RandomVec(d, x.data()); RandomVec(d, y.data()); // refer - auto refer = jit::GetRefer::func_type, - jit::VMulTuples::attr_type>(); + auto refer = jit::GetRefer>(); if (refer) { auto res = BenchTartgetFunc::func_type>(refer, x, y, z); @@ -103,8 +102,7 @@ int main(int argc, char* argv[]) { } // test jitcode - auto jitcode = jit::GetJitCode::func_type, - jit::VMulTuples::attr_type, PlaceType>(d); + auto jitcode = jit::GetJitCode, PlaceType>(d); if (jitcode) { auto res = BenchTartgetFunc::func_type>(jitcode, x, y, z); @@ -118,10 +116,8 @@ int main(int argc, char* argv[]) { if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = - dynamic_cast::func_type, - jit::VMulTuples::attr_type>*>( - impl.get()); + auto i = dynamic_cast>*>( + impl.get()); if (i && i->UseMe(d)) { auto more = i->GetFunc(); auto res = @@ -132,8 +128,7 @@ int main(int argc, char* argv[]) { } // Test result from Get function - auto tgt = jit::Get::func_type, - jit::VMulTuples::attr_type, PlaceType>(d); + auto tgt = jit::Get, PlaceType>(d); if (!tgt) { LOG(ERROR) << "Target can not be empty!"; } diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 16cd18e2cc..11bbd6a56c 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -32,9 +32,11 @@ namespace jit { #define SIGMOID_THRESHOLD_MAX 13.0 #define EXP_MAX_INPUT 40.0 -template -inline Func GetJitCode(Attr attr) { +template +inline typename KernelTuples::func_type GetJitCode( + typename KernelTuples::attr_type attr) { + using Func = typename KernelTuples::func_type; + using Attr = typename KernelTuples::attr_type; size_t key = JitCodeKey(attr); auto& codes = JitCodePool().Instance(); if (codes.Has(key)) { @@ -65,8 +67,8 @@ inline Func GetJitCode(Attr attr) { // Refer code do not related with attr, which is just for cast // Refer is always on CPUPlace -template -inline Func GetRefer() { +template +inline typename KernelTuples::func_type GetRefer() { auto& ref_pool = ReferKernelPool().Instance().AllKernels(); KernelKey kkey(KT, platform::CPUPlace()); auto ref_iter = ref_pool.find(kkey); @@ -74,7 +76,7 @@ inline Func GetRefer() { "Every Kernel should have reference function."); auto& ref_impls = ref_iter->second; for (auto& impl : ref_impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i) { return i->GetFunc(); } @@ -82,10 +84,10 @@ inline Func GetRefer() { return nullptr; } -template -Func Get(Attr attr) { - auto jitfunc = GetJitCode(attr); +typename KernelTuples::func_type Get(typename KernelTuples::attr_type attr) { + auto jitfunc = GetJitCode(attr); if (jitfunc) { return jitfunc; } @@ -97,7 +99,7 @@ Func Get(Attr attr) { if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i && i->UseMe(attr)) { return i->GetFunc(); } @@ -105,7 +107,7 @@ Func Get(Attr attr) { } // The last implementation should be reference function on CPUPlace. - return GetRefer(); + return GetRefer(); } } // namespace jit diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index df7be6ab8e..84f0308898 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -36,10 +36,13 @@ class Kernel { DISABLE_COPY_AND_ASSIGN(Kernel); }; -template +template class KernelImpl : public Kernel { + using T = typename KernelTuples::data_type; + using Func = typename KernelTuples::func_type; + using Attr = typename KernelTuples::attr_type; + public: - using ELEMENT_TYPE = T; virtual Func GetFunc() const { return func; } virtual bool UseMe(Attr attr) const = 0; @@ -47,11 +50,13 @@ class KernelImpl : public Kernel { Func func{nullptr}; }; -template -class ReferKernel : public KernelImpl { +template +class ReferKernel : public KernelImpl { public: // Refer code can always be used - bool UseMe(Attr attr) const override { return true; } + bool UseMe(typename KernelTuples::attr_type attr) const override { + return true; + } }; } // namespace jit diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index c0f738cceb..56469b054d 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -28,8 +28,7 @@ template void VMul(const T* x, const T* y, T* z, int n); template -class VMulKernel : public KernelImpl::func_type, - typename VMulTuples::attr_type> { +class VMulKernel : public KernelImpl> { public: VMulKernel() { this->func = VMul; } bool UseMe(int d) const override { diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 97aa5de8fc..99d1cbd43e 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -29,8 +29,7 @@ void VMul(const T* x, const T* y, T* z, int n) { } template -class VMulKernel : public ReferKernel::func_type, - typename VMulTuples::attr_type> { +class VMulKernel : public ReferKernel> { public: VMulKernel() { this->func = VMul; } }; diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 1ee6ce6b13..4d7970414f 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -89,8 +89,7 @@ TEST(JitKernel, vmul) { namespace jit = paddle::operators::jit; const auto KT = jit::vmul; for (int d : TestSizes()) { - auto ref = jit::GetRefer::func_type, - jit::VMulTuples::attr_type>(); + auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); std::vector x(d), y(d), zref(d); @@ -115,8 +114,7 @@ TEST(JitKernel, vmul) { ExpectEQ(yinp_data, zref_data, d); // test jitcode - auto jitcode = jit::GetJitCode::func_type, - jit::VMulTuples::attr_type, PlaceType>(d); + auto jitcode = jit::GetJitCode, PlaceType>(d); if (jitcode) { VLOG(10) << "Test jitcode, size: " << d; TestTartgetFunc::func_type>(jitcode, x, y, zref); @@ -129,10 +127,8 @@ TEST(JitKernel, vmul) { if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = - dynamic_cast::func_type, - jit::VMulTuples::attr_type>*>( - impl.get()); + auto i = dynamic_cast>*>( + impl.get()); if (i && i->UseMe(d)) { auto more = i->GetFunc(); VLOG(10) << "Test More Kernel, size: " << d; @@ -142,8 +138,7 @@ TEST(JitKernel, vmul) { } // Test result from Get function VLOG(10) << "Test Get function, size: " << d; - auto tgt = jit::Get::func_type, - jit::VMulTuples::attr_type, PlaceType>(d); + auto tgt = jit::Get, PlaceType>(d); TestTartgetFunc::func_type>(tgt, x, y, zref); } } From 7a43e5170325f3a78e026bb4d7039e0c25be8686 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 12 Dec 2018 16:16:26 +0800 Subject: [PATCH 253/719] Add gperf tools --- CMakeLists.txt | 6 ++++ cmake/generic.cmake | 16 +++++++++++ paddle/fluid/framework/parallel_executor.cc | 31 ++++++++++++++++++++- python/paddle/fluid/__init__.py | 3 +- 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index efa68c9ba2..3e59aca2d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,6 +81,12 @@ option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) +if (WITH_PROFILER) + find_package(Gperftools REQUIRED) + include_directories(${GPERFTOOLS_INCLUDE_DIR}) + add_definitions(-DWITH_GPERFTOOLS) +endif() + # PY_VERSION if(NOT PY_VERSION) set(PY_VERSION 2.7) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 312fbaa0b3..a8b9dcfcf5 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -110,6 +110,14 @@ function(find_fluid_modules TARGET_NAME) endif() endfunction(find_fluid_modules) + +function(common_link TARGET_NAME) + if (WITH_PROFILER) + target_link_libraries(${TARGET_NAME} gperftools::profiler) + endif() +endfunction() + + # find all third_party modules is used for paddle static library # for reduce the dependency when building the inference libs. set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY) @@ -274,6 +282,7 @@ function(cc_library TARGET_NAME) endif() target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) + common_link(${TARGET_NAME}) endif() # cpplint code style @@ -340,6 +349,7 @@ function(cc_binary TARGET_NAME) if(cc_binary_DEPS) target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS}) add_dependencies(${TARGET_NAME} ${cc_binary_DEPS}) + common_link(${TARGET_NAME}) endif() endfunction(cc_binary) @@ -362,6 +372,7 @@ function(cc_test TARGET_NAME) target_link_libraries(${TARGET_NAME} ${win32_deps}) endif(WIN32) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + common_link(${TARGET_NAME}) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) @@ -420,6 +431,7 @@ function(nv_binary TARGET_NAME) if(nv_binary_DEPS) target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) + common_link(${TARGET_NAME}) endif() endif() endfunction(nv_binary) @@ -433,6 +445,7 @@ function(nv_test TARGET_NAME) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) if (nv_test_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) @@ -499,6 +512,7 @@ function(hip_binary TARGET_NAME) if(hip_binary_DEPS) target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS}) add_dependencies(${TARGET_NAME} ${hip_binary_DEPS}) + common_link(${TARGET_NAME}) endif() endif() endfunction(hip_binary) @@ -518,6 +532,7 @@ function(hip_test TARGET_NAME) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) + common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) endif() endfunction(hip_test) @@ -560,6 +575,7 @@ function(go_library TARGET_NAME) endif() if(go_library_DEPS) add_dependencies(${TARGET_NAME} ${go_library_DEPS}) + common_link(${TARGET_NAME}) endif(go_library_DEPS) # The "source file" of the library is `${dummyfile}` which never diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b98408ee77..28a4b14b27 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -30,13 +30,36 @@ limitations under the License. */ #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" +#ifdef WITH_GPERFTOOLS +#include "gperftools/profiler.h" +#endif +DEFINE_string(pe_profile_fname, "", + "Profiler filename for PE, which generated by gperftools." + "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); + namespace paddle { namespace framework { +static std::once_flag gProfileOnce; +#ifdef WITH_GPERFTOOLS +static bool gProfileStarted = false; +#endif class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) - : places_(places) {} + : places_(places) { + if (!FLAGS_pe_profile_fname.empty()) { + std::call_once(gProfileOnce, [] { +#ifdef WITH_GPERFTOOLS + ProfilerStart(FLAGS_pe_profile_fname.c_str()); + gProfileStarted = true; +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_pe_profile_fname will be ignored"; +#endif + }); + } + } ~ParallelExecutorPrivate() { if (own_local_scope_) { @@ -270,6 +293,12 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { +#ifdef WITH_GPERFTOOLS + if (gProfileStarted) { + ProfilerFlush(); + } +#endif + platform::RecordBlock b(0); #ifdef PADDLE_WITH_CUDA if (!gcs_.empty()) { diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2a53519188..4cf0784d81 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -125,7 +125,8 @@ def __bootstrap__(): 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy', - 'reader_queue_speed_test_mode', 'print_sub_graph_dir' + 'reader_queue_speed_test_mode', 'print_sub_graph_dir', + 'pe_profile_fname' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') From f3250097bccbade4168b02417e04fee5cd990494 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Dec 2018 08:20:25 +0000 Subject: [PATCH 254/719] fix bug and mac compile --- paddle/fluid/operators/jit/benchmark.cc | 3 +-- paddle/fluid/operators/jit/helper.h | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index ef7ccc64ad..5cc82b69f8 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -12,9 +12,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -// #include // for memcpy -// #include #include +#include #include #include #include "gflags/gflags.h" diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 11bbd6a56c..d1bbe10381 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -49,15 +49,17 @@ inline typename KernelTuples::func_type GetJitCode( // pool: (KernelKey(type, place), vector) auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); auto iter = creator_map.find(kkey); - auto& creators = iter->second; - for (auto& cur : creators) { - auto i = dynamic_cast*>(cur.get()); - if (i && i->UseMe(attr)) { - auto p = i->CreateJitCode(attr); - if (p) { - auto f = p->template getCode(); - codes.Insert(key, std::move(p)); - return f; + if (iter != creator_map.end()) { + auto& creators = iter->second; + for (auto& cur : creators) { + auto i = dynamic_cast*>(cur.get()); + if (i && i->UseMe(attr)) { + auto p = i->CreateJitCode(attr); + if (p) { + auto f = p->template getCode(); + codes.Insert(key, std::move(p)); + return f; + } } } } From 009c7cf6ccf3f8ece6922d532df38cadd3ca5c84 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 12 Dec 2018 16:23:50 +0800 Subject: [PATCH 255/719] add finialize --- python/paddle/fluid/contrib/utils/__init__.py | 2 +- python/paddle/fluid/distributed/helper.py | 3 +++ python/paddle/fluid/distributed/ps_instance.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/fluid/contrib/utils/__init__.py index 2fe9f702f3..20b2cc381a 100644 --- a/python/paddle/fluid/contrib/utils/__init__.py +++ b/python/paddle/fluid/contrib/utils/__init__.py @@ -18,5 +18,5 @@ from __future__ import print_function from . import hdfs_utils from .hdfs_utils import * -__all__ = lookup_table_utils.__all__ +#__all__ = lookup_table_utils.__all__ __all__ = hdfs_utils.__all__ diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 4cc5eb2a92..1244b4c0ca 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -44,5 +44,8 @@ class MPIHelper(object): def get_hostname(self): import socket return socket.gethostname() + + def finalize(self): + MPI.Finalize() diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index 94e123c2ce..dce5dfc5bd 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -97,6 +97,7 @@ class PaddlePSInstance(object): pass def finalize(self): + self.dh.finalize() pass From b75bd29c3ae74b5d48d573916eebab6473b3c30f Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 12 Dec 2018 16:51:01 +0800 Subject: [PATCH 256/719] Remove debug info --- .../details/computation_op_handle.cc | 45 +---- .../fluid/framework/details/op_handle_base.cc | 2 +- paddle/fluid/framework/ir/graph.cc | 132 +++++++++------ paddle/fluid/framework/operator.cc | 160 +++++++----------- .../operators/elementwise/elementwise_op.h | 69 ++++---- paddle/fluid/operators/optimizers/adam_op.cc | 79 +++++---- 6 files changed, 224 insertions(+), 263 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 9003033438..7ad1e40c60 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -26,46 +26,17 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, scope_(scope), place_(place) {} -struct RecordTime { - RecordTime(const std::string &name, const std::string &type) - : name_(name), type_(type), start_(std::chrono::system_clock::now()) {} - - ~RecordTime() { - if (type_ == "elementsize_add") { - end_ = std::chrono::system_clock::now(); - std::chrono::duration diff = end_ - start_; - VLOG(1) << name_ << " " << type_ << " time record: " << diff.count(); - } - } - - std::string name_; - std::string type_; - std::chrono::system_clock::time_point start_; - std::chrono::system_clock::time_point end_; -}; - void ComputationOpHandle::RunImpl() { - { - RecordTime rt("ComputationOpHandle::RunImpl", "Wait"); - WaitInputVarGenerated(place_); - } - - Scope *scope = nullptr; - { - RecordTime rt("ComputationOpHandle::RunImpl", "PrepareScope"); - scope = scope_->FindVar(kLocalExecScopeName)->Get(); - } - - { - RecordTime rt("ComputationOpHandle::RunImpl", "ReallyRun " + op_->Type()); + WaitInputVarGenerated(place_); - auto run_func = [this, scope]() { op_->Run(*scope, place_); }; + auto run_func = [this]() { + op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); + }; - if (is_lock_and_record_event_free_) { - run_func(); - } else { - this->RunAndRecordEvent(run_func); - } + if (is_lock_and_record_event_free_) { + run_func(); + } else { + this->RunAndRecordEvent(run_func); } } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 5997f12ffa..4822627ac3 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -41,7 +41,7 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA - if (events_.empty() && use_cuda && !dev_ctxes_.empty()) { + if (events_.empty() && use_cuda) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; PADDLE_ENFORCE(cudaSetDevice(dev_id)); diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index dfa310a386..9ebf136698 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -20,6 +20,10 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/var_desc.h" +DEFINE_bool(enforce_when_check_program, true, + "Checking whether the program is correct or not. We will log " + "errors rather than throwing exceptions if this flag turned off"); + namespace paddle { namespace framework { namespace ir { @@ -28,55 +32,85 @@ namespace { void CheckProgram(const ProgramDesc &program) { #define _INT(role) static_cast(role) -// std::map visit; -// for (OpDesc *op : program.Block(0).AllOps()) { -// // For backward compatibility, some program doesn't have role added. -// if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; -// int role_id = -// boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); -// visit[role_id] = true; -// switch (role_id) { -// case _INT(OpRole::kForward): -// if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { -// LOG(ERROR) -// << "Cannot add backward operator before forward operator %s." -// << op->Type(); -// } -// break; -// case _INT(OpRole::kBackward): -// case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): -// PADDLE_ENFORCE( -// visit.find(_INT(OpRole::kOptimize)) == visit.end(), -// "Cannot add backward operator %s after optimize operator.", -// op->Type()); -// break; -// case _INT(OpRole::kForward) | _INT(OpRole::kLoss): -// PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | -// _INT(OpRole::kLoss)) == visit.end(), -// "Cannot add backward|loss operator before " -// "forward|loss operator %s.", -// op->Type()); -// PADDLE_ENFORCE( -// visit.find(_INT(OpRole::kOptimize)) == visit.end(), -// "Cannot add forward|loss operator %s after optimize operator.", -// op->Type()); -// break; -// case _INT(OpRole::kOptimize): -// case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): -// PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), -// "Optimize operators %s must follow backward operator.", -// op->Type()); -// break; -// case _INT(OpRole::kLRSched): -// case _INT(OpRole::kDist): -// case _INT(OpRole::kRPC): -// case _INT(OpRole::kNotSpecified): -// break; -// default: -// LOG(FATAL) << "Unknown operator role. Don't add new role because " -// "you don't know what you are doing."; -// } -// } + std::map visit; + for (OpDesc *op : program.Block(0).AllOps()) { + // For backward compatibility, some program doesn't have role added. + if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; + int role_id = + boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + visit[role_id] = true; + switch (role_id) { + case _INT(OpRole::kForward): + if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { + LOG(ERROR) + << "Cannot add backward operator before forward operator %s." + << op->Type(); + } + break; + case _INT(OpRole::kBackward): + case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): + if (!FLAGS_enforce_when_check_program) { + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add backward operator %s after optimize operator.", + op->Type()); + } else { + if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { + LOG(ERROR) + << "Cannot add backward operator %s after optimize operator.", + << op->Type(); + } + } + break; + case _INT(OpRole::kForward) | _INT(OpRole::kLoss): + if (!FLAGS_enforce_when_check_program) { + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | + _INT(OpRole::kLoss)) == visit.end(), + "Cannot add backward|loss operator before " + "forward|loss operator %s.", + op->Type()); + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add forward|loss operator %s after optimize operator.", + op->Type()); + } else { + if (visit.find(_INT(OpRole::kBackward) | _INT(OpRole::kLoss)) != + visit.end()) { + LOG(ERROR) << "Cannot add backward|loss operator before " + << "forward|loss operator %s." << op->Type(); + } + + if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { + LOG(ERROR) << "Cannot add forward|loss operator %s after optimize " + "operator.", + << op->Type(); + } + } + break; + case _INT(OpRole::kOptimize): + case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): + if (!FLAGS_enforce_when_check_program) { + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), + "Optimize operators %s must follow backward operator.", + op->Type()); + } else { + if (visit.find(_INT(OpRole::kBackward)) == visit.end()) { + LOG(ERROR) + << "Optimize operators %s must follow backward operator.", + << op->Type(); + } + } + break; + case _INT(OpRole::kLRSched): + case _INT(OpRole::kDist): + case _INT(OpRole::kRPC): + case _INT(OpRole::kNotSpecified): + break; + default: + LOG(FATAL) << "Unknown operator role. Don't add new role because " + "you don't know what you are doing."; + } + } #undef _INT } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b8adce4edf..c6f3254e9f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -701,125 +701,85 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, this->InferShape(&infer_shape_ctx); } -struct RecordTime { - RecordTime(const std::string& name, const std::string& type) - : name_(name), type_(type), start_(std::chrono::system_clock::now()) {} - - void inline stop() { - end_ = std::chrono::system_clock::now(); - std::chrono::duration diff = end_ - start_; - VLOG(1) << name_ << " " << type_ << " time record: " << diff.count(); - } - - ~RecordTime() { - if (type_ == "elementwise_add") { - stop(); - } - // stop(); - } - - std::string name_; - std::string type_; - std::chrono::system_clock::time_point start_; - std::chrono::system_clock::time_point end_; -}; - void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - RecordTime rt("OperatorWithKernel::All", type_); - { - RecordTime rt("OperatorWithKernel::InferShape", type_); - RuntimeInferShapeContext infer_shape_ctx(*this, scope); - this->InferShape(&infer_shape_ctx); - } - - { - RecordTime* rt_1 = new RecordTime("OperatorWithKernel::Compute1", type_); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); + RuntimeInferShapeContext infer_shape_ctx(*this, scope); + this->InferShape(&infer_shape_ctx); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); - // check if op[type] has kernel registered. - auto& all_op_kernels = AllOpKernels(); - auto kernels_iter = all_op_kernels.find(type_); - if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW( - "There are no kernels which are registered in the %s operator.", - type_); - } + // check if op[type] has kernel registered. + auto& all_op_kernels = AllOpKernels(); + auto kernels_iter = all_op_kernels.find(type_); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", type_); + } - OpKernelMap& kernels = kernels_iter->second; + OpKernelMap& kernels = kernels_iter->second; - // TODO(dzhwinter) : kernel fallback mechanism will be added when all the - // transform functions are ready. + // TODO(dzhwinter) : kernel fallback mechanism will be added when all the + // transform functions are ready. - // for (auto& candidate : kKernelPriority) { - // Do selection - // } + // for (auto& candidate : kKernelPriority) { + // Do selection + // } - auto expected_kernel_key = - this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + auto expected_kernel_key = + this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; - auto kernel_iter = kernels.find(expected_kernel_key); + auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_MKLDNN - // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set - if (kernel_iter == kernels.end() && - expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { - VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; - expected_kernel_key.library_type_ = LibraryType::kPlain; - expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; - kernel_iter = kernels.find(expected_kernel_key); - } + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = LibraryType::kPlain; + expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } #endif - if (kernel_iter == kernels.end()) { - PADDLE_THROW("op %s does not have kernel for %s", type_, - KernelTypeToString(expected_kernel_key)); - } + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", type_, + KernelTypeToString(expected_kernel_key)); + } - // do data transformScope &transfer_scope; - std::vector transfered_inplace_vars; - Scope* transfer_scope = nullptr; - // auto* transfer_scope = - // TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); + // do data transformScope &transfer_scope; + std::vector transfered_inplace_vars; + auto* transfer_scope = + TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); - // exec scope is the scope that kernel actually executed on. - const Scope& exec_scope = scope; - // const Scope& exec_scope = - // (transfer_scope == nullptr ? scope : *transfer_scope); + // exec scope is the scope that kernel actually executed on. + const Scope& exec_scope = + (transfer_scope == nullptr ? scope : *transfer_scope); - if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { - dev_ctx = pool.Get(expected_kernel_key.place_); - } - delete rt_1; + if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { + dev_ctx = pool.Get(expected_kernel_key.place_); + } - RecordTime* rt_2 = new RecordTime("OperatorWithKernel::Compute2", type_); - kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); - delete rt_2; + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); - RecordTime* rt_3 = new RecordTime("OperatorWithKernel::Compute3", type_); - if (!transfered_inplace_vars.empty()) { - // there is inplace variable has been transfered. - TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); - } + if (!transfered_inplace_vars.empty()) { + // there is inplace variable has been transfered. + TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); + } - /*For profiling/benchmark only*/ - if (FLAGS_benchmark) { - dev_ctx->Wait(); - } + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + dev_ctx->Wait(); + } - if (FLAGS_check_nan_inf) { - for (auto& vname : OutputVars(true)) { - auto* var = exec_scope.FindVar(vname); - if (var == nullptr) continue; - if (var->IsType()) { - CheckTensorNANOrInf(vname, var->Get()); - } else if (var->IsType()) { - CheckTensorNANOrInf(vname, - var->Get().value()); - } + if (FLAGS_check_nan_inf) { + for (auto& vname : OutputVars(true)) { + auto* var = exec_scope.FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } else if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get().value()); } } - delete rt_3; } } void OperatorWithKernel::TransferInplaceVarsBack( diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 181baac870..87bf7c6b15 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -33,37 +33,34 @@ class ElementwiseOp : public framework::OperatorWithKernel { using Tensor = framework::Tensor; void InferShape(framework::InferShapeContext *ctx) const override { - if (!ctx->IsRuntime()) { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), - "Input(Y) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of elementwise op should not be null."); - - PADDLE_ENFORCE(ctx->GetInputsVarType("Y").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the " - "received is %s [%s]", - ctx->GetInputsVarType("Y").front(), - ctx->Inputs("Y").front()); - - if (ctx->GetInputsVarType("X").front() == - framework::proto::VarType::LOD_TENSOR) { - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input."); - } else if (ctx->GetInputsVarType("X").front() == - framework::proto::VarType::SELECTED_ROWS) { - PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && - (ctx->GetInputDim("Y")[0] == 1), - "For elementwise_op, if X is Sparse, " - "Y must be scalar."); - } else { - PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", - ctx->GetInputsVarType("X").front()); - } + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of elementwise op should not be null."); + + PADDLE_ENFORCE( + ctx->GetInputsVarType("Y").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s [%s]", + ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front()); + + if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::LOD_TENSOR) { + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input."); + } else if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::SELECTED_ROWS) { + PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && + (ctx->GetInputDim("Y")[0] == 1), + "For elementwise_op, if X is Sparse, " + "Y must be scalar."); + } else { + PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", + ctx->GetInputsVarType("X").front()); } ctx->ShareDim("X", /*->*/ "Out"); @@ -128,7 +125,7 @@ The equation is: $$%s$$ -- $X$: a tensor of any dimension. +- $X$: a tensor of any dimension. - $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$. There are two cases for this operator: @@ -138,10 +135,10 @@ There are two cases for this operator: For case 2: -1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index - for broadcasting $Y$ onto $X$. +1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index + for broadcasting $Y$ onto $X$. 2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$. -3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of +3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of subsequence, such as shape(Y) = (2, 1) => (2). For example: @@ -155,7 +152,7 @@ For example: shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 -The inputs $X$ and $Y$ can carry the different LoD information. +The inputs $X$ and $Y$ can carry the different LoD information. But the output only shares the LoD information with the input $X$. )DOC", diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index bc1b20321f..5710cda39a 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -23,57 +23,56 @@ class AdamOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - // PADDLE_ENFORCE(ctx->HasInput("Param"), - // "Input(Param) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Grad"), - // "Input(Grad) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Moment1"), - // "Input(Moment1) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Moment2"), - // "Input(Moment2) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - // "Input(LearningRate) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), - // "Input(Beta1Pow) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), - // "Input(Beta2Pow) of AdamOp should not be null."); - - // PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - // "Output(ParamOut) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), - // "Output(Moment1Out) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), - // "Output(Moment2Out) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment1"), + "Input(Moment1) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment2"), + "Input(Moment2) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + "Input(Beta1Pow) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), + "Input(Beta2Pow) of AdamOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), + "Output(Moment1Out) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), + "Output(Moment2Out) of AdamOp should not be null."); auto lr_dims = ctx->GetInputDim("LearningRate"); - // PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - // "Learning rate should have 1 dimension"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 dimension"); auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - // PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, - // "Beta1 power accumulator should have 1 dimension"); + PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + "Beta1 power accumulator should have 1 dimension"); auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); - // PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, - // "Beta2 power accumulator should have 1 dimension"); + PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, + "Beta2 power accumulator should have 1 dimension"); auto param_dims = ctx->GetInputDim("Param"); - // if (ctx->GetInputsVarType("Grad")[0] == - // framework::proto::VarType::LOD_TENSOR) { - // PADDLE_ENFORCE_EQ( - // param_dims, ctx->GetInputDim("Grad"), - // "Param and Grad input of AdamOp should have same dimension"); - // } - // PADDLE_ENFORCE_EQ( - // param_dims, ctx->GetInputDim("Moment1"), - // "Param and Moment1 input of AdamOp should have same dimension"); - // PADDLE_ENFORCE_EQ( - // param_dims, ctx->GetInputDim("Moment2"), - // "Param and Moment2 input of AdamOp should have same dimension"); + if (ctx->GetInputsVarType("Grad")[0] == + framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamOp should have same dimension"); + } + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment1"), + "Param and Moment1 input of AdamOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment2"), + "Param and Moment2 input of AdamOp should have same dimension"); ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("Moment1Out", param_dims); ctx->SetOutputDim("Moment2Out", param_dims); } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = From 1b61021cb36eae45e142a953c2c96cf46853aa7c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 12 Dec 2018 17:02:24 +0800 Subject: [PATCH 257/719] Polish code --- paddle/fluid/framework/ir/graph.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 9ebf136698..db74d5674a 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -57,7 +57,7 @@ void CheckProgram(const ProgramDesc &program) { } else { if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { LOG(ERROR) - << "Cannot add backward operator %s after optimize operator.", + << "Cannot add backward operator %s after optimize operator." << op->Type(); } } @@ -82,8 +82,8 @@ void CheckProgram(const ProgramDesc &program) { if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { LOG(ERROR) << "Cannot add forward|loss operator %s after optimize " - "operator.", - << op->Type(); + "operator." + << op->Type(); } } break; @@ -95,9 +95,8 @@ void CheckProgram(const ProgramDesc &program) { op->Type()); } else { if (visit.find(_INT(OpRole::kBackward)) == visit.end()) { - LOG(ERROR) - << "Optimize operators %s must follow backward operator.", - << op->Type(); + LOG(ERROR) << "Optimize operators %s must follow backward operator." + << op->Type(); } } break; From e82772f42518f1cff790ac04aa1c73c2e5b201e9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 09:22:44 +0000 Subject: [PATCH 258/719] fix cmake conflict test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b1cfb23f3a..6d7a69c8c9 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -169,7 +169,7 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper garbage_collector) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() From 8d9401152eaf26cc0d6ab4643fe6255028d6edf2 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 12 Dec 2018 17:43:39 +0800 Subject: [PATCH 259/719] Refine w2v --- .../fluid/operators/math/matrix_bit_code.cc | 22 ++++++---- paddle/fluid/operators/math/matrix_bit_code.h | 40 +++++++++---------- 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 5a6e64b6f8..dbf4f5e332 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -23,12 +23,14 @@ void MatrixBitCodeFunctor::Add(const framework::Tensor& vec, framework::Tensor* tmat) { size_t batch_size = tmat->dims()[0]; size_t width = tmat->dims()[1]; + auto* tmat_data = tmat->data(); + auto* vec_data = vec.data(); for (size_t i = 0; i < batch_size; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); - tmat->data()[i * width + j] += vec.data()[index]; + tmat_data[i * width + j] += vec_data[index]; } } } @@ -38,12 +40,14 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, framework::Tensor* vec) { size_t batch_size = tmat.dims()[0]; size_t width = tmat.dims()[1]; + auto* vec_data = vec->data(); + auto* tmat_data = tmat.data(); for (size_t i = 0; i < batch_size; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); - vec->data()[index] += tmat.data()[i * width + j]; + vec_data[index] += tmat_data[i * width + j]; } } } @@ -53,14 +57,15 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec) { size_t batch_size = tmat.dims()[0]; size_t width = tmat.dims()[1]; + auto* vec_data = vec->mutable_value()->data(); + auto* tmat_data = tmat.data(); for (size_t i = 0; i < batch_size; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); int64_t row_index = vec->GetIndexFromId(static_cast(index)); - vec->mutable_value()->data()[row_index] += - tmat.data()[i * width + j]; + vec_data[row_index] += tmat_data[i * width + j]; } } } @@ -70,6 +75,8 @@ void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum) { size_t num_samples = tmat.dims()[0]; size_t o_width = tmat.dims()[1]; + auto* tmat_data = tmat.data(); + auto* sum_data = sum->data(); for (size_t i = 0; i < num_samples; ++i) { T sm = static_cast(0.0); auto code = code_table_->get_code(i); @@ -78,10 +85,10 @@ void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, if (code->calc_bit(j)) { // calc_bit starts from right most bit, while data in tmat[i] is in the // reverse order. - sm += tmat.data()[i * o_width + j]; + sm += tmat_data[i * o_width + j]; } } - sum->data()[i] = scale_sum * sm; + sum_data[i] = scale_sum * sm; } } @@ -217,12 +224,13 @@ template void MatrixBitCodeFunctor::Sub(framework::Tensor* tmat) { size_t num_samples = tmat->dims()[0]; size_t o_width = tmat->dims()[1]; + auto* tmat_data = tmat->data(); for (size_t i = 0; i < num_samples; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { if (code->calc_bit(j)) { - tmat->data()[i * o_width + j] -= 1; + tmat_data[i * o_width + j] -= 1; } } } diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 35ca73802b..ba1745b86d 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -140,13 +140,13 @@ template class CustomCode : public Code { public: CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, - const int64_t* ids, int index) - : ids_(ids), index_(index) { - ptable_ = ptable.Slice(index, index + 1); - pcode_ = pcode.Slice(index, index + 1); + const int64_t* ids, int index) { + seq_len_ = ptable.dims()[1]; + ptable_data_ = ptable.data() + seq_len_ * index; + pcode_data_ = pcode.data() + seq_len_ * index; } /** - * Here the id of root shoud be 1 rather than 0, thus the encoding of class c + * Here the id of root should be 1 rather than 0, thus the encoding of class c * is `c + num_classes` and all siblings can get the same weight indice using * prefixes. * Weight index is the prefixes of encoding, thus leave out the right most @@ -154,26 +154,26 @@ class CustomCode : public Code { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - size_t calc_index(int bit) const { return ptable_.data()[bit]; } - bool calc_bit(int bit) const { return pcode_.data()[bit]; } - int get_length() const { - int length = 0; + size_t calc_index(int bit) const override { return ptable_data_[bit]; } + bool calc_bit(int bit) const override { return pcode_data_[bit]; } - for (int i = 0; i < static_cast(ptable_.dims()[1]); i++) { - if (ptable_.data()[i] >= 0) { - length++; - } else { - return length; - } + // NOTE: this function is not thread-safe. + int get_length() const override { + if (length_ < 0) { + auto len = seq_len_; + length_ = + static_cast(std::find_if(ptable_data_, ptable_data_ + len, + [](const T& val) { return val < 0; }) - + ptable_data_); } - return length; + return length_; } private: - framework::Tensor ptable_; - framework::Tensor pcode_; - const int64_t* ids_; - const int index_; + int64_t seq_len_; + const T* ptable_data_; + const T* pcode_data_; + mutable int length_{-1}; }; class SimpleCodeTable : public CodeTable { From be113756610c2894ae2adfeab40c8dfe879620a9 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 12 Dec 2018 18:06:58 +0800 Subject: [PATCH 260/719] Refine code --- .../fluid/operators/math/matrix_bit_code.cc | 4 +-- paddle/fluid/operators/math/matrix_bit_code.h | 30 ++++++++++++++----- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index dbf4f5e332..92affa0e4e 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/matrix_bit_code.h" #include +#include namespace paddle { namespace operators { namespace math { @@ -133,8 +134,7 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, auto weight_value = weight->data(); auto input_value = input.data(); - std::unordered_map>> ops; - + std::map>> ops; for (size_t i = 0; i < num_samples; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index ba1745b86d..cf43ad9d44 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include #include @@ -109,7 +110,7 @@ class Code { // set a CodeTable interface to create multiple code table class CodeTable { public: - virtual std::unique_ptr get_code(int64_t code) const = 0; + virtual Code* get_code(int64_t code) const = 0; virtual size_t size() const = 0; virtual int get_max_code_length() const = 0; virtual ~CodeTable() {} @@ -180,14 +181,23 @@ class SimpleCodeTable : public CodeTable { public: SimpleCodeTable(size_t num_classes, const int64_t* ids) : num_classes_(num_classes), ids_(ids) {} - std::unique_ptr get_code(int64_t code) const { - std::unique_ptr coder(new SimpleCode(code, num_classes_, ids_)); - return coder; + + Code* get_code(int64_t code) const { + auto it = codes_.find(code); + if (it != codes_.end()) { + return it->second.get(); + } + auto* result = new SimpleCode(code, num_classes_, ids_); + codes_.emplace(code, std::unique_ptr(result)); + return result; } + size_t size() const { return num_classes_; } int get_max_code_length() const { return FindLastSet(num_classes_ - 1); } private: + mutable std::map> codes_; + size_t num_classes_; const int64_t* ids_; }; @@ -199,9 +209,14 @@ class CustomCodeTable : public CodeTable { const framework::Tensor& pcode, const int64_t* ids) : ptable_(ptable), pcode_(pcode), ids_(ids) {} - std::unique_ptr get_code(int64_t code) const { - std::unique_ptr coder(new CustomCode(ptable_, pcode_, ids_, code)); - return coder; + Code* get_code(int64_t code) const { + auto it = codes_.find(code); + if (it != codes_.end()) { + return it->second.get(); + } + auto* result = new CustomCode(ptable_, pcode_, ids_, code); + codes_.emplace(code, std::unique_ptr(result)); + return result; } size_t size() const { return static_cast(ptable_.dims()[1]); } @@ -210,6 +225,7 @@ class CustomCodeTable : public CodeTable { } private: + mutable std::unordered_map> codes_; const framework::Tensor& ptable_; const framework::Tensor& pcode_; const int64_t* ids_; From a61eb543f5796d9899bff073e5f6647bc1003d71 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 12 Dec 2018 19:18:45 +0800 Subject: [PATCH 261/719] Add RWLock to Scope --- paddle/fluid/framework/rw_lock.h | 16 ++++++++++++---- paddle/fluid/framework/scope.cc | 11 ++++------- paddle/fluid/framework/scope.h | 4 ++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index dbf00f3a79..dd918fcdfa 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -16,7 +16,9 @@ limitations under the License. */ #if !defined(_WIN32) #include -#endif // !_WIN32 +#else +#include // NOLINT +#endif // !_WIN32 #include "paddle/fluid/platform/enforce.h" @@ -51,9 +53,15 @@ struct RWLock { // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive // In windows, rw_lock seems like a hack. Use empty object and do nothing. struct RWLock { - void RDLock() {} - void WRLock() {} - void UNLock() {} + // FIXME(minqiyang): use mutex here to do fake lock + void RDLock() { mutex_.lock(); } + + void WRLock() { mutex_.lock(); } + + void UNLock() { mutex_.unlock(); } + + private: + std::mutex mutex_; }; #endif diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 61416676d6..190a057d9e 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -46,13 +46,10 @@ DEFINE_double( #define SCOPE_READER_LOCK #define SCOPE_WRITER_LOCK #else -// TODO(minqiyang): use reader lock and writer lock in all platforms -#define SCOPE_READER_LOCK -#define SCOPE_WRITER_LOCK -// #define SCOPE_READER_LOCK boost::shared_lock -// lock(mutex_); -// #define SCOPE_WRITER_LOCK boost::unique_lock -// lock(mutex_); +// TODO(minqiyang): use rwlock in all platforms, now rwlock is a fake one +// in _WIN32 platform +#define SCOPE_READER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kRDLock); +#define SCOPE_WRITER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kWRLock); #endif namespace paddle { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 1901ffbe57..c140212c3e 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -15,11 +15,11 @@ limitations under the License. */ #pragma once #include -#include // NOLINT #include #include #include +#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -123,7 +123,7 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); private: - mutable std::mutex mutex_; + mutable RWLock rw_lock_; }; // Generate some debug string about the inherience structure of scope, quite From 2c1e986f22c7535ffd420d9370f79cf93bd5bf25 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 12 Dec 2018 19:21:31 +0800 Subject: [PATCH 262/719] barrier_all to barrier_worker --- python/paddle/fluid/async_executor.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index b077e1be7e..af42d2912f 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -170,7 +170,8 @@ class AsyncExecutor(object): self.instance.get_worker_index(), self.instance.get_node_cnt() / 2, multi_processes=process_num) - self.instance.barrier_all() #wait for download_data #TODO only barriere worker + #self.instance.barrier_all() #wait for download_data #TODO only barriere worker + self.instance.barrier_worker() #wait for download_data #TODO only barriere worker def config_distributed_nodes(self): self.instance = ps_instance.PaddlePSInstance(1, 2) @@ -187,13 +188,13 @@ class AsyncExecutor(object): raise ValueError('instance is None, please run config_distributed_nodes init instance') return self.instance - def stop_server(self): + def stop(self): if self.instance is None: raise ValueError('instance is None, please run config_distributed_nodes init instance') - self.instance.barrier_all() #worker do all things + self.instance.barrier_worker() #worker do all things if self.instance.is_first_worker(): self.executor.stop_server() - self.instance.barrier_all() #sync + self.instance.barrier_worker() #sync def init_server(self, dist_desc): if self.instance is None: @@ -205,10 +206,6 @@ class AsyncExecutor(object): ips = self.instance.gather_ips() self.executor.gather_servers(ips, self.instance.get_node_cnt()) self.instance.barrier_all() #wait all worker start - self.instance.barrier_all() #wait init model - self.instance.barrier_all() #wait for download_data #TODO remove this after only barrier worker - self.instance.barrier_all() #wait worker do all things - self.instance.barrier_all() #sync def init_worker(self, dist_desc, startup_program): if self.instance is None: @@ -223,7 +220,7 @@ class AsyncExecutor(object): self.instance.barrier_all() #wait all worker start if self.instance.is_first_worker(): self.executor.init_model() - self.instance.barrier_all() #wait init model + self.instance.barrier_worker() #wait init model def init_model(self): if self.instance is None: From c2e851f7b284ad122d20b932ff2df165d56b7994 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Wed, 12 Dec 2018 11:42:16 +0000 Subject: [PATCH 263/719] test=develop, remove sparse bias and add prefetch and related tests --- .../distributed/parameter_prefetch.cc | 12 +- .../distributed/parameter_prefetch.h | 24 ++ .../operators/hierarchical_sigmoid_op.cc | 47 ++- .../fluid/operators/hierarchical_sigmoid_op.h | 83 ++++-- .../fluid/operators/math/matrix_bit_code.cc | 17 -- paddle/fluid/operators/math/matrix_bit_code.h | 27 +- python/paddle/fluid/layers/nn.py | 17 +- .../fluid/tests/unittests/test_hsigmoid_op.py | 6 +- .../test_hsigmoid_remote_table_op.py | 271 ++++++++++++++++++ 9 files changed, 418 insertions(+), 86 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index aebf6376d1..52085482f4 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -32,7 +32,7 @@ namespace paddle { namespace operators { namespace distributed { -using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; @@ -120,8 +120,8 @@ static void MergeMultipleVarsIntoOneBySection( PADDLE_ENFORCE_GT( out_tensor->numel(), 0, - "When calling this method, the Tensor's numel must larger than zero. " - "Please check Tensor::Resize has been called first."); + "When calling this method, the LoDTensor's numel must larger than zero. " + "Please check LoDTensor::Resize has been called first."); auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); @@ -144,7 +144,7 @@ static void MergeMultipleVarsIntoOneBySection( auto row_numel = dims[1]; - for (size_t i = 0; i < dims[0]; ++i) { + for (int64_t i = 0; i < dims[0]; ++i) { auto id = ids_in_this_section[i]; auto origin_id = id + abs_sections[section_idx]; auto& offsets = id_to_offset[origin_id]; @@ -201,7 +201,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, std::vector ids_vector; if (platform::is_cpu_place(id_tensor.place())) { auto* id_data = id_tensor.data(); - for (size_t i = 0; i < id_tensor.numel(); ++i) { + for (int64_t i = 0; i < id_tensor.numel(); ++i) { ids_vector.push_back(id_data[i]); } } else { @@ -209,7 +209,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, PADDLE_THROW("paddle is not compiled with CUDA!"); #else auto cpu_place = platform::CPUPlace(); - framework::Tensor cpu_tensor; + framework::LoDTensor cpu_tensor; auto* cpu_tensor_data = cpu_tensor.mutable_data(id_tensor.dims(), cpu_place); auto stream = diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 53482c4c40..882c6bd9b8 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -30,6 +30,30 @@ void prefetch(const std::string& id_name, const std::string& out_name, const framework::ExecutionContext& context, const framework::Scope& scope); +template +void prefetch_with_reconstruct(const std::string& id_name, + const std::string& out_name, + const std::vector& table_names, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope, + framework::LoDTensor* original) { + prefetch(id_name, out_name, table_names, epmap, height_sections, context, + scope); + auto& out = scope.FindVar(out_name)->Get(); + auto& ids = scope.FindVar(id_name)->Get(); + auto* original_value = original->data(); + auto* out_value = out.data(); + size_t original_width = original->numel() / original->dims()[0]; + + for (int64_t i = 0; i < ids.numel(); i++) { + const T* out_rows = out_value + original_width * i; + T* original_row = original_value + original_width * ids.data()[i]; + std::memcpy(original_row, out_rows, original_width * sizeof(T)); + } +} + }; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 0dbcc442df..b9059f6b05 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -67,6 +67,11 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); PADDLE_ENFORCE(ctx->HasOutput("PreOut"), "Output(PreOut) should not be null."); + auto with_prefetch = ctx->Attrs().Get("remote_prefetch"); + if (with_prefetch) { + PADDLE_ENFORCE(ctx->HasOutput("W_Out"), + "Output(W_Out) should not be null."); + } const int64_t batch_size = ctx->GetInputDim("X")[0]; std::vector output_shape({batch_size, 1}); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); @@ -96,7 +101,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Label", "(LoDTensor, required), The labels of training data. It's a" "tensor with shape [N, 1]."); - AddInput("PTable", + AddInput("PathTable", "(LoDTensor, optional), The Path Table from root to current word" "it should have shape like [N, L], L is the length of the Path") .AsDispensable(); @@ -120,8 +125,30 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { "[batch_size, code_length], where code_length represents the " "maximum path length from root to leaf nodes.") .AsIntermediate(); + AddOutput( + "W_Out", + "(LoDTensor, optinal) using input 'W' as Output to make it mutable" + "When we are using prefetch") + .AsIntermediate(); AddAttr("num_classes", "(int, optional), The number of classes") .SetDefault(2); + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); AddComment(R"DOC( The hierarchical sigmoid operator organize the classes into a binary tree. At each node, a sigmoid function is used to calculate the probability of @@ -191,23 +218,17 @@ class HierarchicalSigmoidGradOpGradVarTypeInference << " is set to SelectedRows"; block->Var(w_grad_var_name) ->SetType(framework::proto::VarType::SELECTED_ROWS); - if (hasBias) { - VLOG(30) << "hierarchical_sigmoid_grad op " - << framework::GradVarName("Bias") << " is set to SelectedRows"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::SELECTED_ROWS); - } } else { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; block->Var(w_grad_var_name) ->SetType(framework::proto::VarType::LOD_TENSOR); - if (hasBias) { - VLOG(30) << "hierarchical_sigmoid_grad op " - << framework::GradVarName("Bias") << " is set to LoDTensor"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::LOD_TENSOR); - } + } + if (hasBias) { + VLOG(30) << "hierarchical_sigmoid_grad op " + << framework::GradVarName("Bias") << " is set to LoDTensor"; + block->Var(bias_grad_var_name) + ->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType()); } diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index b73a32af89..d8e406a96b 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" @@ -24,6 +26,10 @@ limitations under the License. */ #include "paddle/fluid/operators/math/matrix_bit_code.h" #include "paddle/fluid/platform/transform.h" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -49,13 +55,55 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& in = detail::Ref(ctx.Input("X")); auto& w = detail::Ref(ctx.Input("W")); - auto* path = ctx.Input("PTable"); + auto* path = ctx.Input("PathTable"); auto* code = ctx.Input("PathCode"); auto& label = detail::Ref(ctx.Input("Label")); auto* bias = ctx.Input("Bias"); auto* out = ctx.Output("Out"); auto* pre_out = ctx.Output("PreOut"); size_t num_classes = static_cast(ctx.Attr("num_classes")); + // for remote prefetch + + auto epmap = ctx.Attr>("epmap"); + if (!epmap.empty()) { + // if epmap is not empty, then the parameter will be fetched from remote + // parameter + // server + auto height_sections = ctx.Attr>("height_sections"); + auto table_names = ctx.Attr>("table_names"); + VLOG(3) << "path type is " << path->type().name(); + std::vector real_rows = PathToRows(*path); + framework::Scope& local_scope = ctx.scope().NewScope(); + auto* ids = local_scope.Var("Ids@Prefetch"); + auto* x_tensor = ids->GetMutable(); + + x_tensor->mutable_data( + framework::make_ddim({static_cast(real_rows.size()), 1}), + ctx.GetPlace()); + // copy. + + std::memcpy(x_tensor->data(), real_rows.data(), + real_rows.size() * sizeof(int64_t)); + + framework::DDim w_dims = ctx.Input("W")->dims(); + w_dims[0] = x_tensor->dims()[0]; + auto* w_tensor = + local_scope.Var("W@Prefetch")->GetMutable(); + w_tensor->Resize(w_dims); + +#ifdef PADDLE_WITH_DISTRIBUTE + // w_Out is set to used by prefetch, never change it in other cases + auto* w_out = ctx.Output("W_Out"); + operators::distributed::prefetch_with_reconstruct( + "Ids@Prefetch", "W@Prefetch", table_names, epmap, height_sections, + ctx, local_scope, w_out); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + } + bool is_custom = false; if (path) { is_custom = true; @@ -116,9 +164,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& in = detail::Ref(ctx.Input("X")); auto& w = detail::Ref(ctx.Input("W")); - auto* path = ctx.Input("PTable"); + auto* path = ctx.Input("PathTable"); auto* code = ctx.Input("PathCode"); - auto* bias = ctx.Input("Bias"); auto* in_grad = ctx.Output(framework::GradVarName("X")); bool is_sparse = ctx.Attr("is_sparse"); @@ -165,15 +212,14 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { pre_out_grad_mat * out_grad_mat.broadcast(bcast); // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to // be consistent with the clipping in forward. - + auto* bias_grad = + ctx.Output(framework::GradVarName("Bias")); + if (bias_grad) { + bias_grad->mutable_data(ctx.GetPlace()); + zero(dev_ctx, bias_grad, static_cast(0.0)); + bit_code->AddGrad(pre_out_grad, bias_grad); + } if (!is_sparse) { - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, bias_grad, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } auto* w_grad = ctx.Output(framework::GradVarName("W")); w_grad->mutable_data(ctx.GetPlace()); @@ -192,21 +238,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); zero(dev_ctx, w_grad_value, static_cast(0.0)); - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->set_rows(real_rows); - // build ids -> rows index map - bias_grad->SyncIndex(); - bias_grad->set_height(bias->dims()[0]); - auto* bias_grad_value = bias_grad->mutable_value(); - std::vector dims = {static_cast(real_rows.size()), - bias->dims()[1]}; - bias_grad_value->mutable_data(framework::make_ddim(dims), - ctx.GetPlace()); - zero(dev_ctx, bias_grad_value, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } bit_code->MulGradWeight(pre_out_grad, w_grad, in); } bit_code->MulGradError(pre_out_grad, w, in_grad); diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 5a6e64b6f8..fed4639b01 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -48,23 +48,6 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, } } -template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, - framework::SelectedRows* vec) { - size_t batch_size = tmat.dims()[0]; - size_t width = tmat.dims()[1]; - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - int64_t row_index = vec->GetIndexFromId(static_cast(index)); - vec->mutable_value()->data()[row_index] += - tmat.data()[i * width + j]; - } - } -} - template void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum) { diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 35ca73802b..0bc09bdb35 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -139,11 +139,11 @@ class SimpleCode : public Code { template class CustomCode : public Code { public: - CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, - const int64_t* ids, int index) + CustomCode(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids, int index) : ids_(ids), index_(index) { - ptable_ = ptable.Slice(index, index + 1); - pcode_ = pcode.Slice(index, index + 1); + ptable_ = path_table.Slice(index, index + 1); + pcode_ = path_code.Slice(index, index + 1); } /** * Here the id of root shoud be 1 rather than 0, thus the encoding of class c @@ -195,9 +195,9 @@ class SimpleCodeTable : public CodeTable { template class CustomCodeTable : public CodeTable { public: - CustomCodeTable(const framework::Tensor& ptable, - const framework::Tensor& pcode, const int64_t* ids) - : ptable_(ptable), pcode_(pcode), ids_(ids) {} + CustomCodeTable(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids) + : ptable_(path_table), pcode_(path_code), ids_(ids) {} std::unique_ptr get_code(int64_t code) const { std::unique_ptr coder(new CustomCode(ptable_, pcode_, ids_, code)); @@ -223,11 +223,11 @@ class MatrixBitCodeFunctor { ids_(ids), code_table_(new SimpleCodeTable(num_classes, ids)) {} - MatrixBitCodeFunctor(const framework::Tensor& ptable, - const framework::Tensor& pcode, const int64_t* ids) - : num_classes_(static_cast(ptable.dims()[1])), + MatrixBitCodeFunctor(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids) + : num_classes_(static_cast(path_table.dims()[1])), ids_(ids), - code_table_(new CustomCodeTable(ptable, pcode, ids)) {} + code_table_(new CustomCodeTable(path_table, path_code, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ @@ -238,11 +238,6 @@ class MatrixBitCodeFunctor { */ void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); - /* For selected rows For j < code_length - vec(0, index(i, j)) += tmat(i, j) - */ - void AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec); - /* For j < code_length sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) */ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 37ddfdf7d5..38dad85717 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4931,6 +4931,9 @@ def hsigmoid(input, pass weights = None + remote_prefetch = False + if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'): + remote_prefetch = True if not is_custom: weights = helper.create_parameter( @@ -4947,7 +4950,7 @@ def hsigmoid(input, inputs = { "X": input, "W": weights, - "PTable": path_table, + "PathTable": path_table, "PathCode": path_code, "Label": label } @@ -4970,9 +4973,13 @@ def hsigmoid(input, type="hierarchical_sigmoid", inputs=inputs, outputs={"Out": out, - "PreOut": pre_out}, - attrs={"num_classes": num_classes, - "is_sparse": is_sparse}) + "PreOut": pre_out, + "W_Out": weights}, + attrs={ + "num_classes": num_classes, + "is_sparse": is_sparse, + "remote_prefetch": remote_prefetch + }) return out @@ -7440,7 +7447,7 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): Examples: - .. code-block:: python + .. code-block:: python x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0) diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 2a6c93f75f..8ed5074dc2 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -185,7 +185,7 @@ class TestHSigmoidOpSparse(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, 'Bias': bias @@ -287,7 +287,7 @@ class TestHSigmoidOpWithCostumTree(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, 'Bias': bias @@ -324,7 +324,7 @@ class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, } diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py new file mode 100644 index 0000000000..9ed6c94bd2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py @@ -0,0 +1,271 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import signal +import time +import unittest +from multiprocessing import Process + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import Program, program_guard + + +def run_pserver(pserver_id, use_cuda, sync_mode): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create table parameter in scope + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # create and initialize Param Variable + param = scope.var('table').get_tensor() + + param_array = np.ones((5, 8)).astype("float32") + for i in range(len(param_array)): + param_array[i] *= param_array[i] * i + pserver_id * 10 + 1 + param.set(param_array, place) + + optimize_block = program._create_block(program.global_block().idx) + program.global_block().append_op( + type="listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs={ + "optimize_blocks": [optimize_block], + "endpoint": '127.0.0.1:0', + "Fanin": 1, + "sync_mode": True, + "grad_to_block_id": [] + }) + + exe = fluid.Executor(place) + exe.run(program) + + +class TestListenAndServOp(unittest.TestCase): + def setUp(self): + self.ps_timeout = 5 + + def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func): + p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode)) + p.daemon = True + p.start() + return p + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def _get_pserver_port(self, pid): + with open("/tmp/paddle.%d.port" % pid, 'r') as f: + port = int(f.read().strip()) + return port + + def _run_hsigmoid_op_one_pserver(self, place, port): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port)] + table_names = ['table'] + height_sections = [2] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i != 3: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), 0).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def _run_hsigmoid_op_two_pserver(self, place, port0, port1): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] + table_names = ['table', 'table'] + height_sections = [2, 3] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i < 2: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), i + 9).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def test_hsigmoid_op_remote(self): + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + # run pserver on CPU in sync mode + p0 = self._start_pserver(0, False, True, run_pserver) + self._wait_ps_ready(p0.pid) + port0 = self._get_pserver_port(p0.pid) + + p1 = self._start_pserver(1, False, True, run_pserver) + self._wait_ps_ready(p1.pid) + port1 = self._get_pserver_port(p1.pid) + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + self._run_hsigmoid_op_one_pserver(place, port0) + self._run_hsigmoid_op_two_pserver(place, port0, port1) + + # raise SIGTERM to pserver + os.kill(p0.pid, signal.SIGINT) + p0.join() + os.kill(p1.pid, signal.SIGINT) + p1.join() + + +if __name__ == '__main__': + unittest.main() From 5d3ecbfdf503965cc66eda6f8c75849ae0546c1e Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 12 Dec 2018 19:48:30 +0800 Subject: [PATCH 264/719] fix hdfs bug --- python/paddle/fluid/contrib/utils/hdfs_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py index 251665d85e..ff1a2d3e4a 100644 --- a/python/paddle/fluid/contrib/utils/hdfs_utils.py +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -52,9 +52,10 @@ class HDFSClient(object): ret_code = 0 ret_out = None ret_err = None + whole_commands = " ".join(whole_commands) for x in range(retry_times + 1): proc = subprocess.Popen( - whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) (output, errors) = proc.communicate() ret_code, ret_out, ret_err = proc.returncode, output, errors if ret_code: From 106e28523641ef6bdffe301b2a63b6d0f13de29a Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 12 Dec 2018 19:57:49 +0800 Subject: [PATCH 265/719] add unittest for parllelgraph mode test=develop --- .../details/multi_devices_graph_pass.cc | 8 +- .../details/parallel_ssa_graph_executor.cc | 20 +-- paddle/fluid/framework/parallel_executor.cc | 2 +- paddle/fluid/operators/reader/ctr_reader.h | 2 +- .../unittests/parallel_executor_test_base.py | 164 +++++++++--------- .../unittests/test_parallel_executor_crf.py | 3 + .../unittests/test_parallel_executor_mnist.py | 38 ++-- .../test_parallel_executor_seresnext.py | 49 ++++-- .../test_parallel_executor_transformer.py | 6 +- 9 files changed, 164 insertions(+), 128 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index c16e3006d7..e264906b57 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -300,7 +300,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; - // int num_trainers = Get(kNumTrainers); + int num_trainers = Get(kNumTrainers); for (auto &node : nodes) { if (node->IsVar() && node->Var()) { @@ -387,7 +387,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } // if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { - if (!is_forwarding && nccl_ctxs_->contexts_.size() > 1) { + // insert synchronous ops at the backpropagation; and + // insert synchronous ops if the graph contains mutilple places. + if (!is_forwarding && + (places_.size() > 1 || num_trainers > 1 || + (nccl_ctxs_ && nccl_ctxs_->contexts_.size() > 1))) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. if (static_cast(boost::get(node->Op()->GetAttr( diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index f1a07edf08..214c2f7625 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -49,18 +49,18 @@ FeedFetchList ParallelSSAGraphExecutor::Run( for (size_t i = 0; i < places_.size(); ++i) { auto call = [this, i, &fetch_tensors]() -> FeedFetchList { - return executors_[i]->Run(fetch_tensors); + try { + return executors_[i]->Run(fetch_tensors); + } catch (...) { + exception_holder_.Catch(std::current_exception()); + } + return FeedFetchList(); }; if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); } else { - try { - fetch_datas.emplace_back(std::move(call())); - } catch (...) { - exception_holder_.Catch(std::current_exception()); - break; - } + call(); } } @@ -69,11 +69,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( if (exception_holder_.IsCaught()) { f.wait(); } else { - try { - fetch_datas.emplace_back(std::move(f.get())); - } catch (...) { - exception_holder_.Catch(std::current_exception()); - } + fetch_datas.emplace_back(std::move(f.get())); } } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b0cd1e8e90..8d35361eb6 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -87,7 +87,7 @@ ParallelExecutor::ParallelExecutor( "the number of places must be greater than 1."); PADDLE_ENFORCE(exec_strategy.type_ != ExecutionStrategy::kParallelGraph, "You should set build_strategy.reduce with 'AllReduce' for " - "ParallelGraph executor type"); + "the ParallelGraph executor type"); } // Step 1. Bcast the params to devs. diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 9b2a11bae1..517d669744 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -48,7 +48,7 @@ void MonitorThread(std::vector* thread_status, class CTRReader : public framework::FileReader { public: explicit CTRReader(const std::shared_ptr& queue, - int batch_size, int thread_num, + int batch_size, size_t thread_num, const std::vector& slots, const std::vector& file_list) : batch_size_(batch_size), slots_(slots), file_list_(file_list) { diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 86f861674c..73b8fb74fa 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -26,23 +26,26 @@ import sys __all__ = ['TestParallelExecutorBase'] +ExecutorType = fluid.ExecutionStrategy().ExecutorType + class TestParallelExecutorBase(unittest.TestCase): - def check_network_convergence(self, - method, - use_cuda=True, - memory_opt=True, - iter=50, - batch_size=None, - allow_op_delay=False, - feed_dict=None, - seed=None, - use_parallel_executor=True, - use_reduce=False, - fuse_elewise_add_act_ops=False, - optimizer=fluid.optimizer.Adam, - use_fast_executor=False, - enable_sequential_execution=False): + def check_network_convergence( + self, + method, + use_cuda=True, + memory_opt=True, + iter=50, + batch_size=None, + allow_op_delay=False, + feed_dict=None, + seed=None, + use_parallel_executor=True, + use_reduce=False, + fuse_elewise_add_act_ops=False, + optimizer=fluid.optimizer.Adam, + exec_type=fluid.ExecutionStrategy().ExecutorType.Default, + enable_sequential_execution=False): def run_executor(exe, feed, fetch_list, program=None): if isinstance(exe, fluid.ParallelExecutor): res = exe.run(fetch_list=fetch_list, feed=feed) @@ -58,68 +61,69 @@ class TestParallelExecutorBase(unittest.TestCase): startup = fluid.Program() startup.random_seed = 1 # Fix random seed main.random_seed = 1 - with fluid.program_guard(main, startup): - if seed is not None: - startup.random_seed = seed - main.random_seed = seed - - loss = method(use_feed=feed_dict is not None) - - optimizer().minimize(loss) - - if memory_opt: - fluid.memory_optimize(main) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - startup_exe = fluid.Executor(place) - startup_exe.run(startup) - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.allow_op_delay = allow_op_delay - if use_fast_executor: - exec_strategy.use_experimental_executor = True - - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ - if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce - build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops - build_strategy.enable_sequential_execution = enable_sequential_execution - if use_cuda and core.is_compiled_with_cuda(): - build_strategy.remove_unnecessary_lock = True - - if use_parallel_executor: - exe = fluid.ParallelExecutor( - use_cuda, - loss_name=loss.name, - exec_strategy=exec_strategy, - build_strategy=build_strategy) - else: - exe = fluid.Executor(place=place) - - if batch_size is not None: - batch_size *= fluid.core.get_cuda_device_count( - ) if use_cuda else int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - begin = time.time() - first_loss, = run_executor( - exe=exe, feed=feed_dict, fetch_list=[loss.name]) - - for i in range(iter): - run_executor(exe=exe, feed=feed_dict, fetch_list=[]) - - last_loss, = run_executor( - exe=exe, feed=feed_dict, fetch_list=[loss.name]) - end = time.time() - - if batch_size is not None: - print("%.4f Instance per second" % ( - (batch_size * iter + 2) / (end - begin))) - - avg_last_loss_val = np.array(last_loss).mean() - avg_first_loss_val = np.array(first_loss).mean() - if math.isnan(float(avg_last_loss_val)) or math.isnan( - float(avg_first_loss_val)): - sys.exit("got NaN loss, training failed.") - - print(first_loss, last_loss) - # self.assertGreater(first_loss[0], last_loss[0]) - return first_loss, last_loss + scope = fluid.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(main, startup): + if seed is not None: + startup.random_seed = seed + main.random_seed = seed + + loss = method(use_feed=feed_dict is not None) + + optimizer().minimize(loss) + + if memory_opt: + fluid.memory_optimize(main) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + startup_exe = fluid.Executor(place) + startup_exe.run(startup) + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.allow_op_delay = allow_op_delay + exec_strategy.executor_type = exec_type + + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.enable_sequential_execution = enable_sequential_execution + if use_cuda and core.is_compiled_with_cuda(): + build_strategy.remove_unnecessary_lock = True + + if use_parallel_executor: + exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + else: + exe = fluid.Executor(place=place) + + if batch_size is not None: + batch_size *= fluid.core.get_cuda_device_count( + ) if use_cuda else int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + begin = time.time() + first_loss, = run_executor( + exe=exe, feed=feed_dict, fetch_list=[loss.name]) + + for i in range(iter): + run_executor(exe=exe, feed=feed_dict, fetch_list=[]) + + last_loss, = run_executor( + exe=exe, feed=feed_dict, fetch_list=[loss.name]) + end = time.time() + + if batch_size is not None: + print("%.4f Instance per second" % ( + (batch_size * iter + 2) / (end - begin))) + + avg_last_loss_val = np.array(last_loss).mean() + avg_first_loss_val = np.array(first_loss).mean() + if math.isnan(float(avg_last_loss_val)) or math.isnan( + float(avg_first_loss_val)): + sys.exit("got NaN loss, training failed.") + + print(first_loss, last_loss) + # self.assertGreater(first_loss[0], last_loss[0]) + return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 84b0aad8ac..d75761153c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -181,6 +181,9 @@ class TestCRFModel(unittest.TestCase): if core.is_compiled_with_cuda(): self.check_network_convergence( is_sparse=True, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=True, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( is_sparse=True, build_strategy=build_strategy, use_cuda=False) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 3eecc46701..3dddff0d99 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -20,7 +20,7 @@ import numpy as np import paddle.fluid.core as core import os import paddle.fluid as fluid -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType def simple_fc_net(use_feed): @@ -99,7 +99,10 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEqual(loss[0], loss[1], delta=1e-4) # simple_fc - def check_simple_fc_convergence(self, use_cuda, use_reduce=False): + def check_simple_fc_convergence(self, + use_cuda, + use_reduce=False, + exec_type=ExecutorType.Default): if use_cuda and not core.is_compiled_with_cuda(): return @@ -110,19 +113,21 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_reduce=use_reduce) + use_reduce=use_reduce, + exec_type=exec_type) def test_simple_fc(self): # use_cuda - self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence(True, ExecutorType.Default) + self.check_simple_fc_convergence(True, ExecutorType.ParallelGraph) self.check_simple_fc_convergence(False) def test_simple_fc_with_new_strategy(self): - # use_cuda, use_reduce + # use_cuda, use_reducea self._compare_reduce_and_allreduce(simple_fc_net, True) self._compare_reduce_and_allreduce(simple_fc_net, False) - def check_simple_fc_parallel_accuracy(self, use_cuda): + def check_simple_fc_parallel_accuracy(self, use_cuda, exec_type): if use_cuda and not core.is_compiled_with_cuda(): return @@ -134,14 +139,16 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_parallel_executor=False) + use_parallel_executor=False, + exec_type=exec_type) parallel_first_loss, parallel_last_loss = self.check_network_convergence( method=simple_fc_net, seed=1, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_parallel_executor=True) + use_parallel_executor=True, + exec_type=exec_type) self.assertAlmostEquals( np.mean(parallel_first_loss), @@ -151,10 +158,12 @@ class TestMNIST(TestParallelExecutorBase): np.mean(parallel_last_loss), single_last_loss, delta=1e-6) def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(True) - self.check_simple_fc_parallel_accuracy(False) + self.check_simple_fc_parallel_accuracy(True, ExecutorType.Default) + self.check_simple_fc_parallel_accuracy(True, ExecutorType.ParallelGraph) + # FIXME(Yancey1989): ParallelGraph executor type support CPU mode + self.check_simple_fc_parallel_accuracy(False, ExecutorType.Default) - def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor): + def check_batchnorm_fc_convergence(self, use_cuda, exec_type): if use_cuda and not core.is_compiled_with_cuda(): return @@ -165,12 +174,13 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_fast_executor=use_fast_executor) + exec_type=exec_type) def test_batchnorm_fc(self): for use_cuda in (False, True): - for use_fast_executor in (False, True): - self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) + for exec_type in (ExecutorType.Default, ExecutorType.Experimental, + ExecutorType.ParallelGraph): + self.check_batchnorm_fc_convergence(use_cuda, exec_type) def test_batchnorm_fc_with_new_strategy(self): # FIXME(zcd): close this test temporally. diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index e7a56bb638..bada38894f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -19,7 +19,7 @@ import paddle.fluid.layers.ops as ops from paddle.fluid.initializer import init_on_cpu from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter import paddle.fluid.core as core -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType import unittest import math import os @@ -167,13 +167,17 @@ def cosine_decay(learning_rate, step_each_epoch, epochs=120): return decayed_lr -def optimizer(learning_rate=0.01): - optimizer = fluid.optimizer.Momentum( - learning_rate=cosine_decay( - learning_rate=learning_rate, step_each_epoch=2, epochs=1), - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - return optimizer +def optimizer(learning_rate=0.01, lr_scale=1.0): + def _opt(): + return fluid.optimizer.Momentum( + learning_rate=cosine_decay( + learning_rate=learning_rate / lr_scale, + step_each_epoch=2, + epochs=1), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + + return _opt class TestResnet(TestParallelExecutorBase): @@ -216,7 +220,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=False, - optimizer=optimizer) + optimizer=optimizer()) reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, @@ -225,7 +229,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=True, - optimizer=optimizer) + optimizer=optimizer()) for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) @@ -243,7 +247,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=False, - optimizer=optimizer, + optimizer=optimizer(), enable_sequential_execution=True) reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence( @@ -254,7 +258,7 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=True, - optimizer=optimizer, + optimizer=optimizer(), enable_sequential_execution=True) for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq): @@ -277,7 +281,9 @@ class TestResnet(TestParallelExecutorBase): use_cuda=True, use_reduce=False, iter=20, - delta2=1e-6): + delta2=1e-6, + exec_type=ExecutorType.Default, + lr_scale=1.0): if use_cuda and not core.is_compiled_with_cuda(): return @@ -295,8 +301,9 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=use_reduce, - optimizer=optimizer, - use_parallel_executor=False) + optimizer=optimizer(), + use_parallel_executor=False, + exec_type=exec_type) parallel_first_loss, parallel_last_loss = self.check_network_convergence( model, feed_dict={"image": img, @@ -305,7 +312,8 @@ class TestResnet(TestParallelExecutorBase): batch_size=batch_size, use_cuda=use_cuda, use_reduce=use_reduce, - optimizer=optimizer) + optimizer=optimizer(lr_scale=lr_scale), + exec_type=exec_type) self.assertAlmostEquals( np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6) @@ -313,7 +321,14 @@ class TestResnet(TestParallelExecutorBase): np.mean(parallel_last_loss), single_last_loss[0], delta=delta2) def test_seresnext_with_learning_rate_decay(self): - self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True) + if core.is_compiled_with_cuda(): + self._check_resnet_convergence( + model=SE_ResNeXt50Small, use_cuda=True) + self._check_resnet_convergence( + model=SE_ResNeXt50Small, + use_cuda=True, + exec_type=ExecutorType.ParallelGraph, + lr_scale=core.get_cuda_device_count()) self._check_resnet_convergence( model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 3827743908..b5ee72a24e 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -17,7 +17,7 @@ from __future__ import print_function import paddle.fluid as fluid import transformer_model import numpy as np -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType import unittest import paddle import paddle.fluid.core as core @@ -173,6 +173,10 @@ class TestTransformer(TestParallelExecutorBase): def test_main(self): if core.is_compiled_with_cuda(): self.check_network_convergence(transformer, use_cuda=True) + self.check_network_convergence( + transformer, + use_cuda=True, + exec_type=ExecutorType.ParallelGraph) self.check_network_convergence( transformer, use_cuda=True, enable_sequential_execution=True) self.check_network_convergence(transformer, use_cuda=False, iter=5) From 417d031f90162737ab40978773a325829b72c1a3 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Dec 2018 09:53:50 +0000 Subject: [PATCH 266/719] add refer vadd, vaddrelu, vsub and tests and benchmark --- paddle/fluid/operators/jit/README.md | 6 +- paddle/fluid/operators/jit/benchmark.cc | 66 +++++++++--------- paddle/fluid/operators/jit/helper.cc | 43 ++++++++++++ paddle/fluid/operators/jit/helper.h | 2 + paddle/fluid/operators/jit/kernel_base.h | 4 +- paddle/fluid/operators/jit/more/mkl/mkl.h | 2 +- paddle/fluid/operators/jit/refer/refer.cc | 12 +++- paddle/fluid/operators/jit/refer/refer.h | 47 +++++++++++-- paddle/fluid/operators/jit/test.cc | 67 +++++++++++++------ .../fluid/operators/math/jit_kernel_refer.h | 30 --------- 10 files changed, 186 insertions(+), 93 deletions(-) create mode 100644 paddle/fluid/operators/jit/helper.cc diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index 12158bf9d0..c2e32cc49b 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -41,6 +41,6 @@ PaddlePaddle/Paddle/paddle/fluid/ - 性能测试 # 如何添加新的算子 -TBD -## Use me -Add USE_JIT_KERNEL(yourname) to CMakefile. + +- 在`KernelType` 中添加 `your_key` +- 实现Reference 的逻辑,每个jitkernel的Reference 实现是必须的。不要依赖任何第三方库。并在`refer/CmakeLists.txt`中`USE_JITKERNEL_REFER(your_key)` diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 5cc82b69f8..27a1ba7ba3 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -52,9 +52,10 @@ std::vector TestSizes() { } // return this function avg time -template -double BenchTartgetFunc(const Func tgt, const std::vector& x, - const std::vector& y, std::vector& z) { // NOLINT +template +double BenchTartgetFunc(const typename KernelTuples::func_type tgt, + const std::vector& x, const std::vector& y, + std::vector& z) { // NOLINT const T* x_data = x.data(); const T* y_data = y.data(); const int d = z.size(); @@ -71,40 +72,25 @@ double BenchTartgetFunc(const Func tgt, const std::vector& x, return (end - start) / FLAGS_repeat; } -// Benchmark all jit kernels including jitcode, mkl and refer. -// To use this tool, run command: ./benchmark [options...] -// Options: -// --burning: the burning time before count -// --repeat: the repeat times -// --max_size: the max size would be tested -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - google::InitGoogleLogging(argv[0]); - using T = float; - using PlaceType = paddle::platform::CPUPlace; +template +void BenchXYZNKernel() { namespace jit = paddle::operators::jit; - const auto KT = jit::vmul; - LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat - << " times."; for (int d : TestSizes()) { - // for (kernels type) { // TODO(TJ): more jit::KernelType std::vector> infos; std::vector x(d), y(d), z(d); RandomVec(d, x.data()); RandomVec(d, y.data()); // refer - auto refer = jit::GetRefer>(); + auto refer = jit::GetRefer>(); if (refer) { - auto res = - BenchTartgetFunc::func_type>(refer, x, y, z); + auto res = BenchTartgetFunc>(refer, x, y, z); infos.push_back(std::make_pair("Refer", res)); } // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); + auto jitcode = jit::GetJitCode, PlaceType>(d); if (jitcode) { - auto res = - BenchTartgetFunc::func_type>(jitcode, x, y, z); + auto res = BenchTartgetFunc>(jitcode, x, y, z); infos.push_back(std::make_pair("JitCode", res)); } @@ -115,32 +101,50 @@ int main(int argc, char* argv[]) { if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = dynamic_cast>*>( + auto i = dynamic_cast>*>( impl.get()); if (i && i->UseMe(d)) { auto more = i->GetFunc(); - auto res = - BenchTartgetFunc::func_type>(more, x, y, z); + auto res = BenchTartgetFunc>(more, x, y, z); infos.push_back(std::make_pair("More", res)); } } } // Test result from Get function - auto tgt = jit::Get, PlaceType>(d); + auto tgt = jit::Get, PlaceType>(d); if (!tgt) { LOG(ERROR) << "Target can not be empty!"; } - auto res = BenchTartgetFunc::func_type>(tgt, x, y, z); + auto res = BenchTartgetFunc>(tgt, x, y, z); infos.push_back(std::make_pair("Target", res)); // print std::ostringstream loginfos; - loginfos << "Kernel Type: " << KT << ", size " << d << ": "; + loginfos << "Kernel Type: " << jit::to_string(KT) << ", size " << d << ": "; for (auto pair : infos) { loginfos << pair.first << " takes " << pair.second << " us; "; } LOG(INFO) << loginfos.str(); - // } } } + +// Benchmark all jit kernels including jitcode, mkl and refer. +// To use this tool, run command: ./benchmark [options...] +// Options: +// --burning: the burning time before count +// --repeat: the repeat times +// --max_size: the max size would be tested +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + google::InitGoogleLogging(argv[0]); + LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat + << " times."; + using T = float; + using PlaceType = paddle::platform::CPUPlace; + namespace jit = paddle::operators::jit; + BenchXYZNKernel(); + BenchXYZNKernel(); + BenchXYZNKernel(); + BenchXYZNKernel(); +} diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc new file mode 100644 index 0000000000..2260f0aed4 --- /dev/null +++ b/paddle/fluid/operators/jit/helper.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/helper.h" + +namespace paddle { +namespace operators { +namespace jit { + +const char* to_string(KernelType kt) { + switch (kt) { + case vmul: + return "vmul"; + case vadd: + return "vadd"; + case vaddrelu: + return "vaddrelu"; + case vsub: + return "vsub"; + case vscal: + return "vscal"; + case vexp: + return "vexp"; + default: + return "NOT JITKernel"; + } + return nullptr; +} + +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index d1bbe10381..124587b143 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -112,6 +112,8 @@ typename KernelTuples::func_type Get(typename KernelTuples::attr_type attr) { return GetRefer(); } +const char* to_string(KernelType kt); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 84f0308898..b2e9d63977 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -19,10 +19,10 @@ namespace paddle { namespace operators { namespace jit { -typedef enum { vmul = 0, vadd = 1, vsub, vexp } KernelType; +typedef enum { vmul = 0, vadd = 1, vaddrelu, vsub, vscal, vexp } KernelType; template -struct VMulTuples { +struct XYZNTuples { typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, const T*, T*, int); diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 56469b054d..4173d1f3de 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -28,7 +28,7 @@ template void VMul(const T* x, const T* y, T* z, int n); template -class VMulKernel : public KernelImpl> { +class VMulKernel : public KernelImpl> { public: VMulKernel() { this->func = VMul; } bool UseMe(int d) const override { diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index a987b5fca0..69d039422f 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -17,5 +17,13 @@ namespace refer = paddle::operators::jit::refer; -REGISTER_JITKERNEL_REFER(vmul, refer::VMulKernel, - refer::VMulKernel); +#define REGISTER_REFER_KERNEL(key, func) \ + REGISTER_JITKERNEL_REFER(key, refer::func##Kernel, \ + refer::func##Kernel) + +REGISTER_REFER_KERNEL(vmul, VMul); +REGISTER_REFER_KERNEL(vadd, VAdd); +REGISTER_REFER_KERNEL(vaddrelu, VAddRelu); +REGISTER_REFER_KERNEL(vsub, VSub); + +#undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 99d1cbd43e..4d4d308cbd 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -13,6 +13,7 @@ * limitations under the License. */ #pragma once +#include "paddle/fluid/operators/jit/helper.h" #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/enforce.h" @@ -21,6 +22,7 @@ namespace operators { namespace jit { namespace refer { +// Refer code only focus on correctness template void VMul(const T* x, const T* y, T* z, int n) { for (int i = 0; i < n; ++i) { @@ -29,10 +31,47 @@ void VMul(const T* x, const T* y, T* z, int n) { } template -class VMulKernel : public ReferKernel> { - public: - VMulKernel() { this->func = VMul; } -}; +void VAdd(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + +template +void VAddRelu(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + z[i] = z[i] > 0 ? z[i] : 0; + } +} + +template +void VSub(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] - y[i]; + } +} + +template +void VScal(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] * x[i]; + } +} + +#define DECLARE_REFER_KERNEL(name, tuples) \ + template \ + class name##Kernel : public ReferKernel> { \ + public: \ + name##Kernel() { this->func = name; } \ + } + +DECLARE_REFER_KERNEL(VMul, XYZNTuples); +DECLARE_REFER_KERNEL(VAdd, XYZNTuples); +DECLARE_REFER_KERNEL(VAddRelu, XYZNTuples); +DECLARE_REFER_KERNEL(VSub, XYZNTuples); + +#undef DECLARE_REFER_KERNEL } // namespace refer } // namespace jit diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 4d7970414f..dba7e754ea 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -48,18 +48,20 @@ void ExpectEQ(const T* target, const T* refer, int n) { std::vector TestSizes() { std::vector s; - for (int i = 1; i < 30; ++i) { + for (int i = 1; i < 10; ++i) { s.push_back(i); } - // test some large size - s.push_back(100); - s.push_back(1000); + // // test some large size + // s.push_back(100); + // s.push_back(1000); + // s.push_back(2000); return s; } -template -void TestTartgetFunc(const Func tgt, const std::vector& x, - const std::vector& y, const std::vector& zref) { +template +void TestTartgetFunc(const typename KernelTuples::func_type tgt, + const std::vector& x, const std::vector& y, + const std::vector& zref) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(zref.size(), x.size()); EXPECT_EQ(zref.size(), y.size()); @@ -83,13 +85,13 @@ void TestTartgetFunc(const Func tgt, const std::vector& x, ExpectEQ(ztgt_data, zref_data, d); } -TEST(JitKernel, vmul) { - using T = float; - using PlaceType = paddle::platform::CPUPlace; +template +void TestXYZNKernel() { namespace jit = paddle::operators::jit; - const auto KT = jit::vmul; for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT) + << ", size: " << d; + auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); std::vector x(d), y(d), zref(d); @@ -114,10 +116,10 @@ TEST(JitKernel, vmul) { ExpectEQ(yinp_data, zref_data, d); // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); + auto jitcode = jit::GetJitCode, PlaceType>(d); if (jitcode) { - VLOG(10) << "Test jitcode, size: " << d; - TestTartgetFunc::func_type>(jitcode, x, y, zref); + VLOG(10) << "Test Jitcode Kernel, size: " << d; + TestTartgetFunc>(jitcode, x, y, zref); } // test all impls in more @@ -127,20 +129,45 @@ TEST(JitKernel, vmul) { if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = dynamic_cast>*>( + auto i = dynamic_cast>*>( impl.get()); if (i && i->UseMe(d)) { auto more = i->GetFunc(); VLOG(10) << "Test More Kernel, size: " << d; - TestTartgetFunc::func_type>(more, x, y, zref); + TestTartgetFunc>(more, x, y, zref); } } } // Test result from Get function VLOG(10) << "Test Get function, size: " << d; - auto tgt = jit::Get, PlaceType>(d); - TestTartgetFunc::func_type>(tgt, x, y, zref); + auto tgt = jit::Get, PlaceType>(d); + TestTartgetFunc>(tgt, x, y, zref); } } -TEST(JitKernel, pool) {} +TEST(JITKernel, vmul) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + // TODO(TJ): fix double issue + // TestXYZNKernel(); +} + +TEST(JITKernel, vadd) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +TEST(JITKernel, vaddrelu) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +TEST(JITKernel, vsub) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +TEST(JITKernel, pool) {} diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index e0b2e3c7fa..eaca02ba14 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -23,36 +23,6 @@ namespace operators { namespace math { namespace jitkernel { namespace refer { -/* Refer code only focus on correctness */ - -template -void VMul(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - -template -void VAdd(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } -} - -template -void VAddRelu(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - z[i] = z[i] > 0 ? z[i] : 0; - } -} - -template -void VScal(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] * x[i]; - } -} template void VAddBias(const T* a, const T* x, T* y, int n) { From f95ee9c09fa8459516df47bf3e72f54ab19afa66 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Wed, 12 Dec 2018 20:32:15 +0800 Subject: [PATCH 267/719] fix nccl dist test acc (#14867) * fix nccl dist test acc test=develop * fix test=develop --- .../fluid/tests/unittests/dist_mnist.py | 2 +- .../fluid/tests/unittests/test_dist_base.py | 21 ++++++++++++------- .../fluid/tests/unittests/test_dist_mnist.py | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 1cda2711f7..1c45a10a9d 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -93,7 +93,7 @@ class TestDistMnist2x2(TestDistRunnerBase): # TODO(typhoonzero): fix distributed adam optimizer # opt = fluid.optimizer.AdamOptimizer( # learning_rate=0.001, beta1=0.9, beta2=0.999) - opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9) + opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9) # Reader train_reader = paddle.batch( diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 26fa20291b..cedb3383ed 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -32,7 +32,7 @@ DEFAULT_BATCH_SIZE = 2 class TestDistRunnerBase(object): - def get_model(self, batch_size=DEFAULT_BATCH_SIZE): + def get_model(self, batch_size=DEFAULT_BATCH_SIZE, lr=0.1): raise NotImplementedError( "get_model should be implemented by child classes.") @@ -56,6 +56,7 @@ class TestDistRunnerBase(object): return t def run_pserver(self, args): + self.lr = args.lr self.get_model(batch_size=args.batch_size) # NOTE: pserver should not call memory optimize t = self.get_transpiler(args.trainer_id, @@ -71,6 +72,7 @@ class TestDistRunnerBase(object): exe.run(pserver_prog) def run_trainer(self, args): + self.lr = args.lr test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=args.batch_size) @@ -189,6 +191,7 @@ def runtime_main(test_class): parser.add_argument( '--use_reader_alloc', action='store_true', required=False) parser.add_argument('--batch_size', required=False, type=int, default=2) + parser.add_argument('--lr', required=False, type=float, default=0.001) parser.add_argument( '--batch_merge_repeat', required=False, type=int, default=1) @@ -234,6 +237,7 @@ class TestDistBase(unittest.TestCase): self._dc_asgd = False # must use with async mode self._use_reader_alloc = True self._nccl2_mode = False + self._lr = 0.001 self._setup_config() self._after_setup_config() @@ -284,7 +288,8 @@ class TestDistBase(unittest.TestCase): batch_size=DEFAULT_BATCH_SIZE, batch_merge_repeat=1): - cmd = "%s %s --role trainer" % (self._python_interp, model) + cmd = "%s %s --role trainer --lr %f" % (self._python_interp, model, + self._lr) if batch_size != DEFAULT_BATCH_SIZE: cmd += " --batch_size %d" % batch_size if batch_merge_repeat > 1: @@ -330,13 +335,13 @@ class TestDistBase(unittest.TestCase): ps0_ep, ps1_ep = self._ps_endpoints.split(",") - tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver" + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver --lr %f" tr0_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 0, ps0_ep, self._trainers) + 0, ps0_ep, self._trainers, self._lr) tr1_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 1, ps1_ep, self._trainers) + 1, ps1_ep, self._trainers, self._lr) if self._sync_mode: tr0_cmd += " --sync_mode" @@ -425,13 +430,13 @@ class TestDistBase(unittest.TestCase): worker_endpoints = self._ps_endpoints.split(",") w0_ep, w1_ep = worker_endpoints - tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2" + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f" tr0_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 0, w0_ep) + 0, w0_ep, self._lr / 2) tr1_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 1, w1_ep) + 1, w1_ep, self._lr / 2) if self._mem_opt: tr0_cmd += " --mem_opt" diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 630bed198f..49a2ca40e3 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -36,7 +36,7 @@ class TestDistMnistNCCL2(TestDistBase): def test_dist_train(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1) + self.check_with_place("dist_mnist.py", delta=1e-5) class TestDistMnist2x2Lars(TestDistBase): From a37038880e03bc0a985b547bae78f1d0a5d7b048 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Dec 2018 12:34:08 +0000 Subject: [PATCH 268/719] fix unit test with double type --- paddle/fluid/operators/jit/helper.h | 44 ++++++++++++++++++----------- paddle/fluid/operators/jit/test.cc | 13 ++++----- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 124587b143..053e5ed079 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -33,8 +33,11 @@ namespace jit { #define EXP_MAX_INPUT 40.0 template -inline typename KernelTuples::func_type GetJitCode( - typename KernelTuples::attr_type attr) { +inline typename std::enable_if< + std::is_same::value && + std::is_same::value, + typename KernelTuples::func_type>::type +GetJitCode(typename KernelTuples::attr_type attr) { using Func = typename KernelTuples::func_type; using Attr = typename KernelTuples::attr_type; size_t key = JitCodeKey(attr); @@ -45,21 +48,19 @@ inline typename KernelTuples::func_type GetJitCode( // creator is not related with attr, so can use KernelKey as key KernelKey kkey(KT, PlaceType()); - if (std::is_same::value) { - // pool: (KernelKey(type, place), vector) - auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); - auto iter = creator_map.find(kkey); - if (iter != creator_map.end()) { - auto& creators = iter->second; - for (auto& cur : creators) { - auto i = dynamic_cast*>(cur.get()); - if (i && i->UseMe(attr)) { - auto p = i->CreateJitCode(attr); - if (p) { - auto f = p->template getCode(); - codes.Insert(key, std::move(p)); - return f; - } + // pool: (KernelKey(type, place), vector) + auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); + auto iter = creator_map.find(kkey); + if (iter != creator_map.end()) { + auto& creators = iter->second; + for (auto& cur : creators) { + auto i = dynamic_cast*>(cur.get()); + if (i && i->UseMe(attr)) { + auto p = i->CreateJitCode(attr); + if (p) { + auto f = p->template getCode(); + codes.Insert(key, std::move(p)); + return f; } } } @@ -67,6 +68,15 @@ inline typename KernelTuples::func_type GetJitCode( return nullptr; } +template +inline typename std::enable_if< + !std::is_same::value || + !std::is_same::value, + typename KernelTuples::func_type>::type +GetJitCode(typename KernelTuples::attr_type attr) { + return nullptr; +} + // Refer code do not related with attr, which is just for cast // Refer is always on CPUPlace template diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index dba7e754ea..9ceca24079 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -48,13 +48,13 @@ void ExpectEQ(const T* target, const T* refer, int n) { std::vector TestSizes() { std::vector s; - for (int i = 1; i < 10; ++i) { + for (int i = 1; i < 32; ++i) { s.push_back(i); } - // // test some large size - // s.push_back(100); - // s.push_back(1000); - // s.push_back(2000); + // test some large size + s.push_back(100); + s.push_back(1000); + s.push_back(2000); return s; } @@ -148,8 +148,7 @@ void TestXYZNKernel() { TEST(JITKernel, vmul) { namespace jit = paddle::operators::jit; TestXYZNKernel(); - // TODO(TJ): fix double issue - // TestXYZNKernel(); + TestXYZNKernel(); } TEST(JITKernel, vadd) { From 8b9d33fa1e7c3d592d9f3976634eaf87155f1a49 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 12:32:26 +0000 Subject: [PATCH 269/719] add unittest and fix bug add API.spec test=develop --- paddle/fluid/API.spec | 1 + paddle/fluid/operators/py_func_op.cc | 123 ++++++--- python/paddle/fluid/layers/nn.py | 242 +++++++++++------- .../fluid/tests/unittests/test_py_func_op.py | 145 +++++++++++ 4 files changed, 374 insertions(+), 137 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_py_func_op.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 2722ea078e..b3f7593be3 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -197,6 +197,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) +paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 46a6125f97..32c44c3bc2 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -26,26 +26,35 @@ namespace py = pybind11; static std::vector g_py_callables; +const char kForwardPythonCallableId[] = "forward_callable_id"; +const char kBackwardPythonCallableId[] = "backward_callable_id"; +const char kPyFuncBackwardSkipVars[] = "backward_skip_vars"; + size_t AppendPythonCallableObjectAndReturnId(py::object py_obj) { g_py_callables.emplace_back(py_obj); return g_py_callables.size() - 1; } static py::object *GetPythonCallableObject(size_t i) { - PADDLE_ENFORCE_LT(i, g_py_callables.size()); + PADDLE_ENFORCE_LT(i, g_py_callables.size(), "Invalid python callable id"); return &g_py_callables[i]; } -void CallPythonFunc(py::object *callable, const std::string &func_token, +std::string PythonObjectToString(const py::object &py_callable) { + py::gil_scoped_acquire guard; + return py::str(*py_callable); +} + +void CallPythonFunc(py::object *callable, const std::vector &ins, std::vector *out) { - py::gil_scoped_acquire guard{}; + py::gil_scoped_acquire guard; py::tuple in_args(ins.size()); for (size_t i = 0; i < ins.size(); ++i) { in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr); } - auto ret = (*callable)(func_token, *in_args); + auto ret = (*callable)(*in_args); auto ret_tuple = py::cast(ret); PADDLE_ENFORCE_EQ(py::len(ret_tuple), out->size(), "Output number not match"); for (size_t i = 0; i < out->size(); ++i) { @@ -55,7 +64,7 @@ void CallPythonFunc(py::object *callable, const std::string &func_token, try { auto *out_tensor = py::cast(ret_tuple[i]); PADDLE_ENFORCE_NOT_NULL(out_tensor, - "Output tensor should not be nullptr"); + "Output tensor %d should not be nullptr", i); (*out)[i]->set_lod(out_tensor->lod()); (*out)[i]->ShareDataWith(*out_tensor); } catch (py::cast_error &) { @@ -69,26 +78,23 @@ class PyFuncOpShapeInference : public framework::InferShapeBase { void operator()(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(!ctx->IsRuntime(), "Infer shape cannot be called in runtime."); - PADDLE_ENFORCE(ctx->HasInputs("X"), "Input(X) must exist"); - PADDLE_ENFORCE(ctx->HasOutputs("Out"), "Output(Out) must exist"); + PADDLE_ENFORCE(ctx->HasInputs("X") || ctx->HasOutputs("Out"), + "Input(X) or Output(Out) must exist"); + PADDLE_ENFORCE_GE(ctx->Attrs().Get(kForwardPythonCallableId), 0, + "Function id cannot be less than 0"); auto *op = boost::get(ctx->GetOp()); auto *block = op->Block(); - // No need to infer shape in forward part - if (block->ForwardBlockID() < 0) { - return; - } - - PADDLE_ENFORCE(!ctx->Attrs().Get("token").empty(), - "Function token cannot be empty"); - const std::string kGradVarSuffix = framework::kGradVarSuffix; auto out_vars = ctx->GetOutputVarPtrs("Out"); for (auto &out_var : out_vars) { auto *out_var_desc = boost::get(out_var); + if (out_var_desc == nullptr) { + continue; + } auto out_name = out_var_desc->Name(); if (out_name == framework::kEmptyVarName || - out_name.size() < kGradVarSuffix.size()) { + out_name.size() <= kGradVarSuffix.size()) { continue; } @@ -98,6 +104,8 @@ class PyFuncOpShapeInference : public framework::InferShapeBase { auto *in_var_desc = block->FindVarRecursive(fwd_var_name); PADDLE_ENFORCE_NOT_NULL(in_var_desc, "Forward variable %s not found", fwd_var_name); + VLOG(10) << "Infer shape of Out(" << out_name << ") as Input(" + << in_var_desc->Name() << ")"; out_var_desc->SetShape(in_var_desc->GetShape()); out_var_desc->SetDataType(in_var_desc->GetDataType()); out_var_desc->SetLoDLevel(in_var_desc->GetLoDLevel()); @@ -112,13 +120,15 @@ class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "Inputs of py_func op.").AsDuplicable(); AddOutput("Out", "Outputs of py_func op").AsDuplicable(); - AddAttr("handle_idx", "Index of the registered py_func handle") + AddAttr(kForwardPythonCallableId, + "Index of registered forward Python function.") .SetDefault(0); - AddAttr("token", "Token of function token to be called") - .SetDefault(""); - AddAttr("backward_token", - "Token of backward function to be called") - .SetDefault(""); + AddAttr(kBackwardPythonCallableId, + "Index of registered backward Python function") + .SetDefault(-1); + AddAttr>(kPyFuncBackwardSkipVars, + "Unused forward in/out in backward op") + .SetDefault(std::vector()); AddComment(R"DOC("PyFunc Op")DOC"); } }; @@ -129,7 +139,8 @@ class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase { std::vector> operator()() const override { auto &fwd_attrs = Attrs(); - if (fwd_attrs.at("backward_token").empty()) { + // no backward op when backward_id is less than 0 + if (boost::get(fwd_attrs.at(kBackwardPythonCallableId)) < 0) { return {}; } @@ -137,36 +148,65 @@ class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase { grad_op->SetType("py_func"); framework::AttributeMap bwd_attrs; - bwd_attrs["token"] = fwd_attrs.at("backward_token"); - bwd_attrs["backward_token"] = std::string(""); + bwd_attrs[kForwardPythonCallableId] = + fwd_attrs.at(kBackwardPythonCallableId); + bwd_attrs[kBackwardPythonCallableId] = -1; grad_op->SetAttrMap(bwd_attrs); - auto bwd_in = Input("X"); - auto fwd_out = Output("Out"); - auto fwd_out_grad = OutputGrad("Out"); - bwd_in.insert(bwd_in.end(), fwd_out.begin(), fwd_out.end()); - bwd_in.insert(bwd_in.end(), fwd_out_grad.begin(), fwd_out_grad.end()); + // All forward inputs + auto fwd_ins = Input("X"); + // All forward outputs + auto fwd_outs = Output("Out"); + + // For memory reused, some inputs/output in forward part may be not needed + // in backward part + // Just skip these vars + auto &backward_skip_var_list = boost::get>( + fwd_attrs.at(kPyFuncBackwardSkipVars)); + std::unordered_set backward_skip_var_set( + backward_skip_var_list.begin(), backward_skip_var_list.end()); + std::vector bwd_ins; + bwd_ins.reserve(fwd_ins.size() + fwd_outs.size()); + for (auto &fwd_in : fwd_ins) { + if (backward_skip_var_set.count(fwd_in) == 0) { + bwd_ins.emplace_back(fwd_in); + } + } + + for (auto &fwd_out : fwd_outs) { + if (backward_skip_var_set.count(fwd_out) == 0) { + bwd_ins.emplace_back(fwd_out); + } + } + + // Backward OG cannot be skipped + // But in Python side, if OG is kEmptyVarName, input tensor would be None + auto fwd_out_grads = OutputGrad("Out"); + bwd_ins.reserve(bwd_ins.size() + fwd_out_grads.size()); + bwd_ins.insert(bwd_ins.end(), fwd_out_grads.begin(), fwd_out_grads.end()); - auto bwd_out = InputGrad("X", false); + // Backward IG cannot be skipped + // But in Python side, if IG is not needed, users can just return None + auto bwd_outs = InputGrad("X", false); if (VLOG_IS_ON(10)) { std::string in_str = "PyFunc Grad Input: "; - for (auto &in : bwd_in) { + for (auto &in : bwd_ins) { in_str += in; in_str += " "; } VLOG(10) << in_str; std::string out_str = "PyFunc Grad Output: "; - for (auto &out : bwd_out) { + for (auto &out : bwd_outs) { out_str += out; - out += " "; + out_str += " "; } VLOG(10) << out_str; } - grad_op->SetInput("X", bwd_in); - grad_op->SetOutput("Out", InputGrad("X", false)); + grad_op->SetInput("X", bwd_ins); + grad_op->SetOutput("Out", bwd_outs); std::vector> ret(1); ret[0] = std::move(grad_op); @@ -210,12 +250,11 @@ class PyFuncOp : public framework::OperatorBase { outputs[i] = out_tensor; } - auto &token = Attr("token"); - auto handle_idx = static_cast(Attr("handle_idx")); - auto *py_callable = GetPythonCallableObject(handle_idx); - VLOG(10) << "Call py_func_op with token " << token << ", and handle_idx " - << handle_idx; - CallPythonFunc(py_callable, token, inputs, &outputs); + auto callable_id = static_cast(Attr(kForwardPythonCallableId)); + auto *py_callable = GetPythonCallableObject(callable_id); + VLOG(10) << "Call py_func_op with id " << callable_id << ": " + << PythonObjectToString(*py_callable); + CallPythonFunc(py_callable, inputs, &outputs); } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 66c98c935d..95f046c614 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9087,104 +9087,140 @@ def get_tensor_from_selected_rows(x, name=None): return out -@templatedoc() -def py_func(func, x, out, backward_func=None): - """ - """ - - class PyFuncRegister(object): - _main_program_to_register = dict() - - @classmethod - def get_instance(cls, prog): - if not isinstance(prog, Program): - raise TypeError("prog must be type of Program") - - ret = cls._main_program_to_register.get(prog, None) - if ret is None: - ret = PyFuncRegister() - ret._idx = core.append_python_callable_object_and_return_id(ret) - ret._token_func_dict = dict() - ret._func_token_dict = dict() - cls._main_program_to_register[prog] = ret - - return ret - - @property - def handle_idx(self): - return self._idx - - def unique_token(self, func): - return self._register_func(func) - - def _register_func(self, func): - if func is None: - raise ValueError("func cannot be None") - - token = self._func_token_dict.get(func, None) - if token is not None: - return token - - token = unique_name.generate('py_func_op_token') - self._token_func_dict[token] = func - self._func_token_dict[func] = token - return token - - def __call__(self, token, *args): - func = self._token_func_dict.get(token, None) - if func is None: - raise ValueError("func has not been registered") - - arg_list = inspect.getargspec(func) - kwargs = dict() - idx = 0 - for arg in arg_list[0]: - kwargs[arg] = args[idx] - idx += 1 - - args = args[idx:] - ret0 = func(*args, **kwargs) - if ret0 is None: - return None - - if not isinstance(ret0, (list, tuple)): - ret0 = (ret0, ) - - ret = [] - for i in six.moves.range(len(ret0)): - if ret0[i] is None: - ret.append(None) - continue - - if isinstance(ret0[i], core.LoDTensor): - ret.append(ret0[i]) - continue +class PyFuncWrapper(object): + _register_funcs = [] + + def __init__(self, func): + if func is None or not hasattr(func, '__call__'): + raise TypeError('func must be a Python function') + + self._func = func + # find named args using reflection + self._named_args = inspect.getargspec(self._func)[0] + self._id = core.append_python_callable_object_and_return_id(self) + ''' + Why record self here? + + 1. For debug usage. Users can call + :code:`py_func.registered_func(idx)` method + to find the registered function coresponding + to :code:`idx`. + + 2. For increasing reference count of self. + It seems that to release Python object + whose reference count is 1 would cause + segmentation fault error in C++ side. + May be lack of Python GC in C++ side? + ''' + PyFuncWrapper._register_funcs.append(self) + + @classmethod + def registered_func(cls, idx): + return cls._register_funcs[idx]._func + + @classmethod + def registered_func_num(cls): + return len(cls._register_funcs) + + @property + def id(self): + return self._id + + def __call__(self, *args): + kwargs = dict() + idx = 0 + for arg in self._named_args: + kwargs[arg] = args[idx] + idx += 1 + + ret0 = self._func(*args[idx:], **kwargs) + if ret0 is None: + return None + + if not isinstance(ret0, (list, tuple)): + ret0 = (ret0, ) + + ret = [] + for i in six.moves.range(len(ret0)): + if ret0[i] is None: + ret.append(None) + continue + + if isinstance(ret0[i], core.LoDTensor): + ret.append(ret0[i]) + continue + + if isinstance(ret0[i], np.ndarray): + r = ret0[i] + else: + r = np.array(ret0[i]) - if isinstance(ret0[i], np.ndarray): - r = ret0[i] - else: - r = np.array(ret0[i]) + t = core.LoDTensor() + t.set(r, core.CPUPlace()) + ret.append(t) - t = core.LoDTensor() - t.set(r, core.CPUPlace()) - ret.append(t) + return tuple(ret) - return tuple(ret) +@templatedoc() +def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): + """ + PyFunc Operator. + + User can use :code:`py_func` to register operators in Python side. + The inputs of :code:`func` is :code:`LoDTensor` and outputs can be + numpy array or :code:`LoDTensor`. Paddle would call the registered + :code:`func` in forward part, and call :code:`backward_func` in + backward part (if :code:`backward_func` is not None). + + User should set the right data type and shape of :code:`out` before + calling this function. However, data types and shapes of gradients of + :code:`out` and :code:`x` would be infered automatically. + + The orders of inputs of :code:`backward_func` would be: forward input + :code:`x`, forward output :code:`out` and backward input gradient of + :code:`out`. If some variables of :code:`out` have no gradient, the input + tensor would be None in Python side. If some variables of :code:`in` have + no gradient, users should return None. + + Args: + func (callable): forward Python function. + x (Variable|list(Variable)|tuple(Variable)): inputs of :code:`func`. + out (Variable|list(Variable)|tuple(Variable)): outputs of :code:`func`. + Paddle cannot infer shapes and data types of :code:`out`. Users + should create :code:`out` beforehand. + backward_func (callable|None): backward Python function. + None means no backward. Default None. + skip_vars_in_backward_input (Variable|list(Variable)|tuple(Variable)): + Variables that are not needed in :code:`backward_func` inputs. + These variables must be any of :code:`x` and :code:`out`. + If set, these vars would not be inputs of :code:`backward_func`, + Only useful when :code:`backward_func` is not None. Default None. + + Returns: + out (Variable|list(Variable)|tuple(Variable)): input :code:`out` + """ helper = LayerHelper('py_func', **locals()) - if isinstance(x, Variable): + if x is None: + x = [] + elif isinstance(x, Variable): x = [x] + elif not isinstance(x, (list, tuple)): + raise TypeError('Input must be Variable/list(Variable)/tuple(Variable)') - if isinstance(out, Variable): + if out is None: + out_list = [] + elif isinstance(out, Variable): out_list = [out] - else: + elif isinstance(out, (list, tuple)): out_list = out + else: + raise TypeError( + 'Output must be Variable/list(Variable)/tuple(Variable)') - if func is None or not hasattr(func, '__call__'): - raise TypeError('Input func must be a function') - - if backward_func is not None and not hasattr(backward_func, '__call__'): - raise TypeError('Input backward_func must be a function') + fwd_func_id = PyFuncWrapper(func).id + bwd_func_id = PyFuncWrapper( + backward_func).id if backward_func is not None else -1 for each_out in out_list: if len(each_out.shape) == 0: @@ -9192,18 +9228,34 @@ def py_func(func, x, out, backward_func=None): 'Output shapes of py_func op should be provided by users manually' ) - py_func_reg = PyFuncRegister.get_instance(helper.main_program) - forward_token = py_func_reg.unique_token(func) - backward_token = py_func_reg.unique_token( - backward_func) if backward_func is not None else '' + backward_skip_vars = set() + if backward_func is not None and skip_vars_in_backward_input is not None: + if isinstance(skip_vars_in_backward_input, Variable): + skip_vars_in_backward_input = [skip_vars_in_backward_input] + + fwd_in_out = [v.name for v in x] + fwd_in_out.extend([v.name for v in out_list]) + fwd_in_out = set(fwd_in_out) + backward_skip_vars = set() + for v in skip_vars_in_backward_input: + if not v.name in fwd_in_out: + raise ValueError( + 'Variable {} is not found in forward inputs and outputs' + .format(v.name)) + backward_skip_vars.add(v.name) helper.append_op( type='py_func', inputs={'X': x}, outputs={'Out': out_list}, attrs={ - 'handle_idx': py_func_reg.handle_idx, - 'token': forward_token, - 'backward_token': backward_token + 'forward_callable_id': fwd_func_id, + 'backward_callable_id': bwd_func_id, + 'backward_skip_vars': list(backward_skip_vars) }) return out + + +# For debug usage +py_func.registered_func = PyFuncWrapper.registered_func +py_func.registered_func_num = PyFuncWrapper.registered_func_num diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py new file mode 100644 index 0000000000..0f03368b7e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -0,0 +1,145 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import paddle +import unittest +import six +import numpy as np + + +def tanh(x): + return np.tanh(x) + + +def tanh_grad(y, dy): + return np.array(dy) * (1 - np.square(np.array(y))) + + +def cross_entropy(logits, labels): + logits = np.array(logits) + labels = np.array(labels) + M = logits.shape[0] + N = logits.shape[1] + ret = np.ndarray([M, 1]).astype(logits.dtype) + for idx in six.moves.range(M): + ret[idx][0] = -np.log(logits[idx][labels[idx][0]]) + return ret + + +def cross_entropy_grad(logits, labels, bwd_dout): + logits = np.array(logits) + labels = np.array(labels) + bwd_dout = np.array(bwd_dout) + M = logits.shape[0] + N = logits.shape[1] + dlogits = np.zeros([M, N]).astype(logits.dtype) + for idx in six.moves.range(M): + dlogits[idx][labels[idx][0]] = -bwd_dout[idx] / logits[idx][labels[idx][ + 0]] + return dlogits, None + + +def simple_fc_net(img, label, use_py_func_op): + hidden = img + for idx in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + if use_py_func_op: + hidden = fluid.layers.tanh(hidden) + else: + new_hidden = fluid.default_main_program().current_block( + ).create_var( + name='hidden_{}'.format(idx), + dtype='float32', + shape=hidden.shape) + hidden = fluid.layers.py_func( + func=tanh, + x=hidden, + out=new_hidden, + backward_func=tanh_grad, + skip_vars_in_backward_input=hidden) + + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + if not use_py_func_op: + loss = fluid.layers.cross_entropy(input=prediction, label=label) + else: + loss = fluid.default_main_program().current_block().create_var( + name='loss', dtype='float32', shape=[-1, 1]) + fluid.layers.py_func( + func=cross_entropy, + x=[prediction, label], + out=loss, + backward_func=cross_entropy_grad, + skip_vars_in_backward_input=loss) + loss = fluid.layers.mean(loss) + return loss + + +def reader(): + for _ in six.moves.range(100): + yield np.random.random([784]), np.random.random_integers( + size=[1], low=0, high=9) + + +def test_main(use_cuda, use_py_func_op): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return None + + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(fluid.core.Scope()): + fluid.default_main_program().random_seed = 1 + fluid.default_startup_program().random_seed = 1 + np.random.seed(1) + + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + loss = simple_fc_net(img, label, use_py_func_op) + optimizer = fluid.optimizer.SGD(learning_rate=1e-3) + optimizer.minimize(loss) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + r = paddle.batch(reader, batch_size=10) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + ret = [] + for epoch_id in six.moves.range(2): + for d in r(): + L, = exe.run(feed=feeder.feed(d), fetch_list=[loss]) + ret.append(L[0]) + + return np.array(ret) + + +class TestPyFuncOp(unittest.TestCase): + def test_loss_diff(self): + losses = [] + for use_cuda in [True, False]: + for use_py_func_op in [True, False]: + L = test_main(use_cuda, use_py_func_op) + if L is not None: + losses.append(L) + + for idx in six.moves.range(len(losses) - 1): + max_diff = np.max(np.abs(losses[idx] - losses[0])) + self.assertAlmostEqual(max_diff, 0, delta=1e-3) + + +if __name__ == '__main__': + unittest.main() From e7c5c9d2de9f51e5d403c879a31c0297e0f40656 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 12:41:47 +0000 Subject: [PATCH 270/719] remove unnecesary code test=develop --- paddle/fluid/pybind/pybind.cc | 14 -------------- python/paddle/fluid/layers/nn.py | 2 +- .../fluid/tests/unittests/test_py_func_op.py | 2 +- 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 38b1308330..348a073915 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -572,20 +572,6 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Place") .def(py::init<>()) - .def("is_cpu_place", - [](platform::Place &self) { return platform::is_cpu_place(self); }) - .def("is_gpu_place", - [](platform::Place &self) { return platform::is_gpu_place(self); }) - .def("is_cuda_pinned_place", - [](platform::Place &self) { - return platform::is_cuda_pinned_place(self); - }) - .def("gpu_device_id", - [](platform::Place &self) { - PADDLE_ENFORCE(platform::is_gpu_place(self), - "gpu_device_id() only supports in CUDAPlace"); - return boost::get(self).device; - }) .def("set_place", [](platform::Place &self, const platform::CPUPlace &cpu_place) { self = cpu_place; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 6555025001..d71368644d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -23,7 +23,7 @@ import os import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant -from ..framework import Variable, OpProtoHolder, Program +from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ from .tensor import concat diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 0f03368b7e..c71f2bdea8 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -59,7 +59,7 @@ def simple_fc_net(img, label, use_py_func_op): size=200, bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=1.0))) - if use_py_func_op: + if not use_py_func_op: hidden = fluid.layers.tanh(hidden) else: new_hidden = fluid.default_main_program().current_block( From 582011ba76baab02dcc9fbaf3536d781ba59dc3b Mon Sep 17 00:00:00 2001 From: chengduo Date: Wed, 12 Dec 2018 21:08:02 +0800 Subject: [PATCH 271/719] Add L2 unit test (#14792) * add l2 unit test test=develop * code refine test=develop --- .../fluid/tests/unittests/test_regularizer.py | 136 +++++++++++++++++- 1 file changed, 135 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index 20f91cf448..62994eec7e 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -15,7 +15,12 @@ from __future__ import print_function import unittest - +from functools import partial +import contextlib +import numpy as np +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid import paddle.fluid.framework as framework import paddle.fluid.optimizer as optimizer import paddle.fluid.regularizer as regularizer @@ -97,5 +102,134 @@ class TestL1DecayRegularizer(unittest.TestCase): self.assertEqual(block.ops[-3].type, 'sign') +def bow_net(data, + label, + dict_dim, + is_sparse=False, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestRegularizer(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + reader = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), batch_size=8)() + self.train_data = [next(reader) for _ in range(5)] + + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + @contextlib.contextmanager + def scope_prog_guard(self, main_prog, startup_prog): + scope = fluid.core.Scope() + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + with fluid.program_guard(main_prog, startup_prog): + yield + + def run_program(self, place, feed_list): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + + main_prog = fluid.default_main_program() + param_list = [var.name for var in main_prog.block(0).all_parameters()] + + param_sum = [] + for data in self.train_data: + out = exe.run(main_prog, + feed=feeder.feed(data), + fetch_list=param_list) + p_sum = 0 + for v in out: + p_sum += np.sum(np.abs(v)) + param_sum.append(p_sum) + return param_sum + + def check_l2decay_regularizer(self, place, model): + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + startup_prog.random_seed = 1 + with self.scope_prog_guard( + main_prog=main_prog, startup_prog=startup_prog): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost = model(data, label, len(self.word_dict)) + + optimizer = fluid.optimizer.Adagrad( + learning_rate=0.1, + regularization=fluid.regularizer.L2Decay(1.0)) + optimizer.minimize(avg_cost) + param_sum = self.run_program(place, [data, label]) + return param_sum + + def check_l2decay(self, place, model): + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + startup_prog.random_seed = 1 + with self.scope_prog_guard( + main_prog=main_prog, startup_prog=startup_prog): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost_l2 = model(data, label, len(self.word_dict)) + + param_list = fluid.default_main_program().block(0).all_parameters() + para_sum = [] + for para in param_list: + para_mul = fluid.layers.square(x=para) + para_sum.append(fluid.layers.reduce_sum(input=para_mul)) + avg_cost_l2 += fluid.layers.sums(para_sum) * .5 + + optimizer = fluid.optimizer.Adagrad(learning_rate=0.1) + optimizer.minimize(avg_cost_l2) + param_sum = self.run_program(place, [data, label]) + return param_sum + + def test_l2(self): + for place in self.get_places(): + dense_sparse_p_sum = [] + for sparse in [True, False]: + model = partial(bow_net, is_sparse=sparse) + framework_l2 = self.check_l2decay_regularizer(place, model) + l2 = self.check_l2decay(place, model) + assert len(l2) == len(framework_l2) + for i in range(len(l2)): + assert np.isclose(a=framework_l2[i], b=l2[i], rtol=5e-5) + dense_sparse_p_sum.append(framework_l2) + + assert len(dense_sparse_p_sum[0]) == len(dense_sparse_p_sum[1]) + for i in range(len(dense_sparse_p_sum[0])): + assert np.isclose( + a=dense_sparse_p_sum[0][i], + b=dense_sparse_p_sum[1][i], + rtol=5e-5) + + if __name__ == '__main__': unittest.main() From 7bd16e3afad479a559fba9321581def2c5d90165 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Wed, 12 Dec 2018 21:47:51 +0800 Subject: [PATCH 272/719] fix some bug & add log --- paddle/fluid/framework/async_executor.cc | 2 +- .../fluid/framework/executor_thread_worker.cc | 28 +++++++++++++------ .../fluid/framework/executor_thread_worker.h | 2 +- python/paddle/fluid/async_executor.py | 3 +- .../paddle/fluid/contrib/utils/hdfs_utils.py | 5 +++- 5 files changed, 27 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index f0ca375f95..6efe5cafe7 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -111,7 +111,7 @@ void AsyncExecutor::InitParamConfig() { std::vector tmp_sparse_variable_name; for (int i = 0u; i < table.slot_value_size(); ++i) { tmp_sparse_variable_name.push_back(table.slot_value(i)); - _param_config.slot_alias_to_table[table.slot_value(i)] = table.table_id(); + _param_config.slot_alias_to_table[table.slot_key(i)] = table.table_id(); } std::vector tmp_sparse_gradient_variable_name; for (auto i = 0u; i < table.slot_gradient_size(); ++i) { diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index a0455b26ef..7004ecf23b 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -330,6 +330,7 @@ void AsyncExecutorThreadWorker::TrainFiles() { print_fetch_var(thread_scope_, fetch_var_names_[i]); } // end for (int i = 0...) } // end while () + LOG(ERROR) << "TRAIN DONE"; } void AsyncExecutorThreadWorker::SetPSlibPtr(std::shared_ptr pslib_ptr) { @@ -571,25 +572,30 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) { void AsyncExecutorThreadWorker::PushSparse(int table_id) { auto slot_dim = _param_config->slot_dim; //TODO auto fea_dim = _param_config->fea_dim;//_current_train_job.fea_dim();TODO - auto& features = _features[table_id]; + auto& features = _features[table_id]; + CHECK(features.size() < 1000000) << "features size:" << features.size(); //std::vector gradient_var; //auto& gradient_var = GlobalConfig::instance().input_gradient_variable_name; //TODO - auto& push_g = _feature_push_value[table_id]; + auto& push_g = _feature_push_value[table_id]; check_pull_push_memory(features, push_g, fea_dim); + CHECK(push_g.size() == features.size() + 1) << "push_g size:" << push_g.size() << " features size:" << features.size(); uint64_t fea_idx = 0u; - auto& fea_info = _fea_info[table_id]; //TODO + auto& fea_info = _fea_info[table_id]; int offset = 0; //if (!_current_train_job.use_cvm_feature()) { //TODO offset = 2; //} - const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); // slot_idx = 0 is label TODO for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { + if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) == _param_config->slot_alias_to_table.end()) { + LOG(ERROR) << "ERROR slot_idx:" << slot_idx << " name:" << feed_vec[slot_idx]; + } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { + LOG(ERROR) << "ERROR continue"; continue; } - Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[table_id][slot_idx - 1]); + Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[table_id][slot_idx - 1]); + CHECK(g_var != nullptr) << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; LoDTensor* g_tensor = g_var->GetMutable(); if (g_tensor == NULL) { LOG(ERROR) << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; @@ -598,13 +604,16 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { float* g = g_tensor->data(); Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found"; LoDTensor* tensor = var->GetMutable(); if (tensor == NULL) { LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found"; exit(-1); } - int len = tensor->lod()[0].back(); - assert(slot_dim * len == g_tensor->numel()); + //int len = tensor->lod()[0].back(); + int len = tensor->numel(); + CHECK(slot_dim * len == g_tensor->numel()) << "len:" << len << " g_numel:" << g_tensor->numel(); + CHECK(len == tensor->numel()) << "len:" << len << "t_numel:" << tensor->numel(); int64_t* ids = tensor->data(); for (auto id_idx = 0u; id_idx < len; ++id_idx){ if (ids[id_idx] == 0) { @@ -613,12 +622,13 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { } memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim); push_g[fea_idx][0] = 1.0f; + CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx << " size:" << fea_info.size(); push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); g += slot_dim; fea_idx++; } } - assert(fea_idx == features.size()); + CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx << " features size:" << features.size(); CHECK(features.size() > 0); std::vector push_g_vec; diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index b3ee9dfaec..0c9a47690b 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -49,7 +49,7 @@ struct AsyncWorkerParamConfig { std::vector sparse_table_id; std::map> slot_input_vec; //6048slot 6050slot //name std::map> gradient_var; //6048slot_embed - std::unordered_map slot_alias_to_table; //TODO done + std::map slot_alias_to_table; //TODO done }; struct DensePullThreadParam { diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index af42d2912f..13d876e57b 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -153,7 +153,7 @@ class AsyncExecutor(object): data_feed.desc(), filelist, thread_num, fetch_var_names, mode, debug) - def download_data(self, afs_path, local_path, fs_default_name, ugi, hadoop_home="$HADOOP_HOME", process_num=12): + def download_data(self, afs_path, local_path, fs_default_name, ugi, file_cnt, hadoop_home="$HADOOP_HOME", process_num=12): if self.instance is None: raise ValueError('instance is None, please run config_distributed_nodes init instance') @@ -169,6 +169,7 @@ class AsyncExecutor(object): local_path, self.instance.get_worker_index(), self.instance.get_node_cnt() / 2, + file_cnt, multi_processes=process_num) #self.instance.barrier_all() #wait for download_data #TODO only barriere worker self.instance.barrier_worker() #wait for download_data #TODO only barriere worker diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py index ff1a2d3e4a..42b4d7feab 100644 --- a/python/paddle/fluid/contrib/utils/hdfs_utils.py +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -427,6 +427,7 @@ def multi_download(client, local_path, trainer_id, trainers, + file_cnt, multi_processes=5): """ multi_download @@ -435,6 +436,7 @@ def multi_download(client, :param local_path: path on local :param trainer_id: current trainer id :param trainers: all trainers number + :param file_cnt: all file number :param multi_processes: the download data process at the same time, default=5 :return: None """ @@ -450,7 +452,7 @@ def multi_download(client, client.make_local_dirs(local_path) _logger.info("Make local dir {} successfully".format(local_path)) - all_need_download = client.lsr(hdfs_path, sort=True) + all_need_download = client.lsr(hdfs_path, sort=True)[:file_cnt] need_download = all_need_download[trainer_id::trainers] _logger.info("Get {} files From all {} files need to be download from {}". format(len(need_download), len(all_need_download), hdfs_path)) @@ -501,6 +503,7 @@ if __name__ == "__main__": "/home/xx/data1", 1, 5, + 100, multi_processes=5) multi_upload(client, "/user/com/train-25/model", "/home/xx/data1") From e9216e82f915f0fdd96cc539040c7c75a2d71113 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 12 Dec 2018 13:55:27 +0000 Subject: [PATCH 273/719] add refer vscal, vaddbias and test and benchmark --- paddle/fluid/operators/jit/README.md | 8 +- paddle/fluid/operators/jit/benchmark.cc | 89 +++++++++++++-- paddle/fluid/operators/jit/helper.cc | 4 + paddle/fluid/operators/jit/kernel_base.h | 13 ++- .../fluid/operators/jit/refer/CMakeLists.txt | 5 + paddle/fluid/operators/jit/refer/refer.cc | 3 + paddle/fluid/operators/jit/refer/refer.h | 12 ++ paddle/fluid/operators/jit/test.cc | 103 ++++++++++++++++-- .../fluid/operators/math/jit_kernel_refer.h | 7 -- 9 files changed, 216 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index c2e32cc49b..2d72aa4d56 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -37,10 +37,12 @@ PaddlePaddle/Paddle/paddle/fluid/ ## 测试 - 逻辑测试 - 所有实现都要与refer的code对比,需要满足精度要求 + 所有实现都要与refer的code对比,需要满足精度要求, 包括float和double的数据类型 - 性能测试 + 所有实现的性能对比,并且与最终的`jit::Get`方法对比,该方法拿到的性能需要是最好的。 # 如何添加新的算子 -- 在`KernelType` 中添加 `your_key` -- 实现Reference 的逻辑,每个jitkernel的Reference 实现是必须的。不要依赖任何第三方库。并在`refer/CmakeLists.txt`中`USE_JITKERNEL_REFER(your_key)` +- 在`KernelType` 中添加 `your_key` . +- 实现Reference 的逻辑,每个jitkernel的Reference 实现是必须的。不要依赖任何第三方库。并在`refer/CmakeLists.txt`中`USE_JITKERNEL_REFER(your_key)`. +- 必要时可以添加新的`KernelTuples`,可以参考`XYZNTuples`. diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 27a1ba7ba3..2ad87e414b 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -53,9 +53,9 @@ std::vector TestSizes() { // return this function avg time template -double BenchTartgetFunc(const typename KernelTuples::func_type tgt, - const std::vector& x, const std::vector& y, - std::vector& z) { // NOLINT +double BenchXYZNFunc(const typename KernelTuples::func_type tgt, + const std::vector& x, const std::vector& y, + std::vector& z) { // NOLINT const T* x_data = x.data(); const T* y_data = y.data(); const int d = z.size(); @@ -83,14 +83,14 @@ void BenchXYZNKernel() { // refer auto refer = jit::GetRefer>(); if (refer) { - auto res = BenchTartgetFunc>(refer, x, y, z); + auto res = BenchXYZNFunc>(refer, x, y, z); infos.push_back(std::make_pair("Refer", res)); } // test jitcode auto jitcode = jit::GetJitCode, PlaceType>(d); if (jitcode) { - auto res = BenchTartgetFunc>(jitcode, x, y, z); + auto res = BenchXYZNFunc>(jitcode, x, y, z); infos.push_back(std::make_pair("JitCode", res)); } @@ -105,7 +105,7 @@ void BenchXYZNKernel() { impl.get()); if (i && i->UseMe(d)) { auto more = i->GetFunc(); - auto res = BenchTartgetFunc>(more, x, y, z); + auto res = BenchXYZNFunc>(more, x, y, z); infos.push_back(std::make_pair("More", res)); } } @@ -116,7 +116,7 @@ void BenchXYZNKernel() { if (!tgt) { LOG(ERROR) << "Target can not be empty!"; } - auto res = BenchTartgetFunc>(tgt, x, y, z); + auto res = BenchXYZNFunc>(tgt, x, y, z); infos.push_back(std::make_pair("Target", res)); // print @@ -129,6 +129,78 @@ void BenchXYZNKernel() { } } +// return this function avg time +template +double BenchAXYNFunc(const typename KernelTuples::func_type tgt, const T a, + const std::vector& x, + std::vector& y) { // NOLINT + const T* x_data = x.data(); + T* y_data = y.data(); + const int d = y.size(); + for (int i = 0; i < FLAGS_burning; ++i) { + tgt(&a, x_data, y_data, d); + } + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeat; ++i) { + tgt(&a, x_data, y_data, d); + } + auto end = GetCurrentUS(); + return (end - start) / FLAGS_repeat; +} + +template +void BenchAXYNKernel() { + namespace jit = paddle::operators::jit; + for (int d : TestSizes()) { + std::vector> infos; + const T a = static_cast(3); + std::vector x(d), y(d); + RandomVec(d, x.data()); + // test refer + auto refer = jit::GetRefer>(); + if (refer) { + auto res = BenchAXYNFunc>(refer, a, x, y); + infos.push_back(std::make_pair("Refer", res)); + } + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(d); + if (jitcode) { + auto res = BenchAXYNFunc>(jitcode, a, x, y); + infos.push_back(std::make_pair("JitCode", res)); + } + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast>*>( + impl.get()); + if (i && i->UseMe(d)) { + auto more = i->GetFunc(); + auto res = BenchAXYNFunc>(more, a, x, y); + infos.push_back(std::make_pair("More", res)); + } + } + } + // Test result from Get function + auto tgt = jit::Get, PlaceType>(d); + if (!tgt) { + LOG(ERROR) << "Target can not be empty!"; + } + auto res = BenchAXYNFunc>(tgt, a, x, y); + infos.push_back(std::make_pair("Target", res)); + // print + std::ostringstream loginfos; + loginfos << "Kernel Type: " << jit::to_string(KT) << ", size " << d << ": "; + for (auto pair : infos) { + loginfos << pair.first << " takes " << pair.second << " us; "; + } + LOG(INFO) << loginfos.str(); + } +} + // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] // Options: @@ -147,4 +219,7 @@ int main(int argc, char* argv[]) { BenchXYZNKernel(); BenchXYZNKernel(); BenchXYZNKernel(); + + BenchAXYNKernel(); + BenchAXYNKernel(); } diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 2260f0aed4..c9aaffb8b8 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/helper.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { @@ -32,7 +33,10 @@ const char* to_string(KernelType kt) { return "vscal"; case vexp: return "vexp"; + case vaddbias: + return "vaddbias"; default: + PADDLE_THROW("Not support type: %d", kt); return "NOT JITKernel"; } return nullptr; diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index b2e9d63977..74ecf3dade 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -19,7 +19,15 @@ namespace paddle { namespace operators { namespace jit { -typedef enum { vmul = 0, vadd = 1, vaddrelu, vsub, vscal, vexp } KernelType; +typedef enum { + vmul = 0, + vadd = 1, + vaddrelu, + vsub, + vscal, + vaddbias, + vexp +} KernelType; template struct XYZNTuples { @@ -28,6 +36,9 @@ struct XYZNTuples { typedef void (*func_type)(const T*, const T*, T*, int); }; +template +struct AXYNTuples : public XYZNTuples {}; + // Just for adding to kernel pool without template class Kernel { public: diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index b6ff80d03d..afe3f6ca0f 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -8,3 +8,8 @@ endfunction() # use refer kernel by name USE_JITKERNEL_REFER(vmul) +USE_JITKERNEL_REFER(vadd) +USE_JITKERNEL_REFER(vaddrelu) +USE_JITKERNEL_REFER(vsub) +USE_JITKERNEL_REFER(vscal) +USE_JITKERNEL_REFER(vaddbias) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 69d039422f..4e9c530344 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -26,4 +26,7 @@ REGISTER_REFER_KERNEL(vadd, VAdd); REGISTER_REFER_KERNEL(vaddrelu, VAddRelu); REGISTER_REFER_KERNEL(vsub, VSub); +REGISTER_REFER_KERNEL(vscal, VScal); +REGISTER_REFER_KERNEL(vaddbias, VAddBias); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 4d4d308cbd..32ac5bf2d7 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -59,6 +59,13 @@ void VScal(const T* a, const T* x, T* y, int n) { } } +template +void VAddBias(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] + x[i]; + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -66,11 +73,16 @@ void VScal(const T* a, const T* x, T* y, int n) { name##Kernel() { this->func = name; } \ } +// const T* x, const T* y, T* z, int n DECLARE_REFER_KERNEL(VMul, XYZNTuples); DECLARE_REFER_KERNEL(VAdd, XYZNTuples); DECLARE_REFER_KERNEL(VAddRelu, XYZNTuples); DECLARE_REFER_KERNEL(VSub, XYZNTuples); +// const T* a, const T* x, T* y, int n +DECLARE_REFER_KERNEL(VScal, AXYNTuples); +DECLARE_REFER_KERNEL(VAddBias, AXYNTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 9ceca24079..ea2cb7b7a4 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -12,7 +12,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include // for memcpy #include #include #include @@ -59,9 +58,9 @@ std::vector TestSizes() { } template -void TestTartgetFunc(const typename KernelTuples::func_type tgt, - const std::vector& x, const std::vector& y, - const std::vector& zref) { +void TestXYZNFunc(const typename KernelTuples::func_type tgt, + const std::vector& x, const std::vector& y, + const std::vector& zref) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(zref.size(), x.size()); EXPECT_EQ(zref.size(), y.size()); @@ -88,9 +87,8 @@ void TestTartgetFunc(const typename KernelTuples::func_type tgt, template void TestXYZNKernel() { namespace jit = paddle::operators::jit; + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT) - << ", size: " << d; auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); @@ -119,7 +117,7 @@ void TestXYZNKernel() { auto jitcode = jit::GetJitCode, PlaceType>(d); if (jitcode) { VLOG(10) << "Test Jitcode Kernel, size: " << d; - TestTartgetFunc>(jitcode, x, y, zref); + TestXYZNFunc>(jitcode, x, y, zref); } // test all impls in more @@ -134,14 +132,14 @@ void TestXYZNKernel() { if (i && i->UseMe(d)) { auto more = i->GetFunc(); VLOG(10) << "Test More Kernel, size: " << d; - TestTartgetFunc>(more, x, y, zref); + TestXYZNFunc>(more, x, y, zref); } } } // Test result from Get function VLOG(10) << "Test Get function, size: " << d; auto tgt = jit::Get, PlaceType>(d); - TestTartgetFunc>(tgt, x, y, zref); + TestXYZNFunc>(tgt, x, y, zref); } } @@ -169,4 +167,89 @@ TEST(JITKernel, vsub) { TestXYZNKernel(); } -TEST(JITKernel, pool) {} +template +void TestAXYNFunc(const typename KernelTuples::func_type tgt, const T a, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(&a, x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(&a, ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); +} + +template +void TestAXYNKernel() { + namespace jit = paddle::operators::jit; + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + for (int d : TestSizes()) { + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + + const T a = static_cast(3); + std::vector x(d), yref(d); + std::vector xinp(d); // inplace test + RandomVec(d, x.data()); + std::copy(x.begin(), x.end(), xinp.begin()); + + const T* x_data = x.data(); + T* yref_data = yref.data(); + T* xinp_data = xinp.data(); + // test refer code inplace + ref(&a, x_data, yref_data, d); + ref(&a, xinp_data, xinp_data, d); + ExpectEQ(xinp_data, yref_data, d); + + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(d); + if (jitcode) { + VLOG(10) << "Test Jitcode Kernel, size: " << d; + TestAXYNFunc>(jitcode, a, x, yref); + } + + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast>*>( + impl.get()); + if (i && i->UseMe(d)) { + auto more = i->GetFunc(); + VLOG(10) << "Test More Kernel, size: " << d; + TestAXYNFunc>(more, a, x, yref); + } + } + } + // Test result from Get function + VLOG(10) << "Test Get function, size: " << d; + auto tgt = jit::Get, PlaceType>(d); + TestAXYNFunc>(tgt, a, x, yref); + } +} + +TEST(JITKernel, vscal) { + namespace jit = paddle::operators::jit; + TestAXYNKernel(); + TestAXYNKernel(); +} + +TEST(JITKernel, vaddbias) { + namespace jit = paddle::operators::jit; + TestAXYNKernel(); + TestAXYNKernel(); +} + +TEST(JITKernel, pool) { + // TODO(TJ): add some test +} diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index eaca02ba14..b5ee07e748 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -24,13 +24,6 @@ namespace math { namespace jitkernel { namespace refer { -template -void VAddBias(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] + x[i]; - } -} - template void VRelu(const T* x, T* y, int n) { for (int i = 0; i < n; ++i) { From 06930531887547286f3c4ad096d1fd0794749867 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Tue, 11 Dec 2018 14:43:52 +0800 Subject: [PATCH 274/719] add liscence --- python/paddle/fluid/async_executor.py | 3 ++- python/paddle/fluid/distributed/downpour.py | 13 ++++++++++ python/paddle/fluid/distributed/helper.py | 14 +++++++++++ python/paddle/fluid/distributed/node.py | 13 ++++++++++ .../paddle/fluid/distributed/ps_instance.py | 24 ++++++++++--------- 5 files changed, 55 insertions(+), 12 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 13d876e57b..099805ac1b 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -76,7 +76,7 @@ class AsyncExecutor(object): Note: Only running on CPUPlace supported. """ - def __init__(self, place=None): + def __init__(self, place=None, run_mode=""): if place is None: place = core.CPUPlace() if not isinstance(place, core.CPUPlace): @@ -89,6 +89,7 @@ class AsyncExecutor(object): self.executor = core.AsyncExecutor(scope, p) self.instance = None + def run(self, program, data_feed, filelist, thread_num, fetch, mode="", debug=False): """ Run program by this AsyncExecutor. Training dataset will be in filelist. diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index c1762dd768..9ef9e14ccc 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + from .node import DownpourServer from .node import DownpourWorker from ..backward import append_backward diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 1244b4c0ca..986525e5d8 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from mpi4py import MPI import ps_pb2 as pslib diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index 1f4aeeac73..8755323006 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -1,3 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + import ps_pb2 as pslib class Server(object): diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index dce5dfc5bd..b93da053a3 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -1,8 +1,18 @@ -#import paddle.fluid.distributed.helper as dist_helper +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + import helper as dist_helper import sys -#from mpi4py import MPI - class PaddlePSInstance(object): def __init__(self, server_worker_mode, proc_per_node): @@ -83,17 +93,11 @@ class PaddlePSInstance(object): return self._nodes def barrier_all(self): - #print self._rankid, "begin" - #sys.stdout.flush() self.dh.comm.barrier() - #print self._rankid, "end" def barrier_worker(self): if self.is_worker(): - #print "worker: ", self._rankid, "begin" - #sys.stdout.flush() self._comm.barrier() - #print "worker: ", self._rankid, "end" pass def finalize(self): @@ -104,5 +108,3 @@ class PaddlePSInstance(object): if __name__ == "__main__": instance = PaddlePSInstance(1, 1, 2, 50) instance.barrier_all() - #print "-----" - #instance.barrier_worker() From 33ee5cad61383db6bc06681f9f1afa76492a5759 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 10:33:06 +0800 Subject: [PATCH 275/719] format code style of executor_thread_worker.cc --- .../fluid/framework/executor_thread_worker.cc | 77 +++++++++---------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 7004ecf23b..86ac93be3e 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -417,48 +417,46 @@ void AsyncExecutorThreadWorker::PrepareParams() { } void AsyncExecutorThreadWorker::UpdateParams() { - for (auto i: _param_config->sparse_table_id) {//TODO - //for (int i = 0; i < 1; ++i) { - PushSparse(i); - } - //for (auto i = 0u; i < GlobalConfig::instance().dense_table_id.size(); ++i) {//TODO - for (auto i: _param_config->dense_table_id) { - PushDense(i); - } - int32_t tmp_push_dense_wait_times = -1;//_param_config->tmp_push_dense_wait_times; //TODO - int32_t tmp_push_sparse_wait_times = -1;//_param_config->tmp_push_sparse_wait_times; //TODO - static uint32_t push_dense_wait_times = static_cast(tmp_push_dense_wait_times); - static uint32_t push_sparse_wait_times = static_cast(tmp_push_sparse_wait_times); - - if (_push_dense_status.size() >= push_dense_wait_times) { - for (auto& t : _push_dense_status) { - t.wait(); - } - _push_dense_status.resize(0); - } - if (tmp_push_dense_wait_times == -1) { - _push_dense_status.resize(0); - } - if (_push_sparse_status.size() >= push_sparse_wait_times) { - for (auto& t : _push_sparse_status) { - t.wait(); - } - _push_sparse_status.resize(0); - } - if (tmp_push_sparse_wait_times == -1) { - _push_sparse_status.resize(0); - } - //for (auto dense_table_id : GlobalConfig::instance().dense_table_id) {//TODO - for (auto dense_table_id: _param_config->dense_table_id) { - _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id); + for (auto i : _param_config->sparse_table_id) { + PushSparse(i); + } + for (auto i : _param_config->dense_table_id) { + PushDense(i); + } + // _param_config->tmp_push_dense_wait_times + int32_t tmp_push_dense_wait_times = -1; + // _param_config->tmp_push_sparse_wait_times + int32_t tmp_push_sparse_wait_times = -1; + static uint32_t push_dense_wait_times = + static_cast(tmp_push_dense_wait_times); + static uint32_t push_sparse_wait_times = + static_cast(tmp_push_sparse_wait_times); + + if (_push_dense_status.size() >= push_dense_wait_times) { + for (auto& t : _push_dense_status) { + t.wait(); + } + _push_dense_status.resize(0); + } + if (tmp_push_dense_wait_times == -1) { + _push_dense_status.resize(0); + } + if (_push_sparse_status.size() >= push_sparse_wait_times) { + for (auto& t : _push_sparse_status) { + t.wait(); } - //} + _push_sparse_status.resize(0); + } + if (tmp_push_sparse_wait_times == -1) { + _push_sparse_status.resize(0); + } + for (auto dense_table_id : _param_config->dense_table_id) { + _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id); + } } void AsyncExecutorThreadWorker::PushDense(int table_id) { std::vector regions; - //auto& variables = GlobalConfig::instance().dense_gradient_variable_name[table_id]; - //std::vector variables; for (auto& t : _param_config->dense_gradient_variable_name[table_id]) { Variable* var = thread_scope_->FindVar(t); CHECK(var != nullptr) << "var[" << t << "] not found"; @@ -469,7 +467,8 @@ void AsyncExecutorThreadWorker::PushDense(int table_id) { regions.emplace_back(std::move(reg)); } - auto status = _pslib_ptr->_worker_ptr->push_dense(regions.data(), regions.size(), table_id); + auto status = _pslib_ptr->_worker_ptr->push_dense( + regions.data(), regions.size(), table_id); _push_dense_status.push_back(std::move(status)); } @@ -478,7 +477,7 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) { auto& features = _features[table_id]; auto& feature_value = _feature_value[table_id]; - auto fea_dim = _param_config->fea_dim; //TODO + auto fea_dim = _param_config->fea_dim; // slot id starts from 1 features.clear(); features.resize(0); From 162637b64abd39c3ca7c75c08690169968305712 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 13 Dec 2018 10:56:08 +0800 Subject: [PATCH 276/719] Fix ngraph compile test=develop --- paddle/fluid/framework/ngraph_operator.cc | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 253de4c611..e2cdfc845f 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -471,27 +471,23 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), "Ensure ngraph tensor layout align with paddle tensor"); - if (tensor_pd->type().hash_code() == - typeid(float).hash_code()) { // NOLINT + if (tensor_pd->type() == proto::VarType::FP32) { const float* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::f32, sp, const_cast(arr)); - } else if (tensor_pd->type().hash_code() == - typeid(int).hash_code()) { // NOLINT + } else if (tensor_pd->type() == proto::VarType::INT32) { const int* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::i32, sp, const_cast(arr)); - } else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) { + } else if (tensor_pd->type() == proto::VarType::INT64) { const int64_t* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::i64, sp, const_cast(arr)); - } else if (tensor_pd->type().hash_code() == - typeid(double).hash_code()) { // NOLINT + } else if (tensor_pd->type() == proto::VarType::FP64) { const double* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::f64, sp, const_cast(arr)); - } else if (tensor_pd->type().hash_code() == - typeid(bool).hash_code()) { // NOLINT + } else if (tensor_pd->type() == proto::VarType::BOOL) { const bool* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::boolean, sp, const_cast(arr)); From c71279bc697a101b9afe74a1e19fc9fb99195bd9 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 11:23:16 +0800 Subject: [PATCH 277/719] refine code style for async_executor.h and async_executor.cc --- paddle/fluid/framework/async_executor.cc | 101 ++++++++++++++--------- paddle/fluid/framework/async_executor.h | 25 +++--- 2 files changed, 79 insertions(+), 47 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 6efe5cafe7..c62d62a5dc 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -66,15 +66,20 @@ void PrepareReaders(std::vector>& readers, // NOLINT } void AsyncExecutor::InitServer(const std::string& dist_desc, int index) { - _pslib_ptr = std::shared_ptr(new paddle::distributed::PSlib()); - _pslib_ptr->init_server(dist_desc, index);//TODO done - + _pslib_ptr = + std::shared_ptr( + new paddle::distributed::PSlib()); + _pslib_ptr->init_server(dist_desc, index); InitParamConfig(); } -void AsyncExecutor::InitWorker(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index) { - _pslib_ptr = std::shared_ptr(new paddle::distributed::PSlib()); - _pslib_ptr->init_worker(dist_desc, host_sign_list.data(), node_num, index);//TODO done +void AsyncExecutor::InitWorker(const std::string& dist_desc, + const std::vector& host_sign_list, + int node_num, int index) { + _pslib_ptr = std::shared_ptr( + new paddle::distributed::PSlib()); + _pslib_ptr->init_worker( + dist_desc, host_sign_list.data(), node_num, index); InitParamConfig(); } @@ -87,43 +92,65 @@ void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); } -void AsyncExecutor::GatherServers(std::vector& host_sign_list, int node_num) { +void AsyncExecutor::GatherServers( + std::vector& host_sign_list, int node_num) { _pslib_ptr->gather_servers(host_sign_list.data(), node_num); } void AsyncExecutor::InitParamConfig() { - for (int i = 0; i < _pslib_ptr->get_param()->server_param().downpour_server_param().downpour_table_param_size(); ++i) { - if (_pslib_ptr->get_param()->server_param().downpour_server_param().downpour_table_param(i).table_class().find("SparseTable") != -1) { - _param_config.fea_dim = _pslib_ptr->get_param()->server_param().downpour_server_param().downpour_table_param(i).accessor().fea_dim(); //TODO + for (int i = 0; i < + _pslib_ptr->get_param()->server_param().\ + downpour_server_param().\ + downpour_table_param_size(); + ++i) { + if (_pslib_ptr->get_param()->server_param().\ + downpour_server_param().downpour_table_param(i).\ + table_class().find("SparseTable") != -1) { + _param_config.fea_dim = _pslib_ptr->get_param()->server_param().\ + downpour_server_param().\ + downpour_table_param(i).\ + accessor().fea_dim(); break; } } - _param_config.slot_dim = _param_config.fea_dim - 2; //TODO - _param_config.tmp_push_dense_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); - _param_config.tmp_push_sparse_wait_times = (int32_t)(_pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); - - for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); ++t) { - _param_config.skip_op.push_back(_pslib_ptr->get_param()->trainer_param().skip_op(t)); + _param_config.slot_dim = _param_config.fea_dim - 2; + _param_config.tmp_push_dense_wait_times = static_cast( + _pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); + _param_config.tmp_push_sparse_wait_times = static_cast( + _pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); + ++t) { + _param_config.skip_op.push_back( + _pslib_ptr->get_param()->trainer_param().skip_op(t)); } - //sparse - for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) { + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); + ++t) { auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); std::vector tmp_sparse_variable_name; for (int i = 0u; i < table.slot_value_size(); ++i) { tmp_sparse_variable_name.push_back(table.slot_value(i)); - _param_config.slot_alias_to_table[table.slot_key(i)] = table.table_id(); + _param_config.slot_alias_to_table[table.slot_key(i)] = + table.table_id(); } std::vector tmp_sparse_gradient_variable_name; for (auto i = 0u; i < table.slot_gradient_size(); ++i) { tmp_sparse_gradient_variable_name.push_back( table.slot_gradient(i)); } - _param_config.slot_input_vec[table.table_id()] = std::move(tmp_sparse_variable_name); - _param_config.gradient_var[table.table_id()] = std::move(tmp_sparse_gradient_variable_name); + _param_config.slot_input_vec[table.table_id()] = + std::move(tmp_sparse_variable_name); + _param_config.gradient_var[table.table_id()] = + std::move(tmp_sparse_gradient_variable_name); _param_config.sparse_table_id.push_back(table.table_id()); } - //dense - for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); ++t) { + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); + ++t) { auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t); std::vector tmp_dense_variable_name; for (int i = 0u; i < table.dense_variable_name_size(); ++i) { @@ -134,20 +161,18 @@ void AsyncExecutor::InitParamConfig() { tmp_dense_gradient_variable_name.push_back( table.dense_gradient_variable_name(i)); } - _param_config.dense_variable_name[table.table_id()] = std::move(tmp_dense_variable_name); - _param_config.dense_gradient_variable_name[table.table_id()] = std::move(tmp_dense_gradient_variable_name); + _param_config.dense_variable_name[table.table_id()] = + std::move(tmp_dense_variable_name); + _param_config.dense_gradient_variable_name[table.table_id()] = + std::move(tmp_dense_gradient_variable_name); _param_config.dense_table_id.push_back(table.table_id()); - _param_config.dense_table_size.push_back(table.fea_dim()); //TODO + _param_config.dense_table_size.push_back(table.fea_dim()); } } void AsyncExecutor::InitModel() { - //TODO only rank = 0 do this - //std::vector all_dense_table_id; //TODO - //all_dense_table_id.push_back(0); //done - for (auto table_id: _param_config.dense_table_id) { + for (auto table_id : _param_config.dense_table_id) { std::vector regions; - //std::vector variables; //TODO for (auto& t : _param_config.dense_variable_name[table_id]) { Variable* var = root_scope_->FindVar(t); CHECK(var != nullptr) << "var[" << t << "] not found"; @@ -169,13 +194,15 @@ void AsyncExecutor::InitModel() { regions.emplace_back(std::move(reg)); } - auto push_status = _pslib_ptr->_worker_ptr->push_dense_param(regions.data(), regions.size(), table_id); + auto push_status = + _pslib_ptr->_worker_ptr->push_dense_param( + regions.data(), regions.size(), table_id); push_status.wait(); auto status = push_status.get(); if (status != 0) { LOG(FATAL) << "push dense param failed, status[" << status << "]"; exit(-1); - } + } } } @@ -185,7 +212,7 @@ void AsyncExecutor::SaveModel(const std::string& path) { ret = _pslib_ptr->_worker_ptr->save(path, 0); ret.wait(); int32_t feasign_cnt = ret.get(); - if (feasign_cnt == -1) { // TODO should be feasign_cnt < 0, because server bug + if (feasign_cnt == -1) { // (colourful-tree) TODO should be feasign_cnt < 0 LOG(FATAL) << "save model failed"; exit(-1); } @@ -195,13 +222,13 @@ void AsyncExecutor::PrepareDenseThread(const std::string& mode) { if (mode == "mpi") { DensePullThreadParam param; param.ps_client = _pslib_ptr->_worker_ptr;; - param.threshold = 1;//GlobalConfig::instance().pull_dense_per_batch; //TODO + param.threshold = 1; param.training_thread_num = actual_thread_num; param.root_scope = root_scope_; - //param.dense_params = &GlobalConfig::instance().dense_variable_name; //TODO param.dense_params = &_param_config.dense_variable_name; - _pull_dense_thread = std::shared_ptr(new DensePullThread(param)); + _pull_dense_thread = std::shared_ptr( + new DensePullThread(param)); _pull_dense_thread->start(); } } diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 93010f8a9b..184566dd39 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include // NOLINT @@ -22,8 +23,7 @@ limitations under the License. */ #include // NOLINT #include #include -#include //local_random_engine -#include //local_random_engine +#include // local_random_engine #include "paddle/fluid/framework/data_feed.pb.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor_thread_worker.h" @@ -43,9 +43,10 @@ inline std::default_random_engine& local_random_engine() { struct engine_wrapper_t { std::default_random_engine engine; engine_wrapper_t() { - static std::atomic x(0); - std::seed_seq sseq = {x++, x++, x++, (unsigned long)(current_realtime() * 1000)}; - engine.seed(sseq); + static std::atomic x(0); + std::seed_seq sseq = {x++, x++, x++, + static_cast(current_realtime() * 1000)}; + engine.seed(sseq); } }; thread_local engine_wrapper_t r; @@ -61,18 +62,20 @@ class AsyncExecutor { const std::vector& filelist, const int thread_num, const std::vector& fetch_names, - const std::string& mode, + const std::string& mode, const bool debug = false); - //void ConfigPslib(const char* dist_desc, uint64_t* host_sign_list, int node_num, int index); void InitServer(const std::string& dist_desc, int index); - void InitWorker(const std::string& dist_desc, std::vector& host_sign_list, int node_num, int index); - //void ConfigWorker() {} + void InitWorker( + const std::string& dist_desc, + const std::vector& host_sign_list, + int node_num, int index); uint64_t StartServer(); void StopServer(); - void GatherServers(std::vector& host_sign_list, int node_num); + void GatherServers(const std::vector& host_sign_list, int node_num); void InitModel(); void SaveModel(const std::string& path); void InitParamConfig(); + private: void CreateThreads(ExecutorThreadWorker* worker, const ProgramDesc& main_program, @@ -81,6 +84,7 @@ class AsyncExecutor { Scope* root_scope, const int thread_index, const bool debug); void PrepareDenseThread(const std::string& mode); + public: std::shared_ptr _pslib_ptr; std::shared_ptr _pull_dense_thread; @@ -88,6 +92,7 @@ class AsyncExecutor { platform::Place place_; AsyncWorkerParamConfig _param_config; + private: int actual_thread_num; From 3c01cdeff0e11108f816b5f1abe5d71b3e8d153f Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Thu, 13 Dec 2018 11:33:21 +0800 Subject: [PATCH 278/719] refine executor_thread_worker.cc & executor_thread_worker.h code style --- .../fluid/framework/executor_thread_worker.cc | 86 ++++--------------- .../fluid/framework/executor_thread_worker.h | 2 - 2 files changed, 15 insertions(+), 73 deletions(-) diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 86ac93be3e..592a416d6d 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -303,7 +303,7 @@ void ExecutorThreadWorker::SetRootScope(Scope* g_scope) { root_scope_ = g_scope; } -//AsyncExecutor +// AsyncExecutor void AsyncExecutorThreadWorker::TrainFiles() { SetDevice(); @@ -330,7 +330,6 @@ void AsyncExecutorThreadWorker::TrainFiles() { print_fetch_var(thread_scope_, fetch_var_names_[i]); } // end for (int i = 0...) } // end while () - LOG(ERROR) << "TRAIN DONE"; } void AsyncExecutorThreadWorker::SetPSlibPtr(std::shared_ptr pslib_ptr) { @@ -360,44 +359,12 @@ void AsyncExecutorThreadWorker::TrainOneNetwork() { UpdateParams(); } -void AsyncExecutorThreadWorker::BindingSlotVariableMemory() { - /* - std::vector ins_slot_offset(batch_size + 1, 0); - for (auto i = 1u; i <= batch_size; ++i) { - ins_slot_offset[i] += ins_slot_offset[i - 1] + slot_dim; - } - - std::vector tensor_lod(batch_size + 1, 0); - for (auto i = 1u; i <= batch_size; ++i) { - tensor_lod[i] += tensor_lod[i - 1] + 1; - } - - auto& used_slots = reader->get_use_slot_alias(); - slot_input_vec.resize(used_slots.size() - 1); - for (auto slot_idx = 1u; slot_idx < used_slots.size(); ++slot_idx) { - auto var = slot_input_variable_name[slot_idx]; - - auto v = thread_scope->FindVar(var); - CHECK(v != nullptr) << "var[" << var << "] not found"; - - LoDTensor* tensor = v->GetMutable(); - float* tensor_ptr = tensor->mutable_data({batch_size, slot_dim}, platform::CPUPlace()); - memset(tensor_ptr, 0, sizeof(float) * ins_slot_offset.back()); - - LoD data_lod{tensor_lod}; - tensor->set_lod(data_lod); - - slot_input_vec[slot_idx - 1].reset(tensor); - } - */ -} void AsyncExecutorThreadWorker::SetParamConfig(AsyncWorkerParamConfig* param_config) { _param_config = param_config; } void AsyncExecutorThreadWorker::PrepareParams() { - //int table_id = 0; //TODO for (auto table_id: _param_config->sparse_table_id) { PullSparse(table_id); for (auto& t : _pull_sparse_status) { @@ -423,9 +390,7 @@ void AsyncExecutorThreadWorker::UpdateParams() { for (auto i : _param_config->dense_table_id) { PushDense(i); } - // _param_config->tmp_push_dense_wait_times int32_t tmp_push_dense_wait_times = -1; - // _param_config->tmp_push_sparse_wait_times int32_t tmp_push_sparse_wait_times = -1; static uint32_t push_dense_wait_times = static_cast(tmp_push_dense_wait_times); @@ -509,17 +474,15 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) { pull_feature_value.data(), table_id, features.data(), features.size()); _pull_sparse_status.push_back(std::move(status)); - //to save time auto& push_g = _feature_push_value[table_id]; check_pull_push_memory(features, push_g, fea_dim); - //binding_slot_embed_with_concat(); TODO - collect_feasign_info(table_id); //TODO + collect_feasign_info(table_id); } void AsyncExecutorThreadWorker::FillSparse(int table_id) { - auto slot_dim = _param_config->slot_dim; // TODO - auto fea_dim = _param_config->fea_dim; //TODO + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; auto& features = _features[table_id]; auto& fea_value = _feature_value[table_id]; @@ -544,53 +507,35 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) { LoD data_lod{tensor_lod}; tensor_emb->set_lod(data_lod); - //float* ptr = tensor_emb->data(); for (auto index = 0u; index < len; ++index){ - //if (_current_train_job.use_cvm_feature()) { - // if (ids[index] == 0u) { - // memcpy(ptr + slot_dim * index, init_value.data(), sizeof(float) * slot_dim); - // continue; - // } - // memcpy(ptr + slot_dim * index, fea_value[fea_idx].data(), sizeof(float) * slot_dim); - // (ptr + slot_dim * index)[0] = log((ptr + slot_dim * index)[0] + 1); - // (ptr + slot_dim * index)[1] = log((ptr + slot_dim * index)[1] + 1) - (ptr + slot_dim * index)[0]; - // fea_idx++; - //} else { - if (ids[index] == 0u) { - memcpy(ptr + slot_dim * index, init_value.data() + 2, sizeof(float) * slot_dim); - continue; - } - memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2, sizeof(float) * slot_dim); - fea_idx++; - //} + if (ids[index] == 0u) { + memcpy(ptr + slot_dim * index, init_value.data() + 2, sizeof(float) * slot_dim); + continue; + } + memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2, sizeof(float) * slot_dim); + fea_idx++; } } } void AsyncExecutorThreadWorker::PushSparse(int table_id) { - auto slot_dim = _param_config->slot_dim; //TODO - auto fea_dim = _param_config->fea_dim;//_current_train_job.fea_dim();TODO + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; auto& features = _features[table_id]; - CHECK(features.size() < 1000000) << "features size:" << features.size(); - //std::vector gradient_var; - //auto& gradient_var = GlobalConfig::instance().input_gradient_variable_name; //TODO + CHECK(features.size() < 1000000) << "features size is too big, may be wrong:" << features.size(); auto& push_g = _feature_push_value[table_id]; check_pull_push_memory(features, push_g, fea_dim); CHECK(push_g.size() == features.size() + 1) << "push_g size:" << push_g.size() << " features size:" << features.size(); uint64_t fea_idx = 0u; auto& fea_info = _fea_info[table_id]; - int offset = 0; - //if (!_current_train_job.use_cvm_feature()) { //TODO - offset = 2; - //} + int offset = 2; const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - // slot_idx = 0 is label TODO + // slot_idx = 0 is label for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) == _param_config->slot_alias_to_table.end()) { LOG(ERROR) << "ERROR slot_idx:" << slot_idx << " name:" << feed_vec[slot_idx]; } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { - LOG(ERROR) << "ERROR continue"; continue; } Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[table_id][slot_idx - 1]); @@ -609,7 +554,6 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found"; exit(-1); } - //int len = tensor->lod()[0].back(); int len = tensor->numel(); CHECK(slot_dim * len == g_tensor->numel()) << "len:" << len << " g_numel:" << g_tensor->numel(); CHECK(len == tensor->numel()) << "len:" << len << "t_numel:" << tensor->numel(); diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 0c9a47690b..4e9c2622b0 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -155,7 +155,6 @@ class ExecutorThreadWorker { void SetFetchVarNames(const std::vector& fetch_var_names); virtual void SetPSlibPtr(std::shared_ptr pslib_ptr); virtual void SetPullDenseThread(std::shared_ptr dpt) {}; - virtual void BindingSlotVariableMemory() {}; virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {}; private: void CreateThreadScope(const framework::ProgramDesc& program); @@ -191,7 +190,6 @@ public: virtual ~AsyncExecutorThreadWorker() {} void SetPSlibPtr(std::shared_ptr pslib_ptr); void SetPullDenseThread(std::shared_ptr dpt); - void BindingSlotVariableMemory(); void SetParamConfig(AsyncWorkerParamConfig* param_config); void TrainFiles(); void TrainOneNetwork(); From c4cb4142916c92d82b3e0924206aac25db4b8758 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Thu, 13 Dec 2018 11:54:17 +0800 Subject: [PATCH 279/719] refine pslib.cmake url to public --- cmake/external/pslib.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index 812af5efa2..4d4dc195aa 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -29,8 +29,8 @@ INCLUDE(ExternalProject) SET(PSLIB_PROJECT "extern_pslib") IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL)) MESSAGE(STATUS "use pre defined download url") - SET(PSLIB_VER "pslib" CACHE STRING "" FORCE) #todo pslib version - SET(PSLIB_URL "http://bjyz-heqiaozhi-dev-new.epc.baidu.com:8000/${PSLIB_VER}.tar.gz" CACHE STRING "" FORCE) #todo pslib url + SET(PSLIB_VER "0.1.0" CACHE STRING "" FORCE) + SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/pslib.tar.gz" CACHE STRING "" FORCE) ENDIF() MESSAGE(STATUS "PSLIB_VER: ${PSLIB_VER}, PSLIB_URL: ${PSLIB_URL}") SET(PSLIB_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib") From c59cdf3a243e104992ec2cde1e36cb38d452feb4 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 12:12:21 +0800 Subject: [PATCH 280/719] refine executor_thread_worker.h and executor_thread_worker.cc code style --- .../fluid/framework/executor_thread_worker.cc | 364 +++++++++--------- .../fluid/framework/executor_thread_worker.h | 92 +++-- 2 files changed, 243 insertions(+), 213 deletions(-) diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 592a416d6d..412f4a2b6e 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -58,7 +58,8 @@ bool DensePullThread::check_update_param(uint64_t table_id) { { std::lock_guard lock(_mutex_for_version); auto& version = _training_versions[table_id]; - _current_version[table_id] = *(std::min_element(version.begin(), version.end())); + _current_version[table_id] = + *(std::min_element(version.begin(), version.end())); } if (_current_version[table_id] - _last_versions[table_id] < _threshold) { return false; @@ -93,7 +94,8 @@ void DensePullThread::wait_all() { t.wait(); auto status = t.get(); if (status != 0) { - LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times; + LOG(WARNING) << "pull dense failed times:" << + ++_pull_dense_fail_times; } } @@ -105,7 +107,8 @@ void DensePullThread::wait_all() { _pull_dense_status.resize(0); } -void DensePullThread::increase_thread_version(int thread_id, uint64_t table_id) { +void DensePullThread::increase_thread_version( + int thread_id, uint64_t table_id) { std::lock_guard lock(_mutex_for_version); _training_versions[table_id][thread_id]++; } @@ -169,10 +172,6 @@ void ExecutorThreadWorker::SetFetchVarNames( fetch_var_names.end()); } -void ExecutorThreadWorker::SetPSlibPtr(std::shared_ptr pslib_ptr) { - -} - void ExecutorThreadWorker::SetDevice() { #if defined _WIN32 || defined __APPLE__ @@ -332,10 +331,12 @@ void AsyncExecutorThreadWorker::TrainFiles() { } // end while () } -void AsyncExecutorThreadWorker::SetPSlibPtr(std::shared_ptr pslib_ptr) { +void AsyncExecutorThreadWorker::SetPSlibPtr( + std::shared_ptr pslib_ptr) { _pslib_ptr = pslib_ptr; } -void AsyncExecutorThreadWorker::SetPullDenseThread(std::shared_ptr dpt) { +void AsyncExecutorThreadWorker::SetPullDenseThread( + std::shared_ptr dpt) { _pull_dense_thread = dpt; } void AsyncExecutorThreadWorker::TrainOneNetwork() { @@ -347,7 +348,8 @@ void AsyncExecutorThreadWorker::TrainOneNetwork() { } bool need_skip = false; for (auto t = 0u; t < _param_config->skip_op.size(); ++t) { - if (op->Type().find(_param_config->skip_op[t]) != std::string::npos) { + if (op->Type().find(_param_config->skip_op[t]) != + std::string::npos) { need_skip = true; break; } @@ -359,13 +361,13 @@ void AsyncExecutorThreadWorker::TrainOneNetwork() { UpdateParams(); } - -void AsyncExecutorThreadWorker::SetParamConfig(AsyncWorkerParamConfig* param_config) { +void AsyncExecutorThreadWorker::SetParamConfig( + AsyncWorkerParamConfig* param_config) { _param_config = param_config; } void AsyncExecutorThreadWorker::PrepareParams() { - for (auto table_id: _param_config->sparse_table_id) { + for (auto table_id : _param_config->sparse_table_id) { PullSparse(table_id); for (auto& t : _pull_sparse_status) { t.wait(); @@ -378,7 +380,7 @@ void AsyncExecutorThreadWorker::PrepareParams() { } _pull_sparse_status.resize(0); - for (auto table_id: _param_config->sparse_table_id) { + for (auto table_id : _param_config->sparse_table_id) { FillSparse(table_id); } } @@ -440,180 +442,198 @@ void AsyncExecutorThreadWorker::PushDense(int table_id) { void AsyncExecutorThreadWorker::PullSparse(int table_id) { - auto& features = _features[table_id]; - auto& feature_value = _feature_value[table_id]; - auto fea_dim = _param_config->fea_dim; - // slot id starts from 1 - features.clear(); - features.resize(0); - features.reserve(MAX_FEASIGN_NUM); - const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - // slot_idx = 0 is label TODO - for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); - LoDTensor* tensor = var->GetMutable(); - int64_t* ids = tensor->data(); - int len = tensor->numel(); - for (auto i = 0u; i < len; ++i) { - //todo: current trick - filter feasign=use_slot_mod(bug: datafeed fill use_slot_mod for empty slot) - if (ids[i] == 0u) { - continue; - } - features.push_back(static_cast(ids[i])); - } - } - check_pull_push_memory(features, feature_value, fea_dim); - - std::vector pull_feature_value; - for (auto i = 0u; i < features.size(); ++i) { - pull_feature_value.push_back(feature_value[i].data()); - } - for (int i = 0; i < features.size(); ++i) { + auto& features = _features[table_id]; + auto& feature_value = _feature_value[table_id]; + auto fea_dim = _param_config->fea_dim; + // slot id starts from 1 + features.clear(); + features.resize(0); + features.reserve(MAX_FEASIGN_NUM); + const std::vector& feed_vec = + thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + for (auto i = 0u; i < len; ++i) { + // todo(colourful-tree): current trick - filter feasign=use_slot_mod( + // bug: datafeed fill use_slot_mod for empty slot) + if (ids[i] == 0u) { + continue; + } + features.push_back(static_cast(ids[i])); } - auto status = _pslib_ptr->_worker_ptr->pull_sparse( - pull_feature_value.data(), table_id, features.data(), features.size()); - _pull_sparse_status.push_back(std::move(status)); - - auto& push_g = _feature_push_value[table_id]; - check_pull_push_memory(features, push_g, fea_dim); - - collect_feasign_info(table_id); + } + check_pull_push_memory(features, feature_value, fea_dim); + + std::vector pull_feature_value; + for (auto i = 0u; i < features.size(); ++i) { + pull_feature_value.push_back(feature_value[i].data()); + } + + auto status = _pslib_ptr->_worker_ptr->pull_sparse( + pull_feature_value.data(), table_id, features.data(), features.size()); + _pull_sparse_status.push_back(std::move(status)); + + auto& push_g = _feature_push_value[table_id]; + check_pull_push_memory(features, push_g, fea_dim); + + collect_feasign_info(table_id); } void AsyncExecutorThreadWorker::FillSparse(int table_id) { - auto slot_dim = _param_config->slot_dim; - auto fea_dim = _param_config->fea_dim; - auto& features = _features[table_id]; - auto& fea_value = _feature_value[table_id]; - - CHECK(features.size() > 0) << "feature size check failed"; - - auto fea_idx = 0u; - - std::vector init_value(fea_dim); - - const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - // slot_idx = 0 is label TODO - for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); - LoDTensor* tensor = var->GetMutable(); - int64_t* ids = tensor->data(); - int len = tensor->numel(); - Variable* var_emb = thread_scope_->FindVar(_param_config->slot_input_vec[table_id][slot_idx - 1]); - LoDTensor* tensor_emb = var_emb->GetMutable(); - float* ptr = tensor_emb->mutable_data({len, slot_dim}, platform::CPUPlace()); - memset(ptr, 0, sizeof(float) * len * slot_dim); - auto& tensor_lod = tensor->lod()[0]; - - LoD data_lod{tensor_lod}; - tensor_emb->set_lod(data_lod); - - for (auto index = 0u; index < len; ++index){ - if (ids[index] == 0u) { - memcpy(ptr + slot_dim * index, init_value.data() + 2, sizeof(float) * slot_dim); - continue; - } - memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2, sizeof(float) * slot_dim); - fea_idx++; - } + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; + auto& features = _features[table_id]; + auto& fea_value = _feature_value[table_id]; + + CHECK(features.size() > 0) << "feature size check failed"; + + auto fea_idx = 0u; + + std::vector init_value(fea_dim); + + const std::vector& feed_vec = + thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + Variable* var_emb = thread_scope_->FindVar( + _param_config->slot_input_vec[table_id][slot_idx - 1]); + LoDTensor* tensor_emb = var_emb->GetMutable(); + float* ptr = tensor_emb->mutable_data( + {len, slot_dim}, platform::CPUPlace()); + memset(ptr, 0, sizeof(float) * len * slot_dim); + auto& tensor_lod = tensor->lod()[0]; + + LoD data_lod{tensor_lod}; + tensor_emb->set_lod(data_lod); + + for (auto index = 0u; index < len; ++index) { + if (ids[index] == 0u) { + memcpy(ptr + slot_dim * index, + init_value.data() + 2, sizeof(float) * slot_dim); + continue; + } + memcpy(ptr + slot_dim * index, + fea_value[fea_idx].data() + 2, sizeof(float) * slot_dim); + fea_idx++; } + } } void AsyncExecutorThreadWorker::PushSparse(int table_id) { - auto slot_dim = _param_config->slot_dim; - auto fea_dim = _param_config->fea_dim; - auto& features = _features[table_id]; - CHECK(features.size() < 1000000) << "features size is too big, may be wrong:" << features.size(); - auto& push_g = _feature_push_value[table_id]; - check_pull_push_memory(features, push_g, fea_dim); - CHECK(push_g.size() == features.size() + 1) << "push_g size:" << push_g.size() << " features size:" << features.size(); - uint64_t fea_idx = 0u; - auto& fea_info = _fea_info[table_id]; - int offset = 2; - const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - // slot_idx = 0 is label - for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) == _param_config->slot_alias_to_table.end()) { - LOG(ERROR) << "ERROR slot_idx:" << slot_idx << " name:" << feed_vec[slot_idx]; - } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { - continue; - } - Variable* g_var = thread_scope_->FindVar(_param_config->gradient_var[table_id][slot_idx - 1]); - CHECK(g_var != nullptr) << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; - LoDTensor* g_tensor = g_var->GetMutable(); - if (g_tensor == NULL) { - LOG(ERROR) << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; - exit(-1); - } - float* g = g_tensor->data(); - - Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); - CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found"; - LoDTensor* tensor = var->GetMutable(); - if (tensor == NULL) { - LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found"; - exit(-1); - } - int len = tensor->numel(); - CHECK(slot_dim * len == g_tensor->numel()) << "len:" << len << " g_numel:" << g_tensor->numel(); - CHECK(len == tensor->numel()) << "len:" << len << "t_numel:" << tensor->numel(); - int64_t* ids = tensor->data(); - for (auto id_idx = 0u; id_idx < len; ++id_idx){ - if (ids[id_idx] == 0) { - g += slot_dim; - continue; - } - memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim); - push_g[fea_idx][0] = 1.0f; - CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx << " size:" << fea_info.size(); - push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); - g += slot_dim; - fea_idx++; - } + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; + auto& features = _features[table_id]; + auto& push_g = _feature_push_value[table_id]; + check_pull_push_memory(features, push_g, fea_dim); + CHECK(push_g.size() == features.size() + 1) << + "push_g size:" << push_g.size() << " features size:" << features.size(); + uint64_t fea_idx = 0u; + auto& fea_info = _fea_info[table_id]; + int offset = 2; + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + if (_param_config->slot_alias_to_table.find( + feed_vec[slot_idx]) == _param_config->slot_alias_to_table.end()) { + LOG(ERROR) << "ERROR slot_idx:" << slot_idx << + " name:" << feed_vec[slot_idx]; + } else if ( + _param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { + continue; } - CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx << " features size:" << features.size(); - CHECK(features.size() > 0); - - std::vector push_g_vec; - for (auto i = 0u; i < features.size(); ++i) { - push_g_vec.push_back(push_g[i].data()); + Variable* g_var = thread_scope_->FindVar( + _param_config->gradient_var[table_id][slot_idx - 1]); + CHECK(g_var != nullptr) << "var[" << + _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; + LoDTensor* g_tensor = g_var->GetMutable(); + if (g_tensor == NULL) { + LOG(ERROR) << "var[" << + _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; + exit(-1); + } + float* g = g_tensor->data(); + + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found"; + LoDTensor* tensor = var->GetMutable(); + if (tensor == NULL) { + LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found"; + exit(-1); + } + int len = tensor->numel(); + CHECK(slot_dim * len == g_tensor->numel()) << + "len:" << len << " g_numel:" << g_tensor->numel(); + CHECK(len == tensor->numel()) << "len:" << + len << "t_numel:" << tensor->numel(); + int64_t* ids = tensor->data(); + for (auto id_idx = 0u; id_idx < len; ++id_idx) { + if (ids[id_idx] == 0) { + g += slot_dim; + continue; + } + memcpy(push_g[fea_idx].data() + offset, + g, sizeof(float) * slot_dim); + push_g[fea_idx][0] = 1.0f; + CHECK(fea_idx < fea_info.size()) << "fea_idx:" << + fea_idx << " size:" << fea_info.size(); + push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); + g += slot_dim; + fea_idx++; } - auto status = _pslib_ptr->_worker_ptr->push_sparse( - table_id, features.data(), (const float**)push_g_vec.data(), features.size()); - _push_sparse_status.push_back(std::move(status)); + } + CHECK(fea_idx == features.size()) << "fea_idx:" << + fea_idx << " features size:" << features.size(); + CHECK_GT(features.size(), 0); + + std::vector push_g_vec; + for (auto i = 0u; i < features.size(); ++i) { + push_g_vec.push_back(push_g[i].data()); + } + auto status = _pslib_ptr->_worker_ptr->push_sparse( + table_id, features.data(), + (const float**)push_g_vec.data(), features.size()); + _push_sparse_status.push_back(std::move(status)); } void AsyncExecutorThreadWorker::collect_feasign_info( - int table_id) { - auto& fea_info = _fea_info[table_id]; - auto& feature = _features[table_id]; - fea_info.resize(feature.size()); - - const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - Variable* var = thread_scope_->FindVar(feed_vec[0]); + int table_id) { + auto& fea_info = _fea_info[table_id]; + auto& feature = _features[table_id]; + fea_info.resize(feature.size()); + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + Variable* var = thread_scope_->FindVar(feed_vec[0]); + LoDTensor* tensor = var->GetMutable(); + int64_t* label = tensor->data(); + + int global_index = 0; + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); LoDTensor* tensor = var->GetMutable(); - int64_t* label = tensor->data(); - - int global_index = 0; - for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); - LoDTensor* tensor = var->GetMutable(); - int64_t* ids = tensor->data(); - - int fea_idx = 0; - for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) { - for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) { - if (ids[fea_idx] == 0u) { - continue; - } - FeasignInfo info{slot_idx, ins_idx, label[ins_idx - 1]}; - - fea_info[global_index++] = std::move(info); - } + int64_t* ids = tensor->data(); + + int fea_idx = 0; + for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) { + for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) { + if (ids[fea_idx] == 0u) { + continue; } + FeasignInfo info{slot_idx, ins_idx, label[ins_idx - 1]}; + + fea_info[global_index++] = std::move(info); + } } - CHECK(global_index == feature.size()) << "expect fea info size:" << feature.size() - << " real:" << global_index; + } + CHECK(global_index == feature.size()) << + "expect fea info size:" << feature.size() + << " real:" << global_index; } void AsyncExecutorThreadWorker::check_pull_push_memory( diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 4e9c2622b0..b6c4f950ec 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -35,21 +35,22 @@ const static uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; void CreateTensor(Variable* var, proto::VarType::Type var_type); struct AsyncWorkerParamConfig { - int slot_dim; - int fea_dim; - int32_t tmp_push_dense_wait_times; - int32_t tmp_push_sparse_wait_times; - - std::vector skip_op; - - std::map> dense_variable_name; - std::map> dense_gradient_variable_name; - std::vector dense_table_id; - std::vector dense_table_size; // fea_dim for each dense table - std::vector sparse_table_id; - std::map> slot_input_vec; //6048slot 6050slot //name - std::map> gradient_var; //6048slot_embed - std::map slot_alias_to_table; //TODO done + int slot_dim; + int fea_dim; + int32_t tmp_push_dense_wait_times; + int32_t tmp_push_sparse_wait_times; + + std::vector skip_op; + + std::map> dense_variable_name; + std::map> dense_gradient_variable_name; + std::vector dense_table_id; + // fea_dim for each dense table + std::vector dense_table_size; + std::vector sparse_table_id; + std::map> slot_input_vec; + std::map> gradient_var; + std::map slot_alias_to_table; }; struct DensePullThreadParam { @@ -62,8 +63,8 @@ struct DensePullThreadParam { }; class DensePullThread { -public: - DensePullThread(DensePullThreadParam& param) : + public: + explicit DensePullThread(const DensePullThreadParam& param) : _running(false) { _ps_client = param.ps_client; _threshold = param.threshold; @@ -96,11 +97,11 @@ public: void pull_dense2(uint64_t table_id); void wait_all(); -private: + private: void run(); bool check_update_param(uint64_t table_id); -private: + private: std::shared_ptr _ps_client; int _thread_num; int _threshold; @@ -153,9 +154,13 @@ class ExecutorThreadWorker { virtual void TrainFiles(); // set fetch variable names from python interface assigned by users void SetFetchVarNames(const std::vector& fetch_var_names); - virtual void SetPSlibPtr(std::shared_ptr pslib_ptr); - virtual void SetPullDenseThread(std::shared_ptr dpt) {}; - virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {}; + virtual void SetPSlibPtr( + std::shared_ptr pslib_ptr); + virtual void SetPullDenseThread( + std::shared_ptr dpt) {} + virtual void SetParamConfig( + AsyncWorkerParamConfig * param_config) {} + private: void CreateThreadScope(const framework::ProgramDesc& program); void CreateThreadOperators(const framework::ProgramDesc& program); @@ -178,32 +183,37 @@ class ExecutorThreadWorker { Scope* root_scope_; // a thread scope, father scope is global score which is shared Scope* thread_scope_; - //private: std::vector fetch_var_names_; std::vector> fetch_values_; bool debug_; }; class AsyncExecutorThreadWorker: public ExecutorThreadWorker { -public: - AsyncExecutorThreadWorker(){}; - virtual ~AsyncExecutorThreadWorker() {} - void SetPSlibPtr(std::shared_ptr pslib_ptr); - void SetPullDenseThread(std::shared_ptr dpt); - void SetParamConfig(AsyncWorkerParamConfig* param_config); - void TrainFiles(); - void TrainOneNetwork(); - void PrepareParams(); - void UpdateParams(); - void PullSparse(int table_id); - void FillSparse(int table_id); - void PushSparse(int table_id); - void PushDense(int table_id); - - void check_pull_push_memory(std::vector& features, std::vector& push_g, int dim); - void check_pull_push_memory(std::vector& features, std::vector>& push_g, int dim); + public: + AsyncExecutorThreadWorker() {} + virtual ~AsyncExecutorThreadWorker() {} + void SetPSlibPtr(std::shared_ptr pslib_ptr); + void SetPullDenseThread(std::shared_ptr dpt); + void SetParamConfig(AsyncWorkerParamConfig* param_config); + void TrainFiles(); + void TrainOneNetwork(); + void PrepareParams(); + void UpdateParams(); + void PullSparse(int table_id); + void FillSparse(int table_id); + void PushSparse(int table_id); + void PushDense(int table_id); + + void check_pull_push_memory( + const std::vector& features, + std::vector& push_g, + int dim); + void check_pull_push_memory(const std::vector& features, + std::vector>& push_g, + int dim); void collect_feasign_info(int table_id); -private: + + private: struct FeasignInfo { uint32_t slot; uint32_t ins; From 2912d5311bccc3b89dd32a0e80f48be41ba7d1bc Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Thu, 13 Dec 2018 13:21:39 +0800 Subject: [PATCH 281/719] fix code style bug & change pslib.cmake & change Cmakelist adapt pslib --- CMakeLists.txt | 19 +++++++++++++------ cmake/external/pslib.cmake | 11 ++++++----- paddle/fluid/framework/async_executor.cc | 7 +++---- paddle/fluid/framework/async_executor.h | 4 ++-- paddle/fluid/framework/data_feed.cc | 1 + .../fluid/framework/executor_thread_worker.cc | 4 ++-- .../fluid/framework/executor_thread_worker.h | 2 +- 7 files changed, 28 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b5bf6c5b6..c3b4349c8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,7 @@ option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(WITH_DISTRIBUTE "Compile with distributed support" OFF) +option(WITH_PSLIB "Compile with pslib support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) @@ -216,9 +217,12 @@ include(external/warpctc) # download, build, install warpctc include(cupti) include(external/gzstream) endif (NOT WIN32) -include(external/libmct) -include(external/pslib_brpc) -include(external/pslib) + +if(WITH_PSLIB) + include(external/libmct) + include(external/pslib_brpc) + include(external/pslib) +endif() if(WITH_DISTRIBUTE) if(WITH_GRPC) @@ -279,11 +283,14 @@ set(EXTERNAL_LIBS protobuf zlib ${PYTHON_LIBRARIES} - pslib - pslib_brpc - libmct ) +if(WITH_PSLIB) + list(APPEND EXTERNAL_LIBS pslib) + list(APPEND EXTERNAL_LIBS pslib_brpc) + list(APPEND EXTERNAL_LIBS libmct) +endif(WITH_PSLIB) + if(WITH_AMD_GPU) find_package(HIP) include(hip) diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index 4d4dc195aa..3b495d78e2 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -30,9 +30,10 @@ SET(PSLIB_PROJECT "extern_pslib") IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL)) MESSAGE(STATUS "use pre defined download url") SET(PSLIB_VER "0.1.0" CACHE STRING "" FORCE) - SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/pslib.tar.gz" CACHE STRING "" FORCE) + SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE) + SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/${PSLIB_NAME}.tar.gz" CACHE STRING "" FORCE) ENDIF() -MESSAGE(STATUS "PSLIB_VER: ${PSLIB_VER}, PSLIB_URL: ${PSLIB_URL}") +MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}") SET(PSLIB_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib") SET(PSLIB_DOWNLOAD_DIR "${PSLIB_SOURCE_DIR}/src/${PSLIB_PROJECT}") SET(PSLIB_DST_DIR "pslib") @@ -50,7 +51,7 @@ INCLUDE_DIRECTORIES(${PSLIB_INC_DIR}) FILE(WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt "PROJECT(PSLIB)\n" "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${PSLIB_VER}/include ${PSLIB_VER}/lib \n" + "install(DIRECTORY ${PSLIB_NAME}/include ${PSLIB_NAME}/lib \n" " DESTINATION ${PSLIB_DST_DIR})\n") ExternalProject_Add( @@ -58,8 +59,8 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${PSLIB_SOURCE_DIR} DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_VER}.tar.gz - && tar zxvf ${PSLIB_VER}.tar.gz + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz + && tar zxvf ${PSLIB_NAME}.tar.gz DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT} diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index c62d62a5dc..8231aff142 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -50,7 +50,6 @@ void AsyncExecutor::CreateThreads( worker->BindingDataFeedMemory(); worker->SetPSlibPtr(_pslib_ptr); worker->SetPullDenseThread(_pull_dense_thread); - worker->BindingSlotVariableMemory(); worker->SetParamConfig(&_param_config); } @@ -79,7 +78,7 @@ void AsyncExecutor::InitWorker(const std::string& dist_desc, _pslib_ptr = std::shared_ptr( new paddle::distributed::PSlib()); _pslib_ptr->init_worker( - dist_desc, host_sign_list.data(), node_num, index); + dist_desc, (uint64_t*)(host_sign_list.data()), node_num, index); InitParamConfig(); } @@ -93,8 +92,8 @@ void AsyncExecutor::StopServer() { } void AsyncExecutor::GatherServers( - std::vector& host_sign_list, int node_num) { - _pslib_ptr->gather_servers(host_sign_list.data(), node_num); + const std::vector& host_sign_list, int node_num) { + _pslib_ptr->gather_servers((uint64_t*)(host_sign_list.data()), node_num); } void AsyncExecutor::InitParamConfig() { diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 184566dd39..16540c2df2 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -43,9 +43,9 @@ inline std::default_random_engine& local_random_engine() { struct engine_wrapper_t { std::default_random_engine engine; engine_wrapper_t() { - static std::atomic x(0); + static std::atomic x(0); std::seed_seq sseq = {x++, x++, x++, - static_cast(current_realtime() * 1000)}; + static_cast(current_realtime() * 1000)}; engine.seed(sseq); } }; diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 851c7eda89..54a00f8ccf 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -68,6 +68,7 @@ bool DataFeed::PickOneFile(std::string* filename) { return false; } *filename = filelist_[file_idx_++]; + LOG(ERROR) << "pick file:" << *filename; return true; } diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 412f4a2b6e..df15a4d293 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -637,7 +637,7 @@ void AsyncExecutorThreadWorker::collect_feasign_info( } void AsyncExecutorThreadWorker::check_pull_push_memory( - std::vector& features, + const std::vector& features, std::vector>& push_g, int dim) { push_g.resize(features.size() + 1); @@ -647,7 +647,7 @@ void AsyncExecutorThreadWorker::check_pull_push_memory( } void AsyncExecutorThreadWorker::check_pull_push_memory( - std::vector& features, + const std::vector& features, std::vector& push_g, int dim) { if (features.size() > push_g.size()) { diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index b6c4f950ec..93373b1d2e 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -155,7 +155,7 @@ class ExecutorThreadWorker { // set fetch variable names from python interface assigned by users void SetFetchVarNames(const std::vector& fetch_var_names); virtual void SetPSlibPtr( - std::shared_ptr pslib_ptr); + std::shared_ptr pslib_ptr) {}; virtual void SetPullDenseThread( std::shared_ptr dpt) {} virtual void SetParamConfig( From 23eb8c4299ce9908d07505df413c4a2b79f14d32 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Dec 2018 14:02:15 +0800 Subject: [PATCH 282/719] fix ci test=develop --- .../framework/details/multi_devices_graph_pass.cc | 10 +++++++--- paddle/fluid/operators/reader/ctr_reader.h | 2 +- paddle/fluid/pybind/pybind.cc | 13 ++++++++++++- .../unittests/test_parallel_executor_dry_run.py | 10 ++++++---- 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index e264906b57..6c4e0e9168 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -386,12 +386,16 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } - // if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { - // insert synchronous ops at the backpropagation; and - // insert synchronous ops if the graph contains mutilple places. +// insert synchronous ops at the backpropagation; and +// insert synchronous ops if the graph contains mutilple places. + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (!is_forwarding && (places_.size() > 1 || num_trainers > 1 || (nccl_ctxs_ && nccl_ctxs_->contexts_.size() > 1))) { +#else + if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { +#endif // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. if (static_cast(boost::get(node->Op()->GetAttr( diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 517d669744..635483158f 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -95,7 +95,7 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (int thread_id = 0; thread_id < thread_num_; thread_id++) { + for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { read_threads_.emplace_back(new std::thread( std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, thread_id, &read_thread_status_, queue_))); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 9cebdda693..3beb93e7b3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -789,7 +789,18 @@ All parameter, weight, gradient are variables in Paddle. [](ExecutionStrategy &self, ExecutionStrategy::ExecutorType type) { self.type_ = type; }, - R"DOC()DOC"); + R"DOC(The type is ExecutorType which is the enum ranging from Default, +ParallelGraph and Experiment: + +Default: Compile the main_program into a multi-devices graph, + and execute this graph on multi-devices with multiple threads which + specified by build_strategy.num_threads. +ParallelGraph: Compile the main_program into multiple graphs, and execute each of the graphs on one + device with one thread. Please note, this mode only supports all-reduce mode and use_cuda=True. + This approach can achieve better performance in some scenarios. +Experimental: Compile the main_program into a multi-devices graph, + and executor this graph with a faster execution mode than the Default, + this approach is on the experiments.)DOC"); py::class_ build_strategy(pe, "BuildStrategy", R"DOC( BuildStrategy allows the user to more preciously control how to diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py index 18d95c94ad..eff76ce0d4 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py @@ -17,6 +17,8 @@ import unittest import logging import six +ExecutorType = fluid.ExecutionStrategy().ExecutorType + class TestBase(unittest.TestCase): def main(self, @@ -24,7 +26,7 @@ class TestBase(unittest.TestCase): iter=10, iter_per_pe=10, use_gpu=True, - use_experimental_executor=False): + exec_type=ExecutorType.Default): if use_gpu and not fluid.core.is_compiled_with_cuda(): logging.warning( "Paddle is not compiled with CUDA, skip GPU unittests") @@ -43,7 +45,7 @@ class TestBase(unittest.TestCase): for _ in six.moves.xrange(iter): exe_strategy = fluid.ExecutionStrategy() exe_strategy._dry_run = True - exe_strategy.use_experimental_executor = use_experimental_executor + exe_strategy.executor_type = exec_type pe = fluid.ParallelExecutor( use_cuda=use_gpu, loss_name=loss.name, @@ -56,11 +58,11 @@ class TestBase(unittest.TestCase): class TestMNISTDryRun(TestBase): def test_mnist_dry_run(self): for use_gpu in (False, True): - for use_experimental_executor in (False, True): + for exec_type in (ExecutorType.Default, ExecutorType.Experimental): self.main( network_func=TestMNISTDryRun.network_func, use_gpu=use_gpu, - use_experimental_executor=use_experimental_executor) + exec_type=exec_type) @staticmethod def network_func(): From fa1f77e20ca2134f52ab01049a7070a2f0a9a3c8 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 12 Dec 2018 18:14:03 +0800 Subject: [PATCH 283/719] enable ci test=develop --- .../fluid/tests/unittests/test_dist_base.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 26fa20291b..8456651266 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -224,6 +224,7 @@ class TestDistBase(unittest.TestCase): def setUp(self): self._trainers = 2 self._pservers = 2 + self._port_set = set() self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( self._find_free_port(), self._find_free_port()) self._python_interp = sys.executable @@ -238,9 +239,17 @@ class TestDistBase(unittest.TestCase): self._after_setup_config() def _find_free_port(self): - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: - s.bind(('', 0)) - return s.getsockname()[1] + def __free_port(): + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + while True: + port = __free_port() + if port not in self._port_set: + self._port_set.add(port) + return port def start_pserver(self, model_file, check_error_log, required_envs): ps0_ep, ps1_ep = self._ps_endpoints.split(",") From deb0d41cea15db2b24aff269e2f84bd68eeaa919 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 12 Dec 2018 13:33:30 +0000 Subject: [PATCH 284/719] fix cmake fix cmake again test=develop --- paddle/fluid/operators/CMakeLists.txt | 9 +++++--- paddle/fluid/operators/py_func_op.cc | 4 ++-- paddle/fluid/operators/py_func_op.h | 2 +- paddle/fluid/pybind/CMakeLists.txt | 3 +++ python/paddle/fluid/layers/nn.py | 31 ++++++++++----------------- 5 files changed, 23 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9379122faf..23508ebe7c 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -42,8 +42,7 @@ if (WITH_DISTRIBUTE) SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch) endif() -register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) - +register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) # warpctc_op needs cudnn 7 above if (WITH_GPU AND NOT WIN32) @@ -82,7 +81,7 @@ endif() # op_library(unstack_op DEPS stack_op) # op_library(tensor_array_to_tensor_op DEPS concat_op) -set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS} python pybind) +set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS}) set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(gather_test SRCS gather_test.cc DEPS tensor) @@ -94,4 +93,8 @@ cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) +if (WITH_PYTHON) + cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind) +endif() + set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 32c44c3bc2..90a6433366 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -22,7 +22,7 @@ namespace paddle { namespace operators { -namespace py = pybind11; +namespace py = ::pybind11; static std::vector g_py_callables; @@ -30,7 +30,7 @@ const char kForwardPythonCallableId[] = "forward_callable_id"; const char kBackwardPythonCallableId[] = "backward_callable_id"; const char kPyFuncBackwardSkipVars[] = "backward_skip_vars"; -size_t AppendPythonCallableObjectAndReturnId(py::object py_obj) { +size_t AppendPythonCallableObjectAndReturnId(const py::object &py_obj) { g_py_callables.emplace_back(py_obj); return g_py_callables.size() - 1; } diff --git a/paddle/fluid/operators/py_func_op.h b/paddle/fluid/operators/py_func_op.h index e85fa6b5bc..4ba06bf598 100644 --- a/paddle/fluid/operators/py_func_op.h +++ b/paddle/fluid/operators/py_func_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -size_t AppendPythonCallableObjectAndReturnId(pybind11::object py_obj); +size_t AppendPythonCallableObjectAndReturnId(const ::pybind11::object &py_obj); } // namespace operators } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b8954cb126..b75790e4fe 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,8 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) +if(WITH_PYTHON) + list(APPEND PYBIND_DEPS py_func_op) +endif() set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc) if(WITH_PYTHON) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d71368644d..debe0ff0c9 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9173,31 +9173,22 @@ class PyFuncWrapper(object): kwargs[arg] = args[idx] idx += 1 - ret0 = self._func(*args[idx:], **kwargs) - if ret0 is None: - return None - - if not isinstance(ret0, (list, tuple)): - ret0 = (ret0, ) + func_ret = self._func(*args[idx:], **kwargs) + if not isinstance(func_ret, (list, tuple)): + func_ret = (func_ret, ) ret = [] - for i in six.moves.range(len(ret0)): - if ret0[i] is None: - ret.append(None) - continue - - if isinstance(ret0[i], core.LoDTensor): - ret.append(ret0[i]) + for each_ret in func_ret: + if each_ret is None or isinstance(each_ret, core.LoDTensor): + ret.append(each_ret) continue - if isinstance(ret0[i], np.ndarray): - r = ret0[i] - else: - r = np.array(ret0[i]) + if not isinstance(each_ret, np.ndarray): + each_ret = np.array(each_ret) - t = core.LoDTensor() - t.set(r, core.CPUPlace()) - ret.append(t) + tensor = core.LoDTensor() + tensor.set(each_ret, core.CPUPlace()) + ret.append(tensor) return tuple(ret) From affdd70976f1ad2a2de959ceb082b95791961d91 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Dec 2018 14:26:52 +0800 Subject: [PATCH 285/719] fix api.spec test=develop --- paddle/fluid/API.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 2722ea078e..db7b0d34a3 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -27,6 +27,7 @@ paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None +paddle.fluid.ExecutionStrategy.ExecutorType.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy.ExecutorType, arg0: int) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None From 2328bee1cc835d789b83cd4da9bef6b588bc87c5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 13 Dec 2018 06:34:09 +0000 Subject: [PATCH 286/719] fix Windows compile bug test=develop --- .../framework/details/eager_deletion_op_handle.cc | 6 +++--- paddle/fluid/framework/executor.cc | 10 ++++++---- paddle/fluid/framework/tensor.h | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 3b27415e43..abacb11e3b 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -77,14 +77,14 @@ void EagerDeletionOpHandle::RunImpl() { VLOG(2) << "Erase variable " << name; if (var->IsType()) { - garbages.emplace_back(var->GetMutable()->MoveMemory()); + garbages.emplace_back(var->GetMutable()->MoveMemoryHolder()); } else if (var->IsType()) { garbages.emplace_back( - var->GetMutable()->mutable_value()->MoveMemory()); + var->GetMutable()->mutable_value()->MoveMemoryHolder()); } else if (var->IsType()) { auto *tensor_arr = var->GetMutable(); for (auto &t : *tensor_arr) { - garbages.emplace_back(t.MoveMemory()); + garbages.emplace_back(t.MoveMemoryHolder()); } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 16c4552a5f..0c4bd336c5 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -106,14 +106,16 @@ static void DeleteUnusedTensors( VLOG(2) << "Erase variable " << name; if (var->IsType()) { - garbages.emplace_back(var->GetMutable()->MoveMemory()); - } else if (var->IsType()) { garbages.emplace_back( - var->GetMutable()->mutable_value()->MoveMemory()); + var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType()) { + garbages.emplace_back(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); } else if (var->IsType()) { auto* lod_tensor_arr = var->GetMutable(); for (auto& t : *lod_tensor_arr) { - garbages.emplace_back(t.MoveMemory()); + garbages.emplace_back(t.MoveMemoryHolder()); } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 9f7027f5ae..153222506a 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -158,7 +158,7 @@ class Tensor { const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } - std::shared_ptr MoveMemory() { + std::shared_ptr MoveMemoryHolder() { return std::move(holder_); } From 15550a27536012a92e2e7badaee7b41afff31f3e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 13 Dec 2018 14:46:22 +0800 Subject: [PATCH 287/719] Polish code --- cmake/external/python.cmake | 6 +- .../fluid/operators/math/matrix_bit_code.cc | 486 ++++++++++++------ paddle/fluid/operators/math/matrix_bit_code.h | 63 +-- 3 files changed, 340 insertions(+), 215 deletions(-) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index a3599dd798..52ad02a355 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -18,8 +18,8 @@ ENDIF() INCLUDE(python_module) -FIND_PACKAGE(PythonInterp ${PY_VERSION}) -FIND_PACKAGE(PythonLibs ${PY_VERSION}) +FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED) +FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED) if(WIN32) execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" @@ -79,6 +79,6 @@ IF(PYTHONINTERP_FOUND) "please use pip to upgrade protobuf. pip install -U protobuf") ENDIF() ENDIF(PYTHONINTERP_FOUND) - +message(STATUS ${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 92affa0e4e..d55e832cc2 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -15,225 +15,379 @@ limitations under the License. */ #include "paddle/fluid/operators/math/matrix_bit_code.h" #include #include + namespace paddle { namespace operators { namespace math { template -void MatrixBitCodeFunctor::Add(const framework::Tensor& vec, - framework::Tensor* tmat) { - size_t batch_size = tmat->dims()[0]; - size_t width = tmat->dims()[1]; - auto* tmat_data = tmat->data(); - auto* vec_data = vec.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - tmat_data[i * width + j] += vec_data[index]; +struct MatrixBitCodeFunctorAdd : public boost::static_visitor { + const framework::Tensor &vec_; + framework::Tensor *tmat_; + + MatrixBitCodeFunctorAdd(const framework::Tensor &vec, framework::Tensor *tmat) + : vec_(vec), tmat_(tmat) {} + + template + void operator()(const CodeTable &code_table) { + size_t batch_size = tmat_->dims()[0]; + size_t width = tmat_->dims()[1]; + auto *tmat_data = tmat_->data(); + auto *vec_data = vec_.data(); + for (size_t i = 0; i < batch_size; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + tmat_data[i * width + j] += vec_data[index]; + } } } +}; + +template +void MatrixBitCodeFunctor::Add(const framework::Tensor &vec, + framework::Tensor *tmat) { + MatrixBitCodeFunctorAdd func(vec, tmat); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, - framework::Tensor* vec) { - size_t batch_size = tmat.dims()[0]; - size_t width = tmat.dims()[1]; - auto* vec_data = vec->data(); - auto* tmat_data = tmat.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - vec_data[index] += tmat_data[i * width + j]; +struct MatrixBitCodeFunctorAddGrad : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::Tensor *vec_; + MatrixBitCodeFunctorAddGrad(const framework::Tensor &tmat, + framework::Tensor *vec) + : tmat_(tmat), vec_(vec) {} + + template + void operator()(const CodeTable &table) { + size_t batch_size = tmat_.dims()[0]; + size_t width = tmat_.dims()[1]; + auto *vec_data = vec_->data(); + auto *tmat_data = tmat_.data(); + for (size_t i = 0; i < batch_size; ++i) { + auto code = table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + vec_data[index] += tmat_data[i * width + j]; + } } } +}; + +template +void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, + framework::Tensor *vec) { + MatrixBitCodeFunctorAddGrad func(tmat, vec); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, - framework::SelectedRows* vec) { - size_t batch_size = tmat.dims()[0]; - size_t width = tmat.dims()[1]; - auto* vec_data = vec->mutable_value()->data(); - auto* tmat_data = tmat.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - int64_t row_index = vec->GetIndexFromId(static_cast(index)); - vec_data[row_index] += tmat_data[i * width + j]; +struct MatrixBitCodeFunctorSelectedRowsAddGrad + : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::SelectedRows *vec_; + + MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat, + framework::SelectedRows *vec) + : tmat_(tmat), vec_(vec) {} + + template + void operator()(const CodeTable &code_table) { + size_t batch_size = tmat_.dims()[0]; + size_t width = tmat_.dims()[1]; + auto *vec_data = vec_->mutable_value()->template data(); + auto *tmat_data = tmat_.data(); + for (size_t i = 0; i < batch_size; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + int64_t row_index = vec_->GetIndexFromId(static_cast(index)); + vec_data[row_index] += tmat_data[i * width + j]; + } } } +}; + +template +void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, + framework::SelectedRows *vec) { + MatrixBitCodeFunctorSelectedRowsAddGrad func(tmat, vec); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, - framework::Tensor* sum, T scale_sum) { - size_t num_samples = tmat.dims()[0]; - size_t o_width = tmat.dims()[1]; - auto* tmat_data = tmat.data(); - auto* sum_data = sum->data(); - for (size_t i = 0; i < num_samples; ++i) { - T sm = static_cast(0.0); - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - if (code->calc_bit(j)) { - // calc_bit starts from right most bit, while data in tmat[i] is in the - // reverse order. - sm += tmat_data[i * o_width + j]; +struct MatrixBitCodeFunctorSum : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::Tensor *sum_; + T scale_sum_; + + MatrixBitCodeFunctorSum(const framework::Tensor &tmat, framework::Tensor *sum, + T scale_sum) + : tmat_(tmat), sum_(sum), scale_sum_(scale_sum) {} + + template + void operator()(const CodeTable &code_table) { + size_t num_samples = tmat_.dims()[0]; + size_t o_width = tmat_.dims()[1]; + auto *tmat_data = tmat_.data(); + auto *sum_data = sum_->data(); + for (size_t i = 0; i < num_samples; ++i) { + T sm = static_cast(0.0); + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + if (code.calc_bit(j)) { + // calc_bit starts from right most bit, while data in tmat[i] is in + // the + // reverse order. + sm += tmat_data[i * o_width + j]; + } } + sum_data[i] = scale_sum_ * sm; } - sum_data[i] = scale_sum * sm; } +}; + +template +void MatrixBitCodeFunctor::Sum(const framework::Tensor &tmat, + framework::Tensor *sum, T scale_sum) { + MatrixBitCodeFunctorSum func(tmat, sum, scale_sum); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, - const framework::Tensor& weight, - const framework::Tensor& input) { - auto blas = - GetBlas(platform::CPUDeviceContext()); - size_t num_samples = tmat->dims()[0]; - size_t tmat_width = tmat->dims()[1]; - size_t input_width = input.dims()[1]; - size_t weight_width = weight.dims()[1]; - auto tmat_value = tmat->data(); - auto weight_value = weight.data(); - auto input_value = input.data(); - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - const T* input_row = input_value + input_width * i; - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - const T* weight_row = weight_value + weight_width * index; - T sum = static_cast(0.0); - sum = blas.DOT(input_width, weight_row, input_row); - tmat_value[i * tmat_width + j] += sum; +struct MatrixBitCodeFunctorMul : public boost::static_visitor { + framework::Tensor *tmat_; + const framework::Tensor &weight_; + const framework::Tensor &input_; + + MatrixBitCodeFunctorMul(framework::Tensor *tmat, + const framework::Tensor &weight, + const framework::Tensor &input) + : tmat_(tmat), weight_(weight), input_(input) {} + + template + void operator()(const CodeTable &code_table) { + auto blas = + GetBlas(platform::CPUDeviceContext()); + size_t num_samples = tmat_->dims()[0]; + size_t tmat_width = tmat_->dims()[1]; + size_t input_width = input_.dims()[1]; + size_t weight_width = weight_.dims()[1]; + auto tmat_value = tmat_->data(); + auto weight_value = weight_.data(); + auto input_value = input_.data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + const T *input_row = input_value + input_width * i; + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + const T *weight_row = weight_value + weight_width * index; + T sum = blas.DOT(input_width, weight_row, input_row); + tmat_value[i * tmat_width + j] += sum; + } } } +}; + +template +void MatrixBitCodeFunctor::Mul(framework::Tensor *tmat, + const framework::Tensor &weight, + const framework::Tensor &input) { + MatrixBitCodeFunctorMul func(tmat, weight, input); + code_table_.apply_visitor(func); } +template +class ReservedVector : public std::vector { + public: + ReservedVector() { this->reserve(N); } +}; + template -void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, - framework::Tensor* weight, - const framework::Tensor& input) { - auto blas = - GetBlas(platform::CPUDeviceContext()); - size_t num_samples = tmat.dims()[0]; - size_t input_width = input.dims()[1]; - size_t tmat_width = tmat.dims()[1]; - size_t weight_width = weight->dims()[1]; - auto tmat_value = tmat.data(); - auto weight_value = weight->data(); - auto input_value = input.data(); - - std::map>> ops; - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - const T* input_value_row = input_value + input_width * i; - const T* tmat_row = tmat_value + i * tmat_width; - for (int j = 0; j < code_length; ++j) { - ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row); +struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::Tensor *weight_; + const framework::Tensor &input_; + MatrixBitCodeFunctorMulGradWeight(const framework::Tensor &tmat, + framework::Tensor *weight, + const framework::Tensor &input) + : tmat_(tmat), weight_(weight), input_(input) {} + template + void operator()(const CodeTable &code_table) { + auto blas = + GetBlas(platform::CPUDeviceContext()); + size_t num_samples = tmat_.dims()[0]; + size_t input_width = input_.dims()[1]; + size_t tmat_width = tmat_.dims()[1]; + size_t weight_width = weight_->dims()[1]; + auto tmat_value = tmat_.data(); + auto weight_value = weight_->data(); + auto input_value = input_.data(); + + std::map, 8u>> ops; + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + const T *input_value_row = input_value + input_width * i; + const T *tmat_row = tmat_value + i * tmat_width; + for (int j = 0; j < code_length; ++j) { + ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } } - } - for (auto& op : ops) { - auto& op_in_row = op.second; - for (auto& pair : op_in_row) { - auto& scale = pair.first; - auto* input_row = pair.second; - T* weight_row = weight_value + op.first * weight_width; - blas.AXPY(input_width, scale, input_row, weight_row); + for (auto &op : ops) { + auto &op_in_row = op.second; + for (auto &pair : op_in_row) { + auto &scale = pair.first; + auto *input_row = pair.second; + T *weight_row = weight_value + op.first * weight_width; + blas.AXPY(input_width, scale, input_row, weight_row); + } } } +}; + +template +void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, + framework::Tensor *weight, + const framework::Tensor &input) { + MatrixBitCodeFunctorMulGradWeight func(tmat, weight, input); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, - framework::SelectedRows* weight, - const framework::Tensor& input) { - auto blas = - GetBlas(platform::CPUDeviceContext()); - size_t num_samples = tmat.dims()[0]; - size_t input_width = input.dims()[1]; - size_t tmat_width = tmat.dims()[1]; - size_t weight_width = weight->value().dims()[1]; - auto tmat_value = tmat.data(); - auto weight_value = weight->mutable_value()->data(); - auto input_value = input.data(); - - std::unordered_map>> ops; - ops.reserve(weight->rows().size()); - - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - const T* input_value_row = input_value + input_width * i; - const T* tmat_row = tmat_value + i * tmat_width; - for (int j = 0; j < code_length; ++j) { - ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row); +struct MatrixBitCodeFunctorMulGradWeightSR + : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::SelectedRows *weight_; + const framework::Tensor &input_; + + MatrixBitCodeFunctorMulGradWeightSR(const framework::Tensor &tmat, + framework::SelectedRows *weight, + const framework::Tensor &input) + : tmat_(tmat), weight_(weight), input_(input) {} + + template + void operator()(const CodeTable &code_table) { + auto blas = + GetBlas(platform::CPUDeviceContext()); + size_t num_samples = tmat_.dims()[0]; + size_t input_width = input_.dims()[1]; + size_t tmat_width = tmat_.dims()[1]; + size_t weight_width = weight_->value().dims()[1]; + auto tmat_value = tmat_.data(); + auto weight_value = weight_->mutable_value()->data(); + auto input_value = input_.data(); + + std::unordered_map>> ops; + ops.reserve(weight_->rows().size()); + + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + const T *input_value_row = input_value + input_width * i; + const T *tmat_row = tmat_value + i * tmat_width; + for (int j = 0; j < code_length; ++j) { + ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } } - } - for (auto& row : weight->rows()) { - auto& op_in_row = ops[row]; - for (auto& pair : op_in_row) { - auto& scale = pair.first; - auto* input_row = pair.second; - blas.AXPY(input_width, scale, input_row, weight_value); + for (auto &row : weight_->rows()) { + auto &op_in_row = ops[row]; + for (auto &pair : op_in_row) { + auto &scale = pair.first; + auto *input_row = pair.second; + blas.AXPY(input_width, scale, input_row, weight_value); + } + weight_value += weight_width; } - weight_value += weight_width; } +}; + +template +void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, + framework::SelectedRows *weight, + const framework::Tensor &input) { + MatrixBitCodeFunctorMulGradWeightSR func(tmat, weight, input); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, - const framework::Tensor& weight, - framework::Tensor* input) { - size_t num_samples = tmat.dims()[0]; - size_t tmat_width = tmat.dims()[1]; - size_t input_width = input->dims()[1]; - size_t weight_width = weight.dims()[1]; - auto tmat_value = tmat.data(); - auto weight_value = weight.data(); - auto input_value = input->data(); - - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - - for (size_t k = 0; k < input_width; ++k) { - input_value[input_width * i + k] += - tmat_value[i * tmat_width + j] * - weight_value[weight_width * index + k]; +struct MatrixBitCodeFunctorMulGradError : public boost::static_visitor { + const framework::Tensor &tmat_; + const framework::Tensor &weight_; + framework::Tensor *input_; + + MatrixBitCodeFunctorMulGradError(const framework::Tensor &tmat, + const framework::Tensor &weight, + framework::Tensor *input) + : tmat_(tmat), weight_(weight), input_(input) {} + template + void operator()(const CodeTable &code_table) { + size_t num_samples = tmat_.dims()[0]; + size_t tmat_width = tmat_.dims()[1]; + size_t input_width = input_->dims()[1]; + size_t weight_width = weight_.dims()[1]; + auto tmat_value = tmat_.data(); + auto weight_value = weight_.data(); + auto input_value = input_->data(); + + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + + for (size_t k = 0; k < input_width; ++k) { + input_value[input_width * i + k] += + tmat_value[i * tmat_width + j] * + weight_value[weight_width * index + k]; + } } } } +}; + +template +void MatrixBitCodeFunctor::MulGradError(const framework::Tensor &tmat, + const framework::Tensor &weight, + framework::Tensor *input) { + MatrixBitCodeFunctorMulGradError func(tmat, weight, input); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::Sub(framework::Tensor* tmat) { - size_t num_samples = tmat->dims()[0]; - size_t o_width = tmat->dims()[1]; - auto* tmat_data = tmat->data(); - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - if (code->calc_bit(j)) { - tmat_data[i * o_width + j] -= 1; +struct MatrixBitCodeFunctorSub : public boost::static_visitor { + framework::Tensor *tmat_; + + explicit MatrixBitCodeFunctorSub(framework::Tensor *tmat) : tmat_(tmat) {} + + template + void operator()(const CodeTable &code_table) { + size_t num_samples = tmat_->dims()[0]; + size_t o_width = tmat_->dims()[1]; + auto *tmat_data = tmat_->data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + if (code.calc_bit(j)) { + tmat_data[i * o_width + j] -= 1; + } } } } +}; + +template +void MatrixBitCodeFunctor::Sub(framework::Tensor *tmat) { + MatrixBitCodeFunctorSub func(tmat); + code_table_.apply_visitor(func); } template class MatrixBitCodeFunctor; diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index cf43ad9d44..01e4889d34 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/variant.h" #if defined(_WIN32) #include @@ -99,24 +100,7 @@ inline int clz(const T& value) { inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); } #endif // !_WIN32 -// set a code interface to create multiple code -class Code { - public: - virtual ~Code() {} - virtual size_t calc_index(int bit) const = 0; - virtual bool calc_bit(int bit) const = 0; - virtual int get_length() const = 0; -}; -// set a CodeTable interface to create multiple code table -class CodeTable { - public: - virtual Code* get_code(int64_t code) const = 0; - virtual size_t size() const = 0; - virtual int get_max_code_length() const = 0; - virtual ~CodeTable() {} -}; - -class SimpleCode : public Code { +class SimpleCode { public: SimpleCode(size_t code, size_t num_classes, const int64_t* ids) : c_(static_cast(ids[code]) + num_classes) {} @@ -138,7 +122,7 @@ class SimpleCode : public Code { }; template -class CustomCode : public Code { +class CustomCode { public: CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, const int64_t* ids, int index) { @@ -155,11 +139,11 @@ class CustomCode : public Code { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - size_t calc_index(int bit) const override { return ptable_data_[bit]; } - bool calc_bit(int bit) const override { return pcode_data_[bit]; } + size_t calc_index(int bit) const { return ptable_data_[bit]; } + bool calc_bit(int bit) const { return pcode_data_[bit]; } // NOTE: this function is not thread-safe. - int get_length() const override { + int get_length() const { if (length_ < 0) { auto len = seq_len_; length_ = @@ -177,46 +161,32 @@ class CustomCode : public Code { mutable int length_{-1}; }; -class SimpleCodeTable : public CodeTable { +class SimpleCodeTable { public: SimpleCodeTable(size_t num_classes, const int64_t* ids) : num_classes_(num_classes), ids_(ids) {} - Code* get_code(int64_t code) const { - auto it = codes_.find(code); - if (it != codes_.end()) { - return it->second.get(); - } - auto* result = new SimpleCode(code, num_classes_, ids_); - codes_.emplace(code, std::unique_ptr(result)); - return result; + SimpleCode get_code(int64_t code) const { + return SimpleCode(code, num_classes_, ids_); } size_t size() const { return num_classes_; } int get_max_code_length() const { return FindLastSet(num_classes_ - 1); } private: - mutable std::map> codes_; - size_t num_classes_; const int64_t* ids_; }; template -class CustomCodeTable : public CodeTable { +class CustomCodeTable { public: CustomCodeTable(const framework::Tensor& ptable, const framework::Tensor& pcode, const int64_t* ids) : ptable_(ptable), pcode_(pcode), ids_(ids) {} - Code* get_code(int64_t code) const { - auto it = codes_.find(code); - if (it != codes_.end()) { - return it->second.get(); - } - auto* result = new CustomCode(ptable_, pcode_, ids_, code); - codes_.emplace(code, std::unique_ptr(result)); - return result; + CustomCode get_code(int64_t code) const { + return CustomCode(ptable_, pcode_, ids_, code); } size_t size() const { return static_cast(ptable_.dims()[1]); } @@ -225,25 +195,26 @@ class CustomCodeTable : public CodeTable { } private: - mutable std::unordered_map> codes_; const framework::Tensor& ptable_; const framework::Tensor& pcode_; const int64_t* ids_; }; +using CodeTable = boost::variant>; + template class MatrixBitCodeFunctor { public: MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids) : num_classes_(num_classes), ids_(ids), - code_table_(new SimpleCodeTable(num_classes, ids)) {} + code_table_(SimpleCodeTable(num_classes, ids)) {} MatrixBitCodeFunctor(const framework::Tensor& ptable, const framework::Tensor& pcode, const int64_t* ids) : num_classes_(static_cast(ptable.dims()[1])), ids_(ids), - code_table_(new CustomCodeTable(ptable, pcode, ids)) {} + code_table_(CustomCodeTable(ptable, pcode, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ @@ -293,7 +264,7 @@ class MatrixBitCodeFunctor { size_t num_classes_; const int64_t* ids_; - std::unique_ptr code_table_; + CodeTable code_table_; }; } // namespace math } // namespace operators From bf951fa737d62b27f1e50b2b3019815b1a6efda9 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 13 Dec 2018 06:48:34 +0000 Subject: [PATCH 288/719] add refer vrelu, videntity, vexp, vsigmoid, vtanh and test and benchmark --- paddle/fluid/operators/jit/benchmark.cc | 77 ++++++++++++++ paddle/fluid/operators/jit/helper.cc | 30 +++--- paddle/fluid/operators/jit/kernel_base.h | 13 ++- .../fluid/operators/jit/refer/CMakeLists.txt | 5 + paddle/fluid/operators/jit/refer/refer.cc | 6 ++ paddle/fluid/operators/jit/refer/refer.h | 51 +++++++++ paddle/fluid/operators/jit/test.cc | 100 ++++++++++++++++++ .../fluid/operators/math/jit_kernel_refer.h | 40 ------- 8 files changed, 267 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 2ad87e414b..a7e5eb6cf4 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -201,6 +201,77 @@ void BenchAXYNKernel() { } } +// return this function avg time +template +double BenchXYNFunc(const typename KernelTuples::func_type tgt, + const std::vector& x, + std::vector& y) { // NOLINT + const T* x_data = x.data(); + T* y_data = y.data(); + const int d = y.size(); + for (int i = 0; i < FLAGS_burning; ++i) { + tgt(x_data, y_data, d); + } + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeat; ++i) { + tgt(x_data, y_data, d); + } + auto end = GetCurrentUS(); + return (end - start) / FLAGS_repeat; +} + +template +void BenchXYNKernel() { + namespace jit = paddle::operators::jit; + for (int d : TestSizes()) { + std::vector> infos; + std::vector x(d), y(d); + RandomVec(d, x.data()); + // test refer + auto refer = jit::GetRefer>(); + if (refer) { + auto res = BenchXYNFunc>(refer, x, y); + infos.push_back(std::make_pair("Refer", res)); + } + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(d); + if (jitcode) { + auto res = BenchXYNFunc>(jitcode, x, y); + infos.push_back(std::make_pair("JitCode", res)); + } + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = + dynamic_cast>*>(impl.get()); + if (i && i->UseMe(d)) { + auto more = i->GetFunc(); + auto res = BenchXYNFunc>(more, x, y); + infos.push_back(std::make_pair("More", res)); + } + } + } + // Test result from Get function + auto tgt = jit::Get, PlaceType>(d); + if (!tgt) { + LOG(ERROR) << "Target can not be empty!"; + } + auto res = BenchXYNFunc>(tgt, x, y); + infos.push_back(std::make_pair("Target", res)); + // print + std::ostringstream loginfos; + loginfos << "Kernel Type: " << jit::to_string(KT) << ", size " << d << ": "; + for (auto pair : infos) { + loginfos << pair.first << " takes " << pair.second << " us; "; + } + LOG(INFO) << loginfos.str(); + } +} + // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] // Options: @@ -222,4 +293,10 @@ int main(int argc, char* argv[]) { BenchAXYNKernel(); BenchAXYNKernel(); + + BenchXYNKernel(); + BenchXYNKernel(); + BenchXYNKernel(); + BenchXYNKernel(); + BenchXYNKernel(); } diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index c9aaffb8b8..c010b64c9c 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -19,28 +19,30 @@ namespace paddle { namespace operators { namespace jit { +#define ONE_CASE(key) \ + case key: \ + return #key + const char* to_string(KernelType kt) { switch (kt) { - case vmul: - return "vmul"; - case vadd: - return "vadd"; - case vaddrelu: - return "vaddrelu"; - case vsub: - return "vsub"; - case vscal: - return "vscal"; - case vexp: - return "vexp"; - case vaddbias: - return "vaddbias"; + ONE_CASE(vmul); + ONE_CASE(vadd); + ONE_CASE(vaddrelu); + ONE_CASE(vsub); + ONE_CASE(vscal); + ONE_CASE(vaddbias); + ONE_CASE(vrelu); + ONE_CASE(videntity); + ONE_CASE(vexp); + ONE_CASE(vsigmoid); + ONE_CASE(vtanh); default: PADDLE_THROW("Not support type: %d", kt); return "NOT JITKernel"; } return nullptr; } +#undef ONE_CASE } // namespace jit } // namespace operators diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 74ecf3dade..29b881b754 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -26,7 +26,11 @@ typedef enum { vsub, vscal, vaddbias, - vexp + vrelu, + videntity, + vexp, + vsigmoid, + vtanh } KernelType; template @@ -39,6 +43,13 @@ struct XYZNTuples { template struct AXYNTuples : public XYZNTuples {}; +template +struct XYNTuples { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, T*, int); +}; + // Just for adding to kernel pool without template class Kernel { public: diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index afe3f6ca0f..dc07ddb914 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -13,3 +13,8 @@ USE_JITKERNEL_REFER(vaddrelu) USE_JITKERNEL_REFER(vsub) USE_JITKERNEL_REFER(vscal) USE_JITKERNEL_REFER(vaddbias) +USE_JITKERNEL_REFER(vrelu) +USE_JITKERNEL_REFER(videntity) +USE_JITKERNEL_REFER(vexp) +USE_JITKERNEL_REFER(vsigmoid) +USE_JITKERNEL_REFER(vtanh) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 4e9c530344..f716ca89c5 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -29,4 +29,10 @@ REGISTER_REFER_KERNEL(vsub, VSub); REGISTER_REFER_KERNEL(vscal, VScal); REGISTER_REFER_KERNEL(vaddbias, VAddBias); +REGISTER_REFER_KERNEL(vrelu, VRelu); +REGISTER_REFER_KERNEL(videntity, VIdentity); +REGISTER_REFER_KERNEL(vexp, VExp); +REGISTER_REFER_KERNEL(vsigmoid, VSigmoid); +REGISTER_REFER_KERNEL(vtanh, VTanh); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 32ac5bf2d7..7ef60a2d53 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -66,6 +66,50 @@ void VAddBias(const T* a, const T* x, T* y, int n) { } } +template +void VRelu(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +template +inline void VIdentity(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = x[i]; + } +} + +template +void VExp(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +template +void VSigmoid(const T* x, T* y, int n) { + // y = 1 / (1 + e^-x) + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); + } +} + +template +void VTanh(const T* x, T* y, int n) { + // y = 2 * sigmoid(2x) - 1 + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * x[i]; + } + VSigmoid(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * y[i] - static_cast(1); + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -83,6 +127,13 @@ DECLARE_REFER_KERNEL(VSub, XYZNTuples); DECLARE_REFER_KERNEL(VScal, AXYNTuples); DECLARE_REFER_KERNEL(VAddBias, AXYNTuples); +// const T* x, T* y, int n +DECLARE_REFER_KERNEL(VRelu, XYNTuples); +DECLARE_REFER_KERNEL(VIdentity, XYNTuples); +DECLARE_REFER_KERNEL(VExp, XYNTuples); +DECLARE_REFER_KERNEL(VSigmoid, XYNTuples); +DECLARE_REFER_KERNEL(VTanh, XYNTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index ea2cb7b7a4..4c9b853b6e 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -250,6 +250,106 @@ TEST(JITKernel, vaddbias) { TestAXYNKernel(); } +template +void TestXYNFunc(const typename KernelTuples::func_type tgt, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); +} + +template +void TestXYNKernel() { + namespace jit = paddle::operators::jit; + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + for (int d : TestSizes()) { + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + + std::vector x(d), yref(d); + std::vector xinp(d); // inplace test + RandomVec(d, x.data()); + std::copy(x.begin(), x.end(), xinp.begin()); + + const T* x_data = x.data(); + T* yref_data = yref.data(); + T* xinp_data = xinp.data(); + // test refer code inplace + ref(x_data, yref_data, d); + ref(xinp_data, xinp_data, d); + ExpectEQ(xinp_data, yref_data, d); + + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(d); + if (jitcode) { + VLOG(10) << "Test Jitcode Kernel, size: " << d; + TestXYNFunc>(jitcode, x, yref); + } + + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = + dynamic_cast>*>(impl.get()); + if (i && i->UseMe(d)) { + auto more = i->GetFunc(); + VLOG(10) << "Test More Kernel, size: " << d; + TestXYNFunc>(more, x, yref); + } + } + } + // Test result from Get function + VLOG(10) << "Test Get function, size: " << d; + auto tgt = jit::Get, PlaceType>(d); + TestXYNFunc>(tgt, x, yref); + } +} + +TEST(JITKernel, vrelu) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, videntity) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, vexp) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, vsigmoid) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, vtanh) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + TEST(JITKernel, pool) { // TODO(TJ): add some test } diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index b5ee07e748..a03e851de5 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -24,46 +24,6 @@ namespace math { namespace jitkernel { namespace refer { -template -void VRelu(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? x[i] : 0; - } -} - -template -inline void VIdentity(const T* x, T* y, int n) {} - -template -void VExp(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - -template -void VSigmoid(const T* x, T* y, int n) { - // y = 1 / (1 + e^-x) - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); - y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); - } -} - -template -void VTanh(const T* x, T* y, int n) { - // y = 2 * sigmoid(2x) - 1 - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoid(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} - template void (*getActFunc(const std::string& type))(const T*, T*, int) { // NOLINT if (type == "sigmoid") { From 1bac8f918c9fca90db7e95dd2f27d7946ffebc40 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Dec 2018 15:00:20 +0800 Subject: [PATCH 289/719] fix api.spec test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index db7b0d34a3..6c6026911b 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -26,8 +26,8 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) -paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.ExecutionStrategy.ExecutorType.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy.ExecutorType, arg0: int) -> None +paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None From f81957a7531d7cdb4e4f0a96c0d0f5f8752c92b7 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Thu, 13 Dec 2018 15:12:37 +0800 Subject: [PATCH 290/719] refine cmake for pslib & pre_define --- CMakeLists.txt | 2 +- cmake/configure.cmake | 4 ++++ cmake/external/libmct.cmake | 13 +++++++------ cmake/external/pslib_brpc.cmake | 15 ++++++++------- paddle/fluid/framework/CMakeLists.txt | 7 ++++++- paddle/fluid/framework/async_executor.cc | 14 ++++++++++++++ paddle/fluid/framework/async_executor.h | 11 +++++++---- paddle/fluid/framework/executor_thread_worker.cc | 6 +++++- paddle/fluid/framework/executor_thread_worker.h | 12 ++++++++++-- paddle/fluid/pybind/async_executor_py.cc | 11 +++++++++++ 10 files changed, 73 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c3b4349c8c..68eb8718ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -222,7 +222,7 @@ if(WITH_PSLIB) include(external/libmct) include(external/pslib_brpc) include(external/pslib) -endif() +endif(WITH_PSLIB) if(WITH_DISTRIBUTE) if(WITH_GRPC) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 4e17ddee73..03076c44c3 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -84,6 +84,10 @@ if(NOT WITH_GOLANG) add_definitions(-DPADDLE_WITHOUT_GOLANG) endif(NOT WITH_GOLANG) +if(WITH_PSLIB) + add_definitions(-DPADDLE_WITH_PSLIB) +endif() + if(WITH_GPU) add_definitions(-DPADDLE_WITH_CUDA) diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake index 239183cb6d..27cff8cfb6 100644 --- a/cmake/external/libmct.cmake +++ b/cmake/external/libmct.cmake @@ -29,10 +29,11 @@ INCLUDE(ExternalProject) SET(LIBMCT_PROJECT "extern_libmct") IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL)) MESSAGE(STATUS "use pre defined download url") - SET(LIBMCT_VER "libmct" CACHE STRING "" FORCE) #todo libmct version - SET(LIBMCT_URL "http://bjyz-heqiaozhi-dev-new.epc.baidu.com:8000/${LIBMCT_VER}.tar.gz" CACHE STRING "" FORCE) #todo libmct url + SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE) + SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE) + SET(LIBMCT_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${LIBMCT_VER}/${LIBMCT_NAME}.tar.gz" CACHE STRING "" FORCE) ENDIF() -MESSAGE(STATUS "LIBMCT_VER: ${LIBMCT_VER}, LIBMCT_URL: ${LIBMCT_URL}") +MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}") SET(LIBMCT_SOURCE_DIR "${THIRD_PARTY_PATH}/libmct") SET(LIBMCT_DOWNLOAD_DIR "${LIBMCT_SOURCE_DIR}/src/${LIBMCT_PROJECT}") SET(LIBMCT_DST_DIR "libmct") @@ -47,7 +48,7 @@ INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR}) FILE(WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt "PROJECT(LIBMCT)\n" "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${LIBMCT_VER}/include ${LIBMCT_VER}/lib \n" + "install(DIRECTORY ${LIBMCT_NAME}/include ${LIBMCT_NAME}/lib \n" " DESTINATION ${LIBMCT_DST_DIR})\n") ExternalProject_Add( @@ -55,8 +56,8 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${LIBMCT_SOURCE_DIR} DOWNLOAD_DIR ${LIBMCT_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_VER}.tar.gz - && tar zxvf ${LIBMCT_VER}.tar.gz + DOWNLOAD_COMMAND wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz + && tar zxvf ${LIBMCT_NAME}.tar.gz DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT} diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake index 92019eef26..7ff5a8aca1 100644 --- a/cmake/external/pslib_brpc.cmake +++ b/cmake/external/pslib_brpc.cmake @@ -27,12 +27,13 @@ ENDIF() INCLUDE(ExternalProject) SET(PSLIB_BRPC_PROJECT "extern_pslib_brpc") -IF((NOT DEFINED PSLIB_BRPC_VER) OR (NOT DEFINED PSLIB_BRPC_URL)) +IF((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL)) MESSAGE(STATUS "use pre defined download url") - SET(PSLIB_BRPC_VER "pslib_brpc" CACHE STRING "" FORCE) #todo pslib version - SET(PSLIB_BRPC_URL "http://bjyz-heqiaozhi-dev-new.epc.baidu.com:8000/${PSLIB_BRPC_VER}.tar.gz" CACHE STRING "" FORCE) #todo pslib_brpc url + SET(PSLIB_BRPC_VER "0.1.0" CACHE STRING "" FORCE) + SET(PSLIB_BRPC_NAME "pslib_brpc" CACHE STRING "" FORCE) + SET(PSLIB_BRPC_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_BRPC_VER}/${PSLIB_BRPC_NAME}.tar.gz" CACHE STRING "" FORCE) ENDIF() -MESSAGE(STATUS "PSLIB_BRPC_VER: ${PSLIB_BRPC_VER}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}") +MESSAGE(STATUS "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}") SET(PSLIB_BRPC_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib_brpc") SET(PSLIB_BRPC_DOWNLOAD_DIR "${PSLIB_BRPC_SOURCE_DIR}/src/${PSLIB_BRPC_PROJECT}") SET(PSLIB_BRPC_DST_DIR "pslib_brpc") @@ -50,7 +51,7 @@ INCLUDE_DIRECTORIES(${PSLIB_BRPC_INC_DIR}) FILE(WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt "PROJECT(PSLIB_BRPC)\n" "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${PSLIB_BRPC_VER}/include ${PSLIB_BRPC_VER}/lib \n" + "install(DIRECTORY ${PSLIB_BRPC_NAME}/include ${PSLIB_BRPC_NAME}/lib \n" " DESTINATION ${PSLIB_BRPC_DST_DIR})\n") ExternalProject_Add( @@ -58,8 +59,8 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${PSLIB_BRPC_SOURCE_DIR} DOWNLOAD_DIR ${PSLIB_BRPC_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_VER}.tar.gz - && tar zxvf ${PSLIB_BRPC_VER}.tar.gz + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_NAME}.tar.gz + && tar zxvf ${PSLIB_BRPC_NAME}.tar.gz DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT} diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6fdc73e93a..f3d66cd883 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -180,7 +180,12 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph build_strategy fast_threaded_ssa_graph_executor variable_helper) -cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib) +if(WITH_PSLIB) + cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib) +else() + cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) +endif(WITH_PSLIB) + cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 8231aff142..fe6488f4b6 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -29,7 +29,9 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" +#ifdef PADDLE_WITH_PSLIB #include "pslib.h" +#endif namespace paddle { namespace framework { @@ -48,9 +50,11 @@ void AsyncExecutor::CreateThreads( worker->SetDataFeed(reader); worker->SetFetchVarNames(fetch_var_names); worker->BindingDataFeedMemory(); +#ifdef PADDLE_WITH_PSLIB worker->SetPSlibPtr(_pslib_ptr); worker->SetPullDenseThread(_pull_dense_thread); worker->SetParamConfig(&_param_config); +#endif } void PrepareReaders(std::vector>& readers, // NOLINT @@ -64,6 +68,7 @@ void PrepareReaders(std::vector>& readers, // NOLINT readers[0]->SetFileList(filelist); } +#ifdef PADDLE_WITH_PSLIB void AsyncExecutor::InitServer(const std::string& dist_desc, int index) { _pslib_ptr = std::shared_ptr( @@ -231,6 +236,7 @@ void AsyncExecutor::PrepareDenseThread(const std::string& mode) { _pull_dense_thread->start(); } } +#endif void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, const std::string& data_feed_desc_str, @@ -279,15 +285,21 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, // todo: should be factory method for creating datafeed std::vector> readers; PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist); +#ifdef PADDLE_WITH_PSLIB PrepareDenseThread(mode); +#endif std::vector> workers; workers.resize(actual_thread_num); for (auto& worker : workers) { +#ifdef PADDLE_WITH_PSLIB if (mode == "mpi") { worker.reset(new AsyncExecutorThreadWorker); } else { worker.reset(new ExecutorThreadWorker); } +#else + worker.reset(new ExecutorThreadWorker); +#endif } // prepare thread resource here @@ -306,9 +318,11 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, for (auto& th : threads) { th.join(); } +#ifdef PADDLE_WITH_PSLIB if (mode == "mpi") { _pull_dense_thread->stop(); } +#endif root_scope_->DropKids(); return; diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 16540c2df2..d6f16d9133 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -64,6 +64,7 @@ class AsyncExecutor { const std::vector& fetch_names, const std::string& mode, const bool debug = false); +#ifdef PADDLE_WITH_PSLIB void InitServer(const std::string& dist_desc, int index); void InitWorker( const std::string& dist_desc, @@ -75,7 +76,7 @@ class AsyncExecutor { void InitModel(); void SaveModel(const std::string& path); void InitParamConfig(); - +#endif private: void CreateThreads(ExecutorThreadWorker* worker, const ProgramDesc& main_program, @@ -83,16 +84,18 @@ class AsyncExecutor { const std::vector& fetch_var_names, Scope* root_scope, const int thread_index, const bool debug); +#ifdef PADDLE_WITH_PSLIB void PrepareDenseThread(const std::string& mode); - +#endif public: +#ifdef PADDLE_WITH_PSLIB std::shared_ptr _pslib_ptr; std::shared_ptr _pull_dense_thread; + AsyncWorkerParamConfig _param_config; +#endif Scope* root_scope_; platform::Place place_; - AsyncWorkerParamConfig _param_config; - private: int actual_thread_num; diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index df15a4d293..a58c269220 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -31,6 +31,7 @@ limitations under the License. */ namespace paddle { namespace framework { +#ifdef PADDLE_WITH_PSLIB int DensePullThread::start() { _running = true; _t = std::thread(&DensePullThread::run, this); @@ -112,7 +113,8 @@ void DensePullThread::increase_thread_version( std::lock_guard lock(_mutex_for_version); _training_versions[table_id][thread_id]++; } - +#endif + void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) { auto& block = program.Block(0); op_names_.clear(); @@ -302,6 +304,7 @@ void ExecutorThreadWorker::SetRootScope(Scope* g_scope) { root_scope_ = g_scope; } +#ifdef PADDLE_WITH_PSLIB // AsyncExecutor void AsyncExecutorThreadWorker::TrainFiles() { SetDevice(); @@ -659,6 +662,7 @@ void AsyncExecutorThreadWorker::check_pull_push_memory( } } } +#endif } // einit_modelnd namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 93373b1d2e..c23eb09470 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -25,14 +25,16 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#ifdef PADDLE_WITH_PSLIB #include "pslib.h" +#endif namespace paddle { namespace framework { -const static uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; - void CreateTensor(Variable* var, proto::VarType::Type var_type); +#ifdef PADDLE_WITH_PSLIB +const static uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; struct AsyncWorkerParamConfig { int slot_dim; @@ -130,6 +132,8 @@ class DensePullThread { float _total_batch_num = 0; }; +#endif + class ExecutorThreadWorker { public: ExecutorThreadWorker() @@ -154,12 +158,14 @@ class ExecutorThreadWorker { virtual void TrainFiles(); // set fetch variable names from python interface assigned by users void SetFetchVarNames(const std::vector& fetch_var_names); +#ifdef PADDLE_WITH_PSLIB virtual void SetPSlibPtr( std::shared_ptr pslib_ptr) {}; virtual void SetPullDenseThread( std::shared_ptr dpt) {} virtual void SetParamConfig( AsyncWorkerParamConfig * param_config) {} +#endif private: void CreateThreadScope(const framework::ProgramDesc& program); @@ -188,6 +194,7 @@ class ExecutorThreadWorker { bool debug_; }; +#ifdef PADDLE_WITH_PSLIB class AsyncExecutorThreadWorker: public ExecutorThreadWorker { public: AsyncExecutorThreadWorker() {} @@ -238,6 +245,7 @@ class AsyncExecutorThreadWorker: public ExecutorThreadWorker { AsyncWorkerParamConfig* _param_config; }; +#endif } // namespace framework } // namespace paddle diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc index 8dfba0d269..71a0e256e4 100644 --- a/paddle/fluid/pybind/async_executor_py.cc +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -41,6 +41,7 @@ namespace pd = paddle::framework; namespace paddle { namespace pybind { using set_name_func = void (pd::DataFeedDesc::*)(const std::string&); +#ifdef PADDLE_WITH_PSLIB void BindAsyncExecutor(py::module* m) { py::class_(*m, "AsyncExecutor") .def(py::init([](framework::Scope* scope, const platform::Place& place) { @@ -56,5 +57,15 @@ void BindAsyncExecutor(py::module* m) { .def("init_model", &framework::AsyncExecutor::InitModel) .def("save_model", &framework::AsyncExecutor::SaveModel); } // end BindAsyncExecutor +#else +void BindAsyncExecutor(py::module* m) { + py::class_(*m, "AsyncExecutor") + .def(py::init([](framework::Scope* scope, const platform::Place& place) { + return std::unique_ptr( + new framework::AsyncExecutor(scope, place)); + })) + .def("run_from_files", &framework::AsyncExecutor::RunFromFile) +} // end BindAsyncExecutor +#endif } // end namespace pybind } // end namespace paddle From aa38fc4ce5cb73e01b614ff57fae9553dcf30abf Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 13 Dec 2018 15:20:40 +0800 Subject: [PATCH 291/719] Fix compile test=develop --- paddle/fluid/inference/tests/api/tester_helper.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 8209a049f4..4c8bce4600 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -373,7 +373,7 @@ static bool CompareTensorData(const framework::LoDTensor &a, } for (size_t i = 0; i < a_size; i++) { - if (a.type() == typeid(float)) { + if (a.type() == framework::proto::VarType::FP32) { const auto *a_data = a.data(); const auto *b_data = b.data(); if (std::abs(a_data[i] - b_data[i]) > 1e-3) { @@ -382,7 +382,7 @@ static bool CompareTensorData(const framework::LoDTensor &a, b_data[i]); return false; } - } else if (a.type() == typeid(int64_t)) { + } else if (a.type() == framework::proto::VarType::INT64) { const auto *a_data = a.data(); const auto *b_data = b.data(); if (std::abs(a_data[i] - b_data[i]) > 1e-3) { From 95b887c4f26c794e2b01daa5c97b32582de7c56a Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Thu, 13 Dec 2018 15:30:31 +0800 Subject: [PATCH 292/719] remove commit --- paddle/fluid/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index d980b36d9b..6b526f0103 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -1,7 +1,6 @@ add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) -#add_subdirectory(distributed) add_subdirectory(operators) add_subdirectory(string) add_subdirectory(recordio) From c9b799896e6b78a4248cd8c9288ab6adacf628ad Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 15:33:45 +0800 Subject: [PATCH 293/719] fix tag in async_executor --- paddle/fluid/framework/async_executor.cc | 238 ++++++++--------- paddle/fluid/framework/async_executor.h | 5 +- .../fluid/framework/executor_thread_worker.cc | 249 +++++++++--------- .../fluid/framework/executor_thread_worker.h | 178 ++++++------- 4 files changed, 336 insertions(+), 334 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index fe6488f4b6..0fe7f3bd5c 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -102,139 +102,139 @@ void AsyncExecutor::GatherServers( } void AsyncExecutor::InitParamConfig() { - for (int i = 0; i < - _pslib_ptr->get_param()->server_param().\ - downpour_server_param().\ - downpour_table_param_size(); - ++i) { - if (_pslib_ptr->get_param()->server_param().\ - downpour_server_param().downpour_table_param(i).\ - table_class().find("SparseTable") != -1) { - _param_config.fea_dim = _pslib_ptr->get_param()->server_param().\ - downpour_server_param().\ - downpour_table_param(i).\ - accessor().fea_dim(); - break; - } + for (int i = 0; i < + _pslib_ptr->get_param()->server_param(). \ + downpour_server_param(). \ + downpour_table_param_size(); + ++i) { + if (_pslib_ptr->get_param()->server_param(). \ + downpour_server_param().downpour_table_param(i). \ + table_class().find("SparseTable") != -1) { + _param_config.fea_dim = _pslib_ptr->get_param()->server_param(). \ + downpour_server_param(). \ + downpour_table_param(i). \ + accessor().fea_dim(); + break; } - _param_config.slot_dim = _param_config.fea_dim - 2; - _param_config.tmp_push_dense_wait_times = static_cast( - _pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); - _param_config.tmp_push_sparse_wait_times = static_cast( - _pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); - - for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); - ++t) { - _param_config.skip_op.push_back( - _pslib_ptr->get_param()->trainer_param().skip_op(t)); + } + _param_config.slot_dim = _param_config.fea_dim - 2; + _param_config.tmp_push_dense_wait_times = static_cast( + _pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); + _param_config.tmp_push_sparse_wait_times = static_cast( + _pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); + ++t) { + _param_config.skip_op.push_back( + _pslib_ptr->get_param()->trainer_param().skip_op(t)); + } + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); + ++t) { + auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); + std::vector tmp_sparse_variable_name; + for (int i = 0u; i < table.slot_value_size(); ++i) { + tmp_sparse_variable_name.push_back(table.slot_value(i)); + _param_config.slot_alias_to_table[table.slot_key(i)] = + table.table_id(); } - - for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); - ++t) { - auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); - std::vector tmp_sparse_variable_name; - for (int i = 0u; i < table.slot_value_size(); ++i) { - tmp_sparse_variable_name.push_back(table.slot_value(i)); - _param_config.slot_alias_to_table[table.slot_key(i)] = - table.table_id(); - } - std::vector tmp_sparse_gradient_variable_name; - for (auto i = 0u; i < table.slot_gradient_size(); ++i) { - tmp_sparse_gradient_variable_name.push_back( - table.slot_gradient(i)); - } - _param_config.slot_input_vec[table.table_id()] = - std::move(tmp_sparse_variable_name); - _param_config.gradient_var[table.table_id()] = - std::move(tmp_sparse_gradient_variable_name); - _param_config.sparse_table_id.push_back(table.table_id()); + std::vector tmp_sparse_gradient_variable_name; + for (auto i = 0u; i < table.slot_gradient_size(); ++i) { + tmp_sparse_gradient_variable_name.push_back( + table.slot_gradient(i)); } - - for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); - ++t) { - auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t); - std::vector tmp_dense_variable_name; - for (int i = 0u; i < table.dense_variable_name_size(); ++i) { - tmp_dense_variable_name.push_back(table.dense_variable_name(i)); - } - std::vector tmp_dense_gradient_variable_name; - for (auto i = 0u; i < table.dense_gradient_variable_name_size(); ++i) { - tmp_dense_gradient_variable_name.push_back( - table.dense_gradient_variable_name(i)); - } - _param_config.dense_variable_name[table.table_id()] = - std::move(tmp_dense_variable_name); - _param_config.dense_gradient_variable_name[table.table_id()] = - std::move(tmp_dense_gradient_variable_name); - _param_config.dense_table_id.push_back(table.table_id()); - _param_config.dense_table_size.push_back(table.fea_dim()); + _param_config.slot_input_vec[table.table_id()] = + std::move(tmp_sparse_variable_name); + _param_config.gradient_var[table.table_id()] = + std::move(tmp_sparse_gradient_variable_name); + _param_config.sparse_table_id.push_back(table.table_id()); + } + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); + ++t) { + auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t); + std::vector tmp_dense_variable_name; + for (int i = 0u; i < table.dense_variable_name_size(); ++i) { + tmp_dense_variable_name.push_back(table.dense_variable_name(i)); + } + std::vector tmp_dense_gradient_variable_name; + for (auto i = 0u; i < table.dense_gradient_variable_name_size(); ++i) { + tmp_dense_gradient_variable_name.push_back( + table.dense_gradient_variable_name(i)); } + _param_config.dense_variable_name[table.table_id()] = + std::move(tmp_dense_variable_name); + _param_config.dense_gradient_variable_name[table.table_id()] = + std::move(tmp_dense_gradient_variable_name); + _param_config.dense_table_id.push_back(table.table_id()); + _param_config.dense_table_size.push_back(table.fea_dim()); + } } void AsyncExecutor::InitModel() { - for (auto table_id : _param_config.dense_table_id) { - std::vector regions; - for (auto& t : _param_config.dense_variable_name[table_id]) { - Variable* var = root_scope_->FindVar(t); - CHECK(var != nullptr) << "var[" << t << "] not found"; - LoDTensor* tensor = var->GetMutable(); - - float* g = tensor->data(); - CHECK(g != nullptr) << "var[" << t << "] value not initialized"; - - float init_range = 0.2; - int rown = tensor->dims()[0]; - init_range /= sqrt(rown); - - std::normal_distribution ndistr(0.0, 1.0); - for (auto i = 0u; i < tensor->numel(); ++i) { - g[i] = ndistr(local_random_engine()) * init_range; - } - - paddle::ps::Region reg(g, tensor->numel()); - regions.emplace_back(std::move(reg)); - } - - auto push_status = - _pslib_ptr->_worker_ptr->push_dense_param( - regions.data(), regions.size(), table_id); - push_status.wait(); - auto status = push_status.get(); - if (status != 0) { - LOG(FATAL) << "push dense param failed, status[" << status << "]"; - exit(-1); - } + for (auto table_id : _param_config.dense_table_id) { + std::vector regions; + for (auto& t : _param_config.dense_variable_name[table_id]) { + Variable* var = root_scope_->FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + + float* g = tensor->data(); + CHECK(g != nullptr) << "var[" << t << "] value not initialized"; + + float init_range = 0.2; + int rown = tensor->dims()[0]; + init_range /= sqrt(rown); + + std::normal_distribution ndistr(0.0, 1.0); + for (auto i = 0u; i < tensor->numel(); ++i) { + g[i] = ndistr(local_random_engine()) * init_range; + } + + paddle::ps::Region reg(g, tensor->numel()); + regions.emplace_back(std::move(reg)); } + + auto push_status = + _pslib_ptr->_worker_ptr->push_dense_param( + regions.data(), regions.size(), table_id); + push_status.wait(); + auto status = push_status.get(); + if (status != 0) { + LOG(FATAL) << "push dense param failed, status[" << status << "]"; + exit(-1); + } + } } void AsyncExecutor::SaveModel(const std::string& path) { - auto ret = _pslib_ptr->_worker_ptr->flush(); - ret.wait(); - ret = _pslib_ptr->_worker_ptr->save(path, 0); - ret.wait(); - int32_t feasign_cnt = ret.get(); - if (feasign_cnt == -1) { // (colourful-tree) TODO should be feasign_cnt < 0 - LOG(FATAL) << "save model failed"; - exit(-1); - } + auto ret = _pslib_ptr->_worker_ptr->flush(); + ret.wait(); + ret = _pslib_ptr->_worker_ptr->save(path, 0); + ret.wait(); + int32_t feasign_cnt = ret.get(); + if (feasign_cnt == -1) { // (colourful-tree) TODO should be feasign_cnt < 0 + LOG(FATAL) << "save model failed"; + exit(-1); + } } void AsyncExecutor::PrepareDenseThread(const std::string& mode) { - if (mode == "mpi") { - DensePullThreadParam param; - param.ps_client = _pslib_ptr->_worker_ptr;; - param.threshold = 1; - param.training_thread_num = actual_thread_num; - param.root_scope = root_scope_; - param.dense_params = &_param_config.dense_variable_name; - - _pull_dense_thread = std::shared_ptr( - new DensePullThread(param)); - _pull_dense_thread->start(); - } + if (mode == "mpi") { + DensePullThreadParam param; + param.ps_client = _pslib_ptr->_worker_ptr;; + param.threshold = 1; + param.training_thread_num = actual_thread_num; + param.root_scope = root_scope_; + param.dense_params = &_param_config.dense_variable_name; + + _pull_dense_thread = std::shared_ptr( + new DensePullThread(param)); + _pull_dense_thread->start(); + } } #endif diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index d6f16d9133..1264212641 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -45,7 +45,8 @@ inline std::default_random_engine& local_random_engine() { engine_wrapper_t() { static std::atomic x(0); std::seed_seq sseq = {x++, x++, x++, - static_cast(current_realtime() * 1000)}; + static_cast( + current_realtime() * 1000)}; engine.seed(sseq); } }; @@ -77,6 +78,7 @@ class AsyncExecutor { void SaveModel(const std::string& path); void InitParamConfig(); #endif + private: void CreateThreads(ExecutorThreadWorker* worker, const ProgramDesc& main_program, @@ -87,6 +89,7 @@ class AsyncExecutor { #ifdef PADDLE_WITH_PSLIB void PrepareDenseThread(const std::string& mode); #endif + public: #ifdef PADDLE_WITH_PSLIB std::shared_ptr _pslib_ptr; diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index a58c269220..59679842bc 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -33,87 +33,87 @@ namespace framework { #ifdef PADDLE_WITH_PSLIB int DensePullThread::start() { - _running = true; - _t = std::thread(&DensePullThread::run, this); - return 0; + _running = true; + _t = std::thread(&DensePullThread::run, this); + return 0; } void DensePullThread::run() { - while (_running) { - _pull_dense_status.resize(0); - for (auto& t : _dense_variable_name) { - if (check_update_param(t.first)) { - auto status = pull_dense(t.first); - _pull_dense_status.emplace_back(std::move(status)); - reset_thread_version(t.first); - } - } - if (_pull_dense_status.size() != 0) { - wait_all(); - } - - usleep(_sleep_time_ms * 1000); + while (_running) { + _pull_dense_status.resize(0); + for (auto& t : _dense_variable_name) { + if (check_update_param(t.first)) { + auto status = pull_dense(t.first); + _pull_dense_status.emplace_back(std::move(status)); + reset_thread_version(t.first); + } + } + if (_pull_dense_status.size() != 0) { + wait_all(); } + + usleep(_sleep_time_ms * 1000); + } } bool DensePullThread::check_update_param(uint64_t table_id) { - { - std::lock_guard lock(_mutex_for_version); - auto& version = _training_versions[table_id]; - _current_version[table_id] = - *(std::min_element(version.begin(), version.end())); - } - if (_current_version[table_id] - _last_versions[table_id] < _threshold) { - return false; - } - return true; + { + std::lock_guard lock(_mutex_for_version); + auto& version = _training_versions[table_id]; + _current_version[table_id] = + *(std::min_element(version.begin(), version.end())); + } + if (_current_version[table_id] - _last_versions[table_id] < _threshold) { + return false; + } + return true; } void DensePullThread::reset_thread_version(uint64_t table_id) { - std::lock_guard lock(_mutex_for_version); - _last_versions[table_id] = _current_version[table_id]; + std::lock_guard lock(_mutex_for_version); + _last_versions[table_id] = _current_version[table_id]; } std::future DensePullThread::pull_dense(uint64_t table_id) { - auto& regions = _regions[table_id]; - regions.clear(); - auto& variables = _dense_variable_name[table_id]; - regions.resize(variables.size()); - - for (auto i = 0u; i < variables.size(); ++i) { - auto& t = variables[i]; - Variable* var = _root_scope->FindVar(t); - LoDTensor* tensor = var->GetMutable(); - - float* w = tensor->data(); - paddle::ps::Region reg(w, tensor->numel()); - regions[i] = std::move(reg); - } - return _ps_client->pull_dense(regions.data(), regions.size(), table_id); + auto& regions = _regions[table_id]; + regions.clear(); + auto& variables = _dense_variable_name[table_id]; + regions.resize(variables.size()); + + for (auto i = 0u; i < variables.size(); ++i) { + auto& t = variables[i]; + Variable* var = _root_scope->FindVar(t); + LoDTensor* tensor = var->GetMutable(); + + float* w = tensor->data(); + paddle::ps::Region reg(w, tensor->numel()); + regions[i] = std::move(reg); + } + return _ps_client->pull_dense(regions.data(), regions.size(), table_id); } void DensePullThread::wait_all() { - for (auto& t : _pull_dense_status) { - t.wait(); - auto status = t.get(); - if (status != 0) { - LOG(WARNING) << "pull dense failed times:" << - ++_pull_dense_fail_times; - } + for (auto& t : _pull_dense_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(WARNING) << "pull dense failed times:" << + ++_pull_dense_fail_times; } - - if (_pull_dense_fail_times > 20) { - LOG(FATAL) << "pull dense failed times more than 20 times"; - exit(-1); - } - - _pull_dense_status.resize(0); + } + + if (_pull_dense_fail_times > 20) { + LOG(FATAL) << "pull dense failed times more than 20 times"; + exit(-1); + } + + _pull_dense_status.resize(0); } void DensePullThread::increase_thread_version( int thread_id, uint64_t table_id) { - std::lock_guard lock(_mutex_for_version); - _training_versions[table_id][thread_id]++; + std::lock_guard lock(_mutex_for_version); + _training_versions[table_id][thread_id]++; } -#endif +#endif void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) { auto& block = program.Block(0); @@ -336,56 +336,56 @@ void AsyncExecutorThreadWorker::TrainFiles() { void AsyncExecutorThreadWorker::SetPSlibPtr( std::shared_ptr pslib_ptr) { - _pslib_ptr = pslib_ptr; + _pslib_ptr = pslib_ptr; } void AsyncExecutorThreadWorker::SetPullDenseThread( std::shared_ptr dpt) { - _pull_dense_thread = dpt; + _pull_dense_thread = dpt; } void AsyncExecutorThreadWorker::TrainOneNetwork() { - PrepareParams(); - - for (auto& op : ops_) { - if (op->Type().find("sgd") != std::string::npos) { - continue; - } - bool need_skip = false; - for (auto t = 0u; t < _param_config->skip_op.size(); ++t) { - if (op->Type().find(_param_config->skip_op[t]) != - std::string::npos) { - need_skip = true; - break; - } - } - if (!need_skip) { - op->Run(*thread_scope_, place_); - } + PrepareParams(); + + for (auto& op : ops_) { + if (op->Type().find("sgd") != std::string::npos) { + continue; + } + bool need_skip = false; + for (auto t = 0u; t < _param_config->skip_op.size(); ++t) { + if (op->Type().find(_param_config->skip_op[t]) != + std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + op->Run(*thread_scope_, place_); } - UpdateParams(); + } + UpdateParams(); } void AsyncExecutorThreadWorker::SetParamConfig( AsyncWorkerParamConfig* param_config) { - _param_config = param_config; + _param_config = param_config; } void AsyncExecutorThreadWorker::PrepareParams() { - for (auto table_id : _param_config->sparse_table_id) { - PullSparse(table_id); - for (auto& t : _pull_sparse_status) { - t.wait(); - auto status = t.get(); - if (status != 0) { - LOG(ERROR) << "pull sparse failed, status[" << status << "]"; - exit(-1); - } - } + for (auto table_id : _param_config->sparse_table_id) { + PullSparse(table_id); + for (auto& t : _pull_sparse_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(ERROR) << "pull sparse failed, status[" << status << "]"; + exit(-1); + } } - _pull_sparse_status.resize(0); + } + _pull_sparse_status.resize(0); - for (auto table_id : _param_config->sparse_table_id) { - FillSparse(table_id); - } + for (auto table_id : _param_config->sparse_table_id) { + FillSparse(table_id); + } } void AsyncExecutorThreadWorker::UpdateParams() { @@ -426,21 +426,20 @@ void AsyncExecutorThreadWorker::UpdateParams() { } void AsyncExecutorThreadWorker::PushDense(int table_id) { - std::vector regions; - for (auto& t : _param_config->dense_gradient_variable_name[table_id]) { - Variable* var = thread_scope_->FindVar(t); - CHECK(var != nullptr) << "var[" << t << "] not found"; - LoDTensor* tensor = var->GetMutable(); - int count = tensor->numel(); - float* g = tensor->data(); - paddle::ps::Region reg(g, count); - regions.emplace_back(std::move(reg)); - } - - auto status = _pslib_ptr->_worker_ptr->push_dense( - regions.data(), regions.size(), table_id); - _push_dense_status.push_back(std::move(status)); - + std::vector regions; + for (auto& t : _param_config->dense_gradient_variable_name[table_id]) { + Variable* var = thread_scope_->FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + int count = tensor->numel(); + float* g = tensor->data(); + paddle::ps::Region reg(g, count); + regions.emplace_back(std::move(reg)); + } + + auto status = _pslib_ptr->_worker_ptr->push_dense( + regions.data(), regions.size(), table_id); + _push_dense_status.push_back(std::move(status)); } void AsyncExecutorThreadWorker::PullSparse(int table_id) { @@ -643,24 +642,24 @@ void AsyncExecutorThreadWorker::check_pull_push_memory( const std::vector& features, std::vector>& push_g, int dim) { - push_g.resize(features.size() + 1); - for (auto& t : push_g) { - t.resize(dim); - } + push_g.resize(features.size() + 1); + for (auto& t : push_g) { + t.resize(dim); + } } void AsyncExecutorThreadWorker::check_pull_push_memory( - const std::vector& features, - std::vector& push_g, - int dim) { - if (features.size() > push_g.size()) { - push_g.reserve(features.size() + 1); - auto size = features.size() - push_g.size() + 1; - for (auto i = 0u; i < size; ++i) { - float* ptr = new float[dim]; - push_g.push_back(ptr); - } + const std::vector& features, + std::vector& push_g, + int dim) { + if (features.size() > push_g.size()) { + push_g.reserve(features.size() + 1); + auto size = features.size() - push_g.size() + 1; + for (auto i = 0u; i < size; ++i) { + float* ptr = new float[dim]; + push_g.push_back(ptr); } + } } #endif diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index c23eb09470..20410b4c06 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -67,79 +67,79 @@ struct DensePullThreadParam { class DensePullThread { public: explicit DensePullThread(const DensePullThreadParam& param) : - _running(false) { - _ps_client = param.ps_client; - _threshold = param.threshold; - _thread_num = param.training_thread_num; - _root_scope = param.root_scope; - _sleep_time_ms = param.sleep_time_ms; - - for (auto& t : *param.dense_params) { - _dense_variable_name[t.first].insert( - _dense_variable_name[t.first].end(), - t.second.begin(), t.second.end()); - _training_versions[t.first].resize(_thread_num, 0); - _last_versions[t.first] = 0; - _current_version[t.first] = 0; - } + _running(false) { + _ps_client = param.ps_client; + _threshold = param.threshold; + _thread_num = param.training_thread_num; + _root_scope = param.root_scope; + _sleep_time_ms = param.sleep_time_ms; + + for (auto& t : *param.dense_params) { + _dense_variable_name[t.first].insert( + _dense_variable_name[t.first].end(), + t.second.begin(), t.second.end()); + _training_versions[t.first].resize(_thread_num, 0); + _last_versions[t.first] = 0; + _current_version[t.first] = 0; } - - int start(); - - void stop() { - if (_running) { - _running = false; - _t.join(); - } + } + + int start(); + + void stop() { + if (_running) { + _running = false; + _t.join(); } - - void increase_thread_version(int thread_id, uint64_t table_id); - void reset_thread_version(uint64_t table_id); - std::future pull_dense(uint64_t table_id); - void pull_dense2(uint64_t table_id); - void wait_all(); - + } + + void increase_thread_version(int thread_id, uint64_t table_id); + void reset_thread_version(uint64_t table_id); + std::future pull_dense(uint64_t table_id); + void pull_dense2(uint64_t table_id); + void wait_all(); + private: - void run(); - bool check_update_param(uint64_t table_id); - + void run(); + bool check_update_param(uint64_t table_id); + private: - std::shared_ptr _ps_client; - int _thread_num; - int _threshold; - int _sleep_time_ms; - Scope* _root_scope; - bool _running; - - std::map _last_versions; - std::map _current_version; - std::mutex _mutex_for_version; - std::map> _training_versions; - std::map> _dense_variable_name; - - std::thread _t; - - std::vector<::std::future> _pull_dense_status; - - std::map> _regions; - uint32_t _pull_dense_fail_times = 0; - - std::vector _base_norm_param; - std::vector _mean; - std::vector _scale; - float _squared_sum_epsilon = 1e-4; - std::mutex _mutex_for_mean_scale; - - float _total_batch_num = 0; + std::shared_ptr _ps_client; + int _thread_num; + int _threshold; + int _sleep_time_ms; + Scope* _root_scope; + bool _running; + + std::map _last_versions; + std::map _current_version; + std::mutex _mutex_for_version; + std::map> _training_versions; + std::map> _dense_variable_name; + + std::thread _t; + + std::vector<::std::future> _pull_dense_status; + + std::map> _regions; + uint32_t _pull_dense_fail_times = 0; + + std::vector _base_norm_param; + std::vector _mean; + std::vector _scale; + float _squared_sum_epsilon = 1e-4; + std::mutex _mutex_for_mean_scale; + + float _total_batch_num = 0; }; #endif class ExecutorThreadWorker { public: - ExecutorThreadWorker() - : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} +ExecutorThreadWorker() + : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} virtual ~ExecutorThreadWorker() {} - + void CreateThreadResource(const framework::ProgramDesc& program, const paddle::platform::Place& place); void SetThreadId(int tid); @@ -160,7 +160,7 @@ class ExecutorThreadWorker { void SetFetchVarNames(const std::vector& fetch_var_names); #ifdef PADDLE_WITH_PSLIB virtual void SetPSlibPtr( - std::shared_ptr pslib_ptr) {}; + std::shared_ptr pslib_ptr) {} virtual void SetPullDenseThread( std::shared_ptr dpt) {} virtual void SetParamConfig( @@ -218,32 +218,32 @@ class AsyncExecutorThreadWorker: public ExecutorThreadWorker { void check_pull_push_memory(const std::vector& features, std::vector>& push_g, int dim); - void collect_feasign_info(int table_id); - + void collect_feasign_info(int table_id); + private: - struct FeasignInfo { - uint32_t slot; - uint32_t ins; - int64_t label; - }; - - std::map> _features; - std::map> _fea_info; - std::map>> _feature_value; - std::map>> _feature_push_value; - - - std::shared_ptr _pslib_ptr; - - std::shared_ptr _pull_dense_thread; - - std::vector<::std::future> _pull_sparse_status; - std::vector<::std::future> _pull_dense_status; - std::vector<::std::future> _push_sparse_status; - std::vector<::std::future> _push_dense_status; - - AsyncWorkerParamConfig* _param_config; - + struct FeasignInfo { + uint32_t slot; + uint32_t ins; + int64_t label; + }; + + std::map> _features; + std::map> _fea_info; + std::map>> _feature_value; + std::map>> _feature_push_value; + + + std::shared_ptr _pslib_ptr; + + std::shared_ptr _pull_dense_thread; + + std::vector<::std::future> _pull_sparse_status; + std::vector<::std::future> _pull_dense_status; + std::vector<::std::future> _push_sparse_status; + std::vector<::std::future> _push_dense_status; + + AsyncWorkerParamConfig* _param_config; + }; #endif From e3c4b0dacee78d49a4701db788375b02d0916d6a Mon Sep 17 00:00:00 2001 From: SunGaofeng Date: Thu, 13 Dec 2018 15:46:12 +0800 Subject: [PATCH 294/719] this is for psroi_pool op, test=develop (#14796) * Add psroi_pool operator. --- paddle/fluid/API.spec | 1 + paddle/fluid/operators/psroi_pool_op.cc | 173 +++++++++++ paddle/fluid/operators/psroi_pool_op.cu | 294 ++++++++++++++++++ paddle/fluid/operators/psroi_pool_op.h | 253 +++++++++++++++ python/paddle/fluid/layers/nn.py | 55 ++++ .../fluid/tests/unittests/test_layers.py | 10 + .../tests/unittests/test_psroi_pool_op.py | 134 ++++++++ 7 files changed, 920 insertions(+) create mode 100644 paddle/fluid/operators/psroi_pool_op.cc create mode 100644 paddle/fluid/operators/psroi_pool_op.cu create mode 100644 paddle/fluid/operators/psroi_pool_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_psroi_pool_op.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index fd4cf92d85..8e6482ca98 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -198,6 +198,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) +paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc new file mode 100644 index 0000000000..6978d9c5dc --- /dev/null +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -0,0 +1,173 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/psroi_pool_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), " + "the input of PSROIPoolOp. " + "The format of input tensor is NCHW. Where N is the batch size, " + "C is the number of input channels, " + "H is the height of the input feature map, and " + "W is the width."); + AddInput("ROIs", + "(LoDTensor), " + "ROIs (Regions of Interest) to pool over. " + "should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]. " + "where (x1, y1) is the top left coordinates, and " + "(x2, y2) is the bottom right coordinates. " + "The roi batch index can be calculated from LoD."); + AddOutput("Out", + "(Tensor), " + "the output of PSROIPoolOp is a 4-D Tensor with shape " + "(num_rois, output_channels, pooled_h, pooled_w)."); + AddAttr( + "output_channels", + "(int), " + "the number of channels of the output feature map. " + "For a task of C classes of objects, output_channels should be " + "(C + 1) for classification only."); + AddAttr("spatial_scale", + "(float, default 1.0), " + "Multiplicative spatial scale factor " + "to translate ROI coords from their input scale " + "to the scale used when pooling.") + .SetDefault(1.0); + AddAttr("pooled_height", + "(int, default 1), " + "the pooled output height.") + .SetDefault(1); + AddAttr("pooled_width", + "(int, default 1), " + "the pooled output width.") + .SetDefault(1); + AddComment(R"Doc( +**PSROIPool Operator** + +Position sensitive region of interest pooling (also known as PSROIPooling) is to perform +position-sensitive average pooling on regions of interest specified by input, takes as +input N position-sensitive score maps and a list of num_rois regions of interest. + +PSROIPooling for R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details. + )Doc"); + } +}; + +class PSROIPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of PSROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROIs"), + "Input(ROIs) of PSROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PSROIPoolOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + auto rois_dims = ctx->GetInputDim("ROIs"); + + PADDLE_ENFORCE(input_dims.size() == 4, + "The format of input tensor is NCHW"); + PADDLE_ENFORCE(rois_dims.size() == 2, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]"); + PADDLE_ENFORCE(rois_dims[1] == 4, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]"); + + int pooled_height = ctx->Attrs().Get("pooled_height"); + int pooled_width = ctx->Attrs().Get("pooled_width"); + int output_channels = ctx->Attrs().Get("output_channels"); + float spatial_scale = ctx->Attrs().Get("spatial_scale"); + + PADDLE_ENFORCE( + input_dims[1] == output_channels * pooled_height * pooled_width, + "the channel of X(%d) should be equal to the product of " + "output_channels(%d), pooled_height(%d) and pooled_width(%d)", + input_dims[1], output_channels, pooled_height, pooled_width); + + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled output height must be greater than 0"); + PADDLE_ENFORCE_GT(pooled_width, 0, + "The pooled output width must be greater than 0"); + PADDLE_ENFORCE_GT(output_channels, 1, + "The pooled output channels must greater than 1"); + PADDLE_ENFORCE_GT(spatial_scale, 0.0f, + "The spatial scale must greater than 0."); + + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = + output_channels; // input_dims[1] / (pooled_height * pooled_width); + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + ctx->SetOutputDim("Out", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class PSROIPoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); +REGISTER_OP_CPU_KERNEL( + psroi_pool, + ops::CPUPSROIPoolOpKernel, + ops::CPUPSROIPoolOpKernel); +REGISTER_OP_CPU_KERNEL( + psroi_pool_grad, + ops::CPUPSROIPoolGradOpKernel, + ops::CPUPSROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu new file mode 100644 index 0000000000..22fec3244f --- /dev/null +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -0,0 +1,294 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/psroi_pool_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void GPUPSROIPoolForward( + const int nthreads, const T* input_data, const T* input_rois, + const float spatial_scale, const int input_channels, const int height, + const int width, const int output_channels, const int pooled_height, + const int pooled_width, const int* rois_batch_id_data, T* output_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + const T* offset_input_data = + input_data + + (roi_batch_id * input_channels + input_channel) * height * width; + T outsum = 0; + + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + outsum += offset_input_data[input_index]; + } + } + + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + output_data[i] = is_empty ? 0. : outsum / bin_area; + } +} + +template +__global__ void GPUPSROIPoolBackward( + const int nthreads, const T* input_rois, const T* output_grad_data, + const float spatial_scale, const int input_channels, const int height, + const int width, const int output_channels, const int pooled_height, + const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_offset = + (roi_batch_id * input_channels + input_channel) * height * width; + T* offset_input_grad_data = input_grad_data + input_offset; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Accumulate diff_val into input data + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area; + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val); + } + } + } +} + +template +class GPUPSROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + int rois_num = rois->dims()[0]; + if (rois_num == 0) return; + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + "The rois_batch_size and input(X) batch_size must be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + + // set rois batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(platform::CPUPlace()); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + framework::Tensor rois_batch_id_list_gpu; + framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), + ctx.device_context(), &rois_batch_id_list_gpu); + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + // call cuda kernel function + GPUPSROIPoolForward< + T><<>>( + output_size, in->data(), rois->data(), spatial_scale, + input_channels, height, width, output_channels, pooled_height, + pooled_width, rois_batch_id_list_gpu.data(), + out->mutable_data(ctx.GetPlace())); + } +}; + +template +class GPUPSROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + int rois_num = rois->dims()[0]; + int input_channels = in->dims()[1]; + int height = in->dims()[2]; + int width = in->dims()[3]; + + if (input_grad) { + // set roi batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(platform::CPUPlace()); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + framework::Tensor rois_batch_id_list_gpu; + framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), + ctx.device_context(), &rois_batch_id_list_gpu); + + input_grad->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); + + int output_grad_size = output_grad->numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPUPSROIPoolBackward< + T><<>>( + output_grad_size, rois->data(), output_grad->data(), + spatial_scale, input_channels, height, width, output_channels, + pooled_height, pooled_width, rois_batch_id_list_gpu.data(), + input_grad->mutable_data(ctx.GetPlace())); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + psroi_pool, + ops::GPUPSROIPoolOpKernel, + ops::GPUPSROIPoolOpKernel); +REGISTER_OP_CUDA_KERNEL( + psroi_pool_grad, + ops::GPUPSROIPoolGradOpKernel, + ops::GPUPSROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h new file mode 100644 index 0000000000..1a424728f7 --- /dev/null +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -0,0 +1,253 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class CPUPSROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto output_channels = ctx.Attr("output_channels"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto in_stride = framework::stride(in_dims); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out->dims()); + + const T* input_data = in->data(); + + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(ctx.GetPlace()); + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + "the rois_batch_size and input(X) batch_size should be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num_with_lod, rois_num, + "the rois_num from input and lod must be the same"); + + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + T* output_data = out->mutable_data(ctx.GetPlace()); + const T* input_rois = rois->data(); + + // calculate psroipooling, parallel processing can be implemented per ROI + for (int n = 0; n < rois_num; ++n) { + // set roi batch id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = + static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = + static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small rois to be 1 x 1 + T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); + + // Compute bin size w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // calculate each pixel of the output feature map. + int out_roi_offset = n * out_stride[0]; + for (int c = 0; c < output_channels; ++c) { + // per category + int out_plane_offset = out_roi_offset + c * out_stride[1]; + for (int ph = 0; ph < pooled_height; ++ph) { + int out_row_offset = out_plane_offset + ph * out_stride[2]; + for (int pw = 0; pw < pooled_width; ++pw) { + // calculate w and h at input feature map + int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + wstart = std::min(std::max(wstart, 0), width); + hend = std::min(std::max(hend, 0), height); + wend = std::min(std::max(wend, 0), width); + + int output_index = out_row_offset + pw; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_plane_offset = + roi_batch_id * in_stride[0] + input_channel * in_stride[1]; + const T* offset_input_data = input_data + input_plane_offset; + T out_sum = 0.; + bool is_empty = (hend <= hstart) || (wend <= wstart); + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * in_stride[2] + iw; + out_sum += offset_input_data[input_index]; + } + } + T bin_area = (hend - hstart) * (wend - wstart); + output_data[output_index] = is_empty ? 0. : out_sum / bin_area; + } + } + } + } + return; + } +}; + +template +class CPUPSROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + if (input_grad) { + auto in_dims = in->dims(); + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + // set roi batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(ctx.GetPlace()); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + const T* input_rois = rois->data(); + const T* output_grad_data = output_grad->data(); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + + // set gradient of X to be 0. before backpropagate. + math::SetConstant set_zero; + set_zero(ctx.template device_context(), input_grad, + static_cast(0)); + + // backpropagate gradient per output pixel + int output_grad_size = output_grad->numel(); + for (int i = 0; i < output_grad_size; ++i) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_offset = + (roi_batch_id * input_channels + input_channel) * height * width; + T* offset_input_grad_data = input_grad_data + input_offset; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = + static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = + static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + hend = std::min(std::max(hend, 0), height); + wstart = std::min(std::max(wstart, 0), width); + wend = std::min(std::max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Accumulate diff_val into input data + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area; + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + offset_input_grad_data[input_index] += diff_val; + } + } + } + } + return; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e25eaaa9fd..3832cae8c3 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -173,6 +173,7 @@ __all__ = [ 'merge_selected_rows', 'get_tensor_from_selected_rows', 'lstm', + 'psroi_pool', ] kIgnoreIndex = -100 @@ -9122,3 +9123,57 @@ def get_tensor_from_selected_rows(x, name=None): outputs={'Out': out}, attrs={}) return out + + +@templatedoc() +def psroi_pool(input, + rois, + output_channels, + spatial_scale, + pooled_height, + pooled_width, + name=None): + """ + ${comment} + + Args: + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. + output_channels (integer): ${output_channels_comment} + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 + pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + name (str, default None): The name of this layer. + + Returns: + Variable: ${out_comment}. + + Examples: + .. code-block:: python + + pool_out = fluid.layers.psroi_pool(input=x, rois=rois, 490, 1.0, 7, 7) + """ + helper = LayerHelper('psroi_pool', **locals()) + # check attrs + if not isinstance(output_channels, int): + raise TypeError("output_channels must be int type") + if not isinstance(spatial_scale, float): + raise TypeError("spatial_scale must be float type") + if not isinstance(pooled_height, int): + raise TypeError("pooled_height must be int type") + if not isinstance(pooled_width, int): + raise TypeError("pooled_width must be int type") + dtype = helper.input_dtype() + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='psroi_pool', + inputs={'X': input, + 'ROIs': rois}, + outputs={'Out': out}, + attrs={ + 'output_channels': output_channels, + 'spatial_scale': spatial_scale, + 'pooled_height': pooled_height, + 'pooled_width': pooled_width + }) + return out diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 10e8bb5a86..fb3e4da1ef 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -511,6 +511,16 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) + def test_psroi_pool(self): + program = Program() + with program_guard(program): + x = layers.data(name="x", shape=[245, 30, 30], dtype="float32") + rois = layers.data( + name="rois", shape=[4], dtype="float32", lod_level=1) + output = layers.psroi_pool(x, rois, 5, 0.25, 7, 7) + self.assertIsNotNone(output) + print(str(program)) + def test_roi_align(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py new file mode 100644 index 0000000000..abe014a38c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py @@ -0,0 +1,134 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import math +import numpy as np +import unittest +from op_test import OpTest + + +class TestPSROIPoolOp(OpTest): + def set_data(self): + self.init_test_case() + self.make_rois() + self.calc_psroi_pool() + self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)} + self.attrs = { + 'output_channels': self.output_channels, + 'spatial_scale': self.spatial_scale, + 'pooled_height': self.pooled_height, + 'pooled_width': self.pooled_width + } + self.outputs = {'Out': self.outs} + + def init_test_case(self): + self.batch_size = 3 + self.channels = 3 * 2 * 2 + self.height = 6 + self.width = 4 + + self.x_dim = [self.batch_size, self.channels, self.height, self.width] + + self.spatial_scale = 1.0 / 4.0 + self.output_channels = 3 + self.pooled_height = 2 + self.pooled_width = 2 + + self.x = np.random.random(self.x_dim).astype('float32') + + def make_rois(self): + rois = [] + self.rois_lod = [[]] + for bno in range(self.batch_size): + self.rois_lod[0].append(bno + 1) + for i in range(bno + 1): + x1 = np.random.random_integers( + 0, self.width // self.spatial_scale - self.pooled_width) + y1 = np.random.random_integers( + 0, self.height // self.spatial_scale - self.pooled_height) + + x2 = np.random.random_integers(x1 + self.pooled_width, + self.width // self.spatial_scale) + y2 = np.random.random_integers( + y1 + self.pooled_height, self.height // self.spatial_scale) + roi = [bno, x1, y1, x2, y2] + rois.append(roi) + self.rois_num = len(rois) + self.rois = np.array(rois).astype('float32') + + def calc_psroi_pool(self): + output_shape = (self.rois_num, self.output_channels, self.pooled_height, + self.pooled_width) + out_data = np.zeros(output_shape) + for i in range(self.rois_num): + roi = self.rois[i] + roi_batch_id = int(roi[0]) + roi_start_w = round(roi[1]) * self.spatial_scale + roi_start_h = round(roi[2]) * self.spatial_scale + roi_end_w = (round(roi[3]) + 1.) * self.spatial_scale + roi_end_h = (round(roi[4]) + 1.) * self.spatial_scale + + roi_height = max(roi_end_h - roi_start_h, 0.1) + roi_width = max(roi_end_w - roi_start_w, 0.1) + + bin_size_h = roi_height / float(self.pooled_height) + bin_size_w = roi_width / float(self.pooled_width) + + x_i = self.x[roi_batch_id] + + for c in range(self.output_channels): + for ph in range(self.pooled_height): + for pw in range(self.pooled_width): + hstart = int( + math.floor(float(ph) * bin_size_h + roi_start_h)) + wstart = int( + math.floor(float(pw) * bin_size_w + roi_start_w)) + hend = int( + math.ceil( + float(ph + 1) * bin_size_h + roi_start_h)) + wend = int( + math.ceil( + float(pw + 1) * bin_size_w + roi_start_w)) + hstart = min(max(hstart, 0), self.height) + hend = min(max(hend, 0), self.height) + wstart = min(max(wstart, 0), self.width) + wend = min(max(wend, 0), self.width) + + c_in = (c * self.pooled_height + ph + ) * self.pooled_width + pw + is_empty = (hend <= hstart) or (wend <= wstart) + out_sum = 0. + for ih in range(hstart, hend): + for iw in range(wstart, wend): + out_sum += x_i[c_in, ih, iw] + bin_area = (hend - hstart) * (wend - wstart) + out_data[i, c, ph, pw] = 0. if is_empty else ( + out_sum / float(bin_area)) + self.outs = out_data.astype('float32') + + def setUp(self): + self.op_type = 'psroi_pool' + self.set_data() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main() From 7b10bf0e60e9ac0f56ff532fe58cbf5c538a81b6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 13 Dec 2018 16:40:51 +0800 Subject: [PATCH 295/719] Use mkl --- .../fluid/operators/hierarchical_sigmoid_op.h | 28 ++++++++++++------- paddle/fluid/operators/math/blas.h | 8 ++++++ paddle/fluid/operators/math/blas_impl.h | 21 ++++++++++++++ paddle/fluid/platform/dynload/mklml.h | 2 ++ 4 files changed, 49 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index b73a32af89..d212e6f843 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -150,19 +150,27 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { label.data())); } - auto& place = *ctx.template device_context().eigen_device(); - auto pre_out_mat = EigenMatrix::From(pre_out); - auto pre_out_grad_mat = EigenMatrix::From(pre_out_grad); - auto out_grad_mat = EigenMatrix::From(out_grad); + // softrelu derivative - Eigen::array bcast{1, static_cast(pre_out_grad.dims()[1])}; + auto blas = math::GetBlas(ctx); - // softrelu derivative - pre_out_grad_mat.device(place) = - static_cast(1.0) - static_cast(1.0) / pre_out_mat.exp(); + auto* pre_out_grad_data = pre_out_grad.data(); + auto* pre_out_data = pre_out.data(); + auto n = pre_out.numel(); + blas.VEXP(n, pre_out_data, pre_out_grad_data); + blas.VINV(n, pre_out_grad_data, pre_out_grad_data); + for (int64_t i = 0; i < n; ++i) { + pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]; + } bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) - pre_out_grad_mat.device(place) = - pre_out_grad_mat * out_grad_mat.broadcast(bcast); + auto* out_grad_data = out_grad.data(); + + int64_t dim0 = pre_out_grad.dims()[0]; + int64_t dim1 = pre_out_grad.dims()[1]; + for (int64_t i = 0; i < dim0; ++i) { + T tmp = out_grad_data[i]; + blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1); + } // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to // be consistent with the clipping in forward. diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 9f3a81f22c..f67f57827b 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -181,6 +181,9 @@ class Blas { const framework::Tensor& mat_b, const MatDescriptor& dim_b, T alpha, framework::Tensor* mat_out, T beta) const; + template + void VINV(int n, const T* a, T* y) const; + private: const DeviceContext& context_; }; @@ -282,6 +285,11 @@ class BlasT : private Blas { Base()->template BatchedGEMM(args...); } + template + void VINV(ARGS... args) const { + Base()->template VINV(args...); + } + private: const Blas* Base() const { return static_cast*>(this); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index c84087bb1e..972366bc09 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -118,6 +118,11 @@ struct CBlas { static void VPOW(ARGS... args) { platform::dynload::vsPowx(args...); } + + template + static void VINV(ARGS... args) { + platform::dynload::vsInv(args...); + } }; template <> @@ -213,6 +218,11 @@ struct CBlas { static void VPOW(ARGS... args) { platform::dynload::vdPowx(args...); } + + template + static void VINV(ARGS... args) { + platform::dynload::vdInv(args...); + } }; #else @@ -603,6 +613,17 @@ void Blas::MatMul(const framework::Tensor &mat_a, dim_a.stride_, dim_b.stride_); } } +template +template +void Blas::VINV(int n, const T *a, T *y) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VINV(n, a, y); +#else + for (int i = 0; i < n; ++i) { + y[i] = 1.0 / a[i]; + } +#endif +} } // namespace math } // namespace operators diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index f0a9736623..c3f9433503 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -82,6 +82,8 @@ extern void* mklml_dso_handle; __macro(vdSqr); \ __macro(vsPowx); \ __macro(vdPowx); \ + __macro(vsInv); \ + __macro(vdInv); \ __macro(MKL_Set_Num_Threads) MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); From 47ea2534fb9cac31f1b5c15c54112e6105810cb1 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 13 Dec 2018 17:12:38 +0800 Subject: [PATCH 296/719] clean parallel do test=develop --- .../operators/controlflow/parallel_do_op.cc | 426 ------------------ python/paddle/fluid/backward.py | 79 +--- python/paddle/fluid/framework.py | 4 +- python/paddle/fluid/layers/control_flow.py | 152 +------ .../tests/book/notest_understand_sentiment.py | 18 +- .../fluid/tests/book/test_recognize_digits.py | 15 +- .../paddle/fluid/tests/book/test_word2vec.py | 14 +- .../test_memopt_fit_a_line.py | 87 ---- .../fluid/tests/unittests/test_parallel_op.py | 235 ---------- .../memory_optimization_transpiler.py | 5 +- 10 files changed, 10 insertions(+), 1025 deletions(-) delete mode 100644 paddle/fluid/operators/controlflow/parallel_do_op.cc delete mode 100644 python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py delete mode 100644 python/paddle/fluid/tests/unittests/test_parallel_op.py diff --git a/paddle/fluid/operators/controlflow/parallel_do_op.cc b/paddle/fluid/operators/controlflow/parallel_do_op.cc deleted file mode 100644 index ab25628d45..0000000000 --- a/paddle/fluid/operators/controlflow/parallel_do_op.cc +++ /dev/null @@ -1,426 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/safe_ref.h" - -namespace paddle { -namespace operators { - -static constexpr char kInputs[] = "inputs"; -static constexpr char kParameters[] = "parameters"; -static constexpr char kPlaces[] = "places"; - -static constexpr char kOutputs[] = "outputs"; -static constexpr char kParallelScopes[] = "parallel_scopes"; - -static constexpr char kParallelBlock[] = "sub_block"; -static constexpr char kUseNCCL[] = "use_nccl"; - -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; - -static void SplitTensorAndMoveTensorToScopes( - const framework::Scope &scope, std::vector *sub_scopes, - const std::vector &places, - const std::vector &names) { - size_t num_sub_scopes = 0; - for (auto &argu : names) { - const auto &tensor = - detail::Ref(scope.FindVar(argu), - "Cannot find variable %s in the parent scope", argu) - .Get(); - auto lod_tensors = tensor.SplitLoDTensor(places); - - for (auto &lod : lod_tensors) { - VLOG(3) << lod.dims(); - } - if (num_sub_scopes == 0) { - num_sub_scopes = lod_tensors.size(); - } else { - PADDLE_ENFORCE_EQ(num_sub_scopes, lod_tensors.size()); - } - PADDLE_ENFORCE_NE(num_sub_scopes, 0); - if (sub_scopes->size() == 0) { - sub_scopes->reserve(num_sub_scopes); - for (size_t i = 0; i < num_sub_scopes; ++i) { - sub_scopes->emplace_back(&scope.NewScope()); - } - } - - for (size_t i = 0; i < lod_tensors.size(); ++i) { - *detail::Ref(sub_scopes->at(i)->Var(argu), - "Cannot find variable in the sub-scope", argu) - .GetMutable() = lod_tensors[i]; - } - } -} - -inline void CopyOrShare(const framework::Variable &src, - const platform::Place &dst_place, - framework::Variable *dst) { - if (src.IsType()) { - if (src.Get().place() == dst_place) { - dst->GetMutable()->ShareDataWith(src.Get()); - dst->GetMutable()->set_lod(src.Get().lod()); - } else { - TensorCopy(src.Get(), dst_place, dst->GetMutable()); - } - } else if (src.IsType()) { - auto &src_sr = src.Get(); - auto *dst_sr = dst->GetMutable(); - dst_sr->set_height(src_sr.height()); - if (src_sr.value().place() == dst_place) { - dst_sr->mutable_value()->ShareDataWith(src_sr.value()); - dst_sr->set_rows(src_sr.rows()); - } else { - TensorCopy(src_sr.value(), dst_place, dst_sr->mutable_value()); - } - } else { - PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name()); - } -} - -void WaitOnPlace(const platform::Place place) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - dev_ctx.Wait(); -} - -void WaitOnPlaces(const std::vector places) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - - for (auto &place : places) { - auto &dev_ctx = *pool.Get(place); - dev_ctx.Wait(); - } -} - -class ParallelDoOp : public framework::OperatorBase { - public: - ParallelDoOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - auto *block = Attr(kParallelBlock); - auto *program = block->Program(); - - auto &places = scope.FindVar(Input(kPlaces))->Get(); - - auto &sub_scopes = *scope.FindVar(Output(kParallelScopes)) - ->GetMutable>(); - - // split input - SplitTensorAndMoveTensorToScopes(scope, &sub_scopes, places, - Inputs(kInputs)); - - // copy parameter - for (auto ¶m : Inputs(kParameters)) { - PADDLE_ENFORCE(scope.FindVar(param)->IsType(), - "Only support parameter type as LoDTensor"); - auto &src = scope.FindVar(param)->Get(); - - auto *sub_scope0 = sub_scopes[0]; - auto *dst0 = sub_scope0->Var(param)->GetMutable(); - dst0->ShareDataWith(src); - - for (size_t i = 1; i < sub_scopes.size(); ++i) { - auto &place = places[i]; - auto *sub_scope = sub_scopes[i]; - auto *dst = sub_scope->Var(param)->GetMutable(); - framework::TensorCopy(src, place, dst); - } - } - WaitOnPlaces(places); - - std::vector> workers; - workers.reserve(places.size()); - for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) { - auto &place = places[place_idx]; - auto *cur_scope = sub_scopes[place_idx]; - - workers.emplace_back(framework::Async([program, cur_scope, place, block] { - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); - } - for (auto &worker : workers) { - worker.wait(); - } - WaitOnPlaces(places); - - // merge output - for (auto &o_name : Outputs(kOutputs)) { - std::vector lod_tensors; - lod_tensors.reserve(sub_scopes.size()); - for (auto *sub_scope : sub_scopes) { - lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get()); - } - - auto *lod_tensor_to_be_merged = - scope.FindVar(o_name)->GetMutable(); - lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace()); - } - WaitOnPlaces(places); - } -}; - -class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(kInputs, "").AsDuplicable(); - AddInput(kParameters, "").AsDuplicable(); - AddInput(kPlaces, ""); - AddOutput(kOutputs, "").AsDuplicable(); - AddOutput(kParallelScopes, ""); - AddAttr(kParallelBlock, ""); - AddAttr(kUseNCCL, "true if we use nccl on backward") - .SetDefault(false); - AddComment(R"DOC( -ParallelDo Operator. -)DOC"); - } -}; - -class ParallelDoGradOp : public framework::OperatorBase { - public: - ParallelDoGradOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto *block = Attr(kParallelBlock); - auto *program = block->Program(); - - auto &sub_scopes = scope.FindVar(Input(kParallelScopes)) - ->Get>(); - auto &places = scope.FindVar(Input(kPlaces))->Get(); - - // feed output@grad - SplitTensorAndMoveTensorToScopes( - scope, const_cast *>(&sub_scopes), - places, Inputs(framework::GradVarName(kOutputs))); - WaitOnPlaces(places); - - // exe run - std::vector> workers; - for (size_t i = 0; i < sub_scopes.size(); ++i) { - auto &place = places[i]; - auto *cur_scope = sub_scopes[i]; - - // execute - workers.emplace_back(framework::Async([program, cur_scope, place, block] { - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); - } - for (auto &worker : workers) { - worker.wait(); - } - WaitOnPlaces(places); - - // NCCL allreduce op will be added by backward, - // so no need to explicitly accumulate grad - if (!(Attr(kUseNCCL))) { - AccumulateGrad(scope, place, sub_scopes, places); - } else { - for (auto &place : places) { - PADDLE_ENFORCE(platform::is_gpu_place(place), - "NCCL only supports cuda place"); - } - } - for (auto &s : Outputs(framework::GradVarName(kParameters))) { - if (s == framework::kEmptyVarName) { - continue; - } - VLOG(3) << "Moving " << s; - CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); - } - WaitOnPlaces(places); - } - - void AccumulateGrad(const framework::Scope &scope, - const platform::Place &place, - const std::vector &sub_scopes, - const platform::PlaceList &places) const { - for (auto &s : Outputs(framework::GradVarName(kParameters))) { - if (s == framework::kEmptyVarName) { - continue; - } - VLOG(3) << "Accumulating " << s; - if (s == framework::kEmptyVarName) continue; - std::string tmp_name; - auto *tmp = sub_scopes[0]->Var(&tmp_name); - - for (size_t i = 1; i < sub_scopes.size(); ++i) { - CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp); - WaitOnPlaces(places); - - auto sum_op = framework::OpRegistry::CreateOp( - "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, - framework::AttributeMap{{"use_mkldnn", {false}}}); - VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]); - sum_op->Run(*sub_scopes[0], places[0]); - WaitOnPlace(places[0]); - } - - CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); - } - WaitOnPlaces(places); - } -}; - -std::ostream &operator<<(std::ostream &sout, - const std::vector &strs) { - std::copy(strs.begin(), strs.end(), - std::ostream_iterator(sout, ",")); - return sout; -} - -class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - virtual std::unique_ptr Apply() const { - auto *grad = new framework::OpDesc(); - grad->SetType("parallel_do_grad"); - for (auto &input_param : this->InputNames()) { - VLOG(3) << input_param; - grad->SetInput(input_param, this->Input(input_param)); - if (input_param != kPlaces) { - grad->SetOutput(framework::GradVarName(input_param), - this->InputGrad(input_param, false)); - } - } - auto *g_block = this->grad_block_[0]; - - // All variable name that needed by gradient operators - std::unordered_set all_inputs_in_grad_blocks; - - for (size_t i = 0; i < g_block->OpSize(); ++i) { - auto *op = g_block->Op(i); - for (auto &var_name : op->InputArgumentNames()) { - all_inputs_in_grad_blocks.insert(var_name); - } - } - - for (auto &output_param : this->OutputNames()) { - if (output_param == kParallelScopes) { - grad->SetInput(output_param, this->Output(output_param)); - grad->SetInput(framework::GradVarName(output_param), - this->Output(output_param)); - } else { - grad->SetInput(output_param, this->Output(output_param)); - std::vector og_names; - for (auto &og_name : this->OutputGrad(output_param)) { - if (all_inputs_in_grad_blocks.count(og_name) != 0) { - // there are some gradient operators who need the OG. So make this - // OG as an input of parallel.do - og_names.push_back(og_name); - } - // else, there is no operator who need the OG. Do not use this OG as - // an input - } - grad->SetInput(framework::GradVarName(output_param), og_names); - } - } - grad->SetInput("Communicator", {"nccl_com__do_not_change_"}); - grad->SetAttrMap(this->Attrs()); - grad->SetBlockAttr(kParallelBlock, grad_block_[0]); - - return std::unique_ptr(grad); - } -}; - -class ParallelDoGradOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInputs(kParameters)); - PADDLE_ENFORCE(ctx->HasInputs(kInputs)); - PADDLE_ENFORCE(ctx->HasInputs(kOutputs)); - - ctx->SetOutputsDim(framework::GradVarName(kParameters), - ctx->GetInputsDim(kParameters)); - - auto i_dims = ctx->GetInputsDim(kInputs); - auto ig_names = ctx->Outputs(framework::GradVarName(kInputs)); - - for (size_t i = 0; i < ig_names.size(); ++i) { - auto &ig_name = ig_names[i]; - if (ig_name == framework::kEmptyVarName) { - continue; - } - - ctx->SetDims({ig_name}, {i_dims[i]}); - } - - auto p_dims = ctx->GetInputsDim(kParameters); - auto pg_names = ctx->Outputs(framework::GradVarName(kParameters)); - for (size_t i = 0; i < pg_names.size(); ++i) { - auto &pg_name = pg_names[i]; - if (pg_name == framework::kEmptyVarName) { - continue; - } - ctx->SetDims({pg_name}, {p_dims[i]}); - } - } -}; - -class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - framework::BlockDesc *sub_block = - boost::get(op_desc.GetAttr(kParallelBlock)); - for (auto &out_vars : op_desc.Outputs()) { - for (auto &out_var : out_vars.second) { - auto &var = block->FindRecursiveOrCreateVar(out_var); - auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var); - if (sub_var.GetType() != var.GetType()) { - var.SetType(sub_var.GetType()); - } - } - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, - paddle::operators::ParallelDoOpProtoMaker, - paddle::operators::ParallelDoGradOpDescMaker); -REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, - paddle::operators::ParallelDoGradOpShapeInference, - paddle::operators::ParallelDoGradOpVarTypeInference); diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 17fe8dc3c8..b2c3e7c989 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -249,69 +249,6 @@ def serialize_op_decs(op_desc): return proto.__str__() -def _callback_lookup_(op): - """ - Only used in _append_backward_ops_ - Build and returns a callback function for certain op. For example - - parallel_do: AllReduce - - :param op: - :return: callback function - """ - if op.type == 'parallel_do' and op.attr('use_nccl'): - all_vars = op.block.vars - param_names = set(op.input('parameters')) - param_names = [ - name for name in param_names - if all_vars[name].stop_gradient is False - ] - param_grad_names = [n + "@GRAD" for n in param_names] - - class ParallelDoCallBack(object): - def __init__(self, param_grad_names, parallel_scopes_name): - self.has_inserted_nccl_init = False - self.param_grad_names = param_grad_names - self.parallel_scopes_name = parallel_scopes_name - - def __call__(self, block, context): - if not self.has_inserted_nccl_init: - op_desc = _create_op_desc_( - "ncclInit", - {"parallel_scopes": self.parallel_scopes_name}, - {"Communicator": ['nccl_com__do_not_change_']}, {}) - block.program.global_block().desc.append_op().copy_from( - op_desc) - self.has_inserted_nccl_init = True - - current_op_desc = context["__current_op_desc__"] - for o_param in current_op_desc.output_names(): - for o_argu in current_op_desc.output(o_param): - if o_argu in self.param_grad_names: - allreduce_out_name = o_argu + "__nccl_all_reduce__" - op_desc = _create_op_desc_( - "ncclReduce", - { - "X": [o_argu], - "Communicator": - ['nccl_com__do_not_change_'] - }, - {"Out": [allreduce_out_name]}, - {"reduction": "ncclSum", - "root": 0}, ) - block.desc.append_op().copy_from(op_desc) - - op_desc = _create_op_desc_( - "assign", {"X": [allreduce_out_name]}, - {"Out": [o_argu]}, {}) - block.desc.append_op().copy_from(op_desc) - - return ParallelDoCallBack(param_grad_names, - op.output("parallel_scopes")) - else: - return None - - def _append_backward_ops_(block, ops, target_block, @@ -349,17 +286,8 @@ def _append_backward_ops_(block, sub_block = program.block(op._block_attr_id("sub_block")) grad_sub_block = program._create_block() grad_sub_block._set_forward_block_idx(sub_block.idx) - cb = _callback_lookup_(op) - if cb is not None: - if callbacks is None: - new_callbacks = [cb] - else: - new_callbacks = callbacks + [_callback_lookup_(op)] - _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, - no_grad_dict, grad_to_var, new_callbacks) - else: - _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, - no_grad_dict, grad_to_var, callbacks) + _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, + no_grad_dict, grad_to_var, callbacks) program._rollback() grad_sub_block_list.append(grad_sub_block.desc) @@ -424,9 +352,6 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): # infer_shape and infer_type op_desc.infer_var_type(block.desc) op_desc.infer_shape(block.desc) - # ncclInit dones't need to set data_type - if op_desc.type() == 'ncclInit': - continue for arg in op_desc.output_arg_names(): if arg in new_vars: _infer_var_data_type_(arg, block) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 0897920594..d0bd78454d 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -563,8 +563,8 @@ class Operator(object): OP_WITHOUT_KERNEL_SET = { 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', - 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', - 'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id' + 'listen_and_serv', 'save_combine', 'load_combine', 'ncclInit', 'select', + 'checkpoint_notify', 'gen_nccl_id' } def __init__(self, diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index b7e3968569..21454370dd 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -226,156 +226,6 @@ class BlockGuard(object): return True -class ParallelDo(object): - """ - ParallelDo is used to represent multi-thread data parallel processing. - - Its vanilla implementation can be shown as the following (:math:`|` means - single thread and :math:`||||` means multiple threads) - - .. code-block:: text - - In the forward pass - | Split input onto different devices - | Copy parameter onto different devices - |||| Compute forward pass in parallel - | Merge output from different devices - - In the backward pass - | Split output@grad onto different devices - |||| Compute backward pass in parallel - | accumulate param@grad from different devices to the first device - | Merge input@grad from different devices - | Copy param@grad to the place of parallel_do_op - - Examples: - - .. code-block:: python - - images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - # ParallelDo version & Single-thread version - if thread_num > 1: - places = fluid.layers.get_places(thread_num) - pd = fluid.layers.control_flow.ParallelDo(places) - with pd.do(): - images = pd.read_input(images) - label = pd.read_input(label) - predict = cnn_model(images) - cost = fluid.layers.cross_entropy(input=predict, label=label) - - avg_cost = fluid.layers.mean(x=cost) - pd.write_output(avg_cost) - - avg_cost = pd() - avg_cost = fluid.layers.mean(avg_cost) - else: - predict = cnn_model(images) - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - .. warning:: - - It will be soon deprecated, please use ParallelExecutor instead. - """ - - def __init__(self, places, use_nccl=False, name=None): - warnings.warn( - "API ParallelDo is deprecated since 0.15.0. Please use ParallelExecutor instead.", - Warning) - self.helper = LayerHelper("parallel_do", name=name) - self.inputs = [] - self.places = places - self.outputs = [] - self.status = StaticRNN.BEFORE_RNN_BLOCK - self.use_nccl = use_nccl - - def do(self): - return BlockGuardWithCompletion(self) - - def parent_block(self): - prog = self.helper.main_program - parent_idx = prog.current_block().parent_idx - assert parent_idx >= 0 - parent_block = prog.block(parent_idx) - return parent_block - - def __call__(self, *args, **kwargs): - if self.status != StaticRNN.AFTER_RNN_BLOCK: - raise ValueError("RNN output can only be retrieved after rnn block") - if len(self.outputs) == 0: - raise ValueError("RNN has no output") - elif len(self.outputs) == 1: - return self.outputs[0] - else: - return self.outputs - - def read_input(self, var): - self.inputs.append(var) - return var - - def write_output(self, var): - self.outputs.append(var) - - def get_parameters(self): - main_program = self.helper.main_program - current_block = main_program.current_block() - parent_block = self.parent_block() - - local_inputs = set() - params = list() - for var in self.inputs: - local_inputs.add(var.name) - - for op in current_block.ops: - for iname in op.input_names: - for in_var_name in op.input(iname): - if in_var_name not in local_inputs: - params.append(in_var_name) - - for oname in op.output_names: - for out_var_name in op.output(oname): - local_inputs.add(out_var_name) - - params = list(set(params)) - - return [parent_block.var(name) for name in params] - - def _complete_op(self): - main_program = self.helper.main_program - current_block = main_program.current_block() - parent_block = self.parent_block() - - step_scope = parent_block.create_var( - type=core.VarDesc.VarType.STEP_SCOPES) - - self.outputs = [ - parent_block.create_var( - name=o.name, - shape=o.shape, - dtype=o.dtype, - lod_level=o.lod_level, - persistable=o.persistable, - stop_gradient=o.stop_gradient) for o in self.outputs - ] - - inputs = [parent_block.var(i.name) for i in self.inputs] - outputs = [parent_block.var(o.name) for o in self.outputs] - - parent_block.append_op( - type='parallel_do', - inputs={ - 'inputs': inputs, - 'parameters': self.get_parameters(), - 'places': self.places - }, - outputs={'outputs': outputs, - 'parallel_scopes': [step_scope]}, - attrs={'sub_block': current_block, - 'use_nccl': self.use_nccl}) - - class BlockGuardWithCompletion(BlockGuard): """ BlockGuardWithCompletion class. @@ -384,7 +234,7 @@ class BlockGuardWithCompletion(BlockGuard): """ def __init__(self, rnn): - if not (isinstance(rnn, StaticRNN) or isinstance(rnn, ParallelDo)): + if not isinstance(rnn, StaticRNN): raise TypeError( "BlockGuardWithCompletion takes a StaticRNN or ParallelDo") super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program) diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index a666507bd9..5658bb4ec4 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -15,7 +15,6 @@ from __future__ import print_function from paddle.fluid.layers.device import get_places -from paddle.fluid.layers.control_flow import ParallelDo import unittest import paddle.fluid as fluid import paddle @@ -147,22 +146,7 @@ def train(word_dict, cost, acc_out, prediction = net_method( data, label, input_dim=dict_dim, class_dim=class_dim) else: - places = get_places() - pd = ParallelDo(places) - with pd.do(): - cost, acc, _ = net_method( - pd.read_input(data), - pd.read_input(label), - input_dim=dict_dim, - class_dim=class_dim) - pd.write_output(cost) - pd.write_output(acc) - - cost, acc = pd() - cost = fluid.layers.mean(cost) - acc_out = fluid.layers.mean(acc) - prediction = None - assert save_dirname is None + raise NotImplementedError() adagrad = fluid.optimizer.Adagrad(learning_rate=0.002) adagrad.minimize(cost) diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 4a70976a48..54936519ce 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -25,7 +25,6 @@ import numpy import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places -from paddle.fluid.layers.control_flow import ParallelDo BATCH_SIZE = 64 @@ -82,19 +81,7 @@ def train(nn_type, net_conf = conv_net if parallel: - places = get_places() - pd = ParallelDo(places) - with pd.do(): - img_ = pd.read_input(img) - label_ = pd.read_input(label) - prediction, avg_loss, acc = net_conf(img_, label_) - for o in [avg_loss, acc]: - pd.write_output(o) - - avg_loss, acc = pd() - # get mean loss and acc through every devices. - avg_loss = fluid.layers.mean(avg_loss) - acc = fluid.layers.mean(acc) + raise NotImplementedError() else: prediction, avg_loss, acc = net_conf(img, label) diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 9191f0fc20..08f70c9cab 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -17,7 +17,6 @@ from __future__ import print_function import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places -from paddle.fluid.layers.control_flow import ParallelDo import unittest import os import numpy as np @@ -84,18 +83,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): avg_cost, predict_word = __network__( [first_word, second_word, third_word, forth_word, next_word]) else: - places = get_places() - pd = ParallelDo(places) - with pd.do(): - avg_cost, predict_word = __network__( - list( - map(pd.read_input, [ - first_word, second_word, third_word, forth_word, - next_word - ]))) - pd.write_output(avg_cost) - - avg_cost = fluid.layers.mean(pd()) + raise NotImplementedError() sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py deleted file mode 100644 index dab2a52bc9..0000000000 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import math -import sys - -import paddle -import paddle.fluid as fluid -from paddle.fluid.layers.device import get_places -from paddle.fluid.layers.control_flow import ParallelDo - -# need to fix random seed and training data to compare the loss -# value accurately calculated by the default and the memory optimization -# version. -fluid.default_startup_program().random_seed = 111 - -x = fluid.layers.data(name='x', shape=[13], dtype='float32') -y = fluid.layers.data(name='y', shape=[1], dtype='float32') - -device_type = 'CPU' -use_nccl = False -place = fluid.CPUPlace() -if fluid.core.is_compiled_with_cuda(): - device_type = 'CUDA' - use_nccl = False - place = fluid.CUDAPlace(0) - -places = get_places(device_count=0, device_type=device_type) -pd = ParallelDo(places, use_nccl=use_nccl) -with pd.do(): - x_ = pd.read_input(x) - y_ = pd.read_input(y) - y_predict = fluid.layers.fc(input=x_, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y_) - avg_cost = fluid.layers.mean(x=cost) - pd.write_output(avg_cost) - -cost = pd() -avg_cost = fluid.layers.mean(x=cost) -sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) -sgd_optimizer.minimize(avg_cost) - -fluid.memory_optimize(fluid.default_main_program(), print_log=True) -# fluid.release_memory(fluid.default_main_program()) - -BATCH_SIZE = 200 - -# fix the order of training data -train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False) - -# train_reader = paddle.batch( -# paddle.reader.shuffle( -# paddle.dataset.uci_housing.train(), buf_size=500), -# batch_size=BATCH_SIZE) - -feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) -exe = fluid.Executor(place) - -exe.run(fluid.default_startup_program()) - -PASS_NUM = 100 -for pass_id in range(PASS_NUM): - for data in train_reader(): - avg_loss_value, = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) - - if avg_loss_value[0] < 10.0: - exit(0) # if avg cost less than 10.0, we think our code is good. - print(avg_loss_value[0]) - if math.isnan(float(avg_loss_value)): - sys.exit("got NaN loss, training failed.") -exit(1) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py deleted file mode 100644 index 380e172844..0000000000 --- a/python/paddle/fluid/tests/unittests/test_parallel_op.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest - -import paddle.fluid as fluid -from paddle.fluid.layers.device import get_places -from paddle.fluid.layers.control_flow import ParallelDo -import paddle.fluid.profiler as profiler -import numpy -import six - - -class BaseParallelForTest(unittest.TestCase): - def run_test(self, callback, feed, fetch): - """ - Run the unittest for parallel.for - Args: - callback(callable): A callable function returns a generator. There - are two yields in the generator function. The first yield - returns the data layers, and the second yield returns the loss. - The modified data variables will be sent back during the first - yield. - - feed(dict): The executor feeding dictionary. - fetch(list|basestr): The fetch name lists. - - Returns: - None - - Raises: - AssertionError when the computation of cpu, parallel.for in cpu, - gpu, parallel.for in gpu are different. - - """ - cpu = fluid.CPUPlace() - result_cpu = self._run_test_impl_( - callback=callback, - feed=feed, - fetch=fetch, - place=cpu, - use_parallel=False) - result_cpu_parallel = self._run_test_impl_( - callback=callback, - feed=feed, - fetch=fetch, - place=cpu, - use_parallel=True) - if fluid.core.is_compiled_with_cuda(): - gpu = fluid.CUDAPlace(0) - result_gpu = self._run_test_impl_( - callback=callback, - feed=feed, - fetch=fetch, - place=gpu, - use_parallel=False, - use_gpu=True) - result_gpu_parallel = self._run_test_impl_( - callback=callback, - feed=feed, - fetch=fetch, - place=gpu, - use_parallel=True, - use_gpu=True) - result_gpu_nccl = self._run_test_impl_( - callback=callback, - feed=feed, - fetch=fetch, - place=gpu, - use_parallel=True, - use_nccl=True, - use_gpu=True) - self._assert_same_(fetch, result_cpu, result_cpu_parallel, - result_gpu, result_gpu_parallel, result_gpu_nccl) - else: - self._assert_same_(fetch, result_cpu, result_cpu_parallel) - - def _run_test_impl_(self, - callback, - feed, - fetch, - place, - use_parallel=False, - use_nccl=False, - use_gpu=False): - """ - Run a single test, returns the fetch values - Args: - place(Place): the computation place. - use_parallel(bool): Whether use parallel.for or not. - - Returns: - Fetched numpy arrays. - - """ - if isinstance(fetch, six.string_types): - fetch = [fetch] - main = fluid.Program() - startup = fluid.Program() - # Fix seed - main.random_seed = 10 - startup.random_seed = 10 - - with fluid.program_guard(main, startup): - generator = callback() - # Automatically insert parallel do if use_parallel = True - if use_parallel: - thread_num = fluid.core.get_cuda_device_count( - ) if use_gpu else 8 - places = get_places(thread_num) - pd = ParallelDo(places, use_nccl=use_nccl) - data = next(generator) - - if isinstance(data, fluid.framework.Variable): - data = [data] - - with pd.do(): - ins = list(map(pd.read_input, data)) - if len(ins) == 1: - ins = ins[0] - loss = generator.send(ins) # patch input - pd.write_output(loss) - - loss = pd() - else: - data = next(generator) - loss = generator.send(data) - self.assertIsNotNone(loss) - avg_loss = fluid.layers.mean(loss) - fluid.backward.append_backward(loss=avg_loss) - - exe = fluid.Executor(place) - exe.run(startup) - if use_gpu: - profile_type = 'GPU' - else: - profile_type = 'CPU' - with profiler.profiler(profile_type, 'total', '/tmp/profiler'): - return exe.run(main, feed=feed, fetch_list=fetch) - - def _assert_same_(self, fetch, *args): - """ - Assert the return values of `run_test` are same. - Args: - fetch: Fetch list. Used for print error message - *args: The fetch result lists of each situations. - - Returns: - None - - Raises: - AssertionError - - """ - - def _impl_(a, b, fetch_id, item_id): - item_str = [ - 'CPU', 'ParallelCPU', 'GPU', 'ParallelGPU', 'ParallelGPUNCCL' - ] - flag = numpy.allclose(a, b, rtol=0.1, atol=1e-3) - self.assertTrue(flag, - "The {0} are different in {1}, {2} vs {3}".format( - fetch[fetch_id], item_str[item_id], a, b)) - - for i, items in enumerate(zip(*args)): - self.assertGreater(len(items), 0) - for j in range(1, len(items)): - _impl_(items[0], items[j], fetch_id=i, item_id=j) - - -class ParallelOpTest(BaseParallelForTest): - @staticmethod - def __network__(): - x = fluid.layers.data(shape=[784], dtype='float32', name='img') - x = yield x - hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') - hidden = fluid.layers.batch_norm(input=hidden) - loss = fluid.layers.mean(hidden) - yield loss - - def test_simple_fc(self): - self.run_test( - callback=self.__network__, - feed={ - 'img': numpy.random.random(size=(51, 784)).astype('float32') - }, - fetch=['fc1.w@GRAD']) - - def test_fc_with_tiny_data(self): - self.run_test( - callback=self.__network__, - feed={'img': numpy.random.random(size=(1, 784)).astype('float32')}, - fetch=['fc1.w@GRAD']) - - -class ParallelOpTestMultipleInput(BaseParallelForTest): - @staticmethod - def __network__(): - x = fluid.layers.data( - shape=[784], dtype='float32', name='img1', stop_gradient=False) - y = fluid.layers.data( - shape=[784], dtype='float32', name='img2', stop_gradient=False) - yield [x, y] - x = x + y - hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') - hidden2 = fluid.layers.fc(input=hidden1, size=200, param_attr='fc2.w') - hidden3 = fluid.layers.fc(input=hidden2, size=200, param_attr='fc3.w') - loss = fluid.layers.mean(hidden3) - yield loss - - def test_simple_fc(self): - self.run_test( - callback=self.__network__, - feed={ - 'img1': numpy.random.random(size=(51, 784)).astype('float32'), - 'img2': numpy.random.random(size=(51, 784)).astype('float32') - }, - fetch=['fc1.w@GRAD', 'fc2.w@GRAD', 'fc3.w@GRAD']) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 95aafec053..d10ea4e472 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -35,11 +35,10 @@ dtype_to_size = { } SUB_BLOCK_OPS = [ - "while", "while_grad", "parallel_do", "parallel_do_grad", - "conditional_block", "conditional_block_grad" + "while", "while_grad", "conditional_block", "conditional_block_grad" ] -SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"), +SUB_BLOCK_PAIR = [("while", "while_grad"), ("conditional_block", "conditional_block_grad")] PRINT_LOG = False From 36da940bc1ec69f1bdcb1d83c473136dc070fd87 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 13 Dec 2018 17:14:53 +0800 Subject: [PATCH 297/719] clean more test=develop --- python/paddle/fluid/layers/control_flow.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 21454370dd..9d98e8333b 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -235,8 +235,7 @@ class BlockGuardWithCompletion(BlockGuard): def __init__(self, rnn): if not isinstance(rnn, StaticRNN): - raise TypeError( - "BlockGuardWithCompletion takes a StaticRNN or ParallelDo") + raise TypeError("BlockGuardWithCompletion takes a StaticRNN") super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program) self.rnn = rnn From fc6ec6bd1425b01a130cefe7411422e8eb62a95d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 13 Dec 2018 17:43:53 +0800 Subject: [PATCH 298/719] add sparse mode adam --- paddle/fluid/operators/optimizers/adam_op.cc | 5 +++ paddle/fluid/operators/optimizers/adam_op.h | 41 +++++++++++++------ python/paddle/fluid/optimizer.py | 7 +++- .../fluid/tests/unittests/test_adam_op.py | 20 +++++---- 4 files changed, 51 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 5710cda39a..b2c2e5c325 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -110,6 +110,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { "(float, default 1.0e-8) " "Constant for numerical stability") .SetDefault(1.0e-8f); + AddAttr( + "sparse_mode", + "(bool, default false) " + "only update the parameter that has gradient in sparse update") + .SetDefault(false); AddComment(R"DOC( Adam Optimizer. diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 3455d1ee54..ca5454ef04 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -177,12 +177,13 @@ struct SparseAdamFunctor { const int64_t* rows_; int64_t row_numel_; int64_t row_count_; + bool sparse_mode_; SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, T* mom2_out, const T* lr, const T* grad, const T* param, T* param_out, const int64_t* rows, - int64_t row_numel, int64_t row_count) + int64_t row_numel, int64_t row_count, bool sparse_mode) : beta1_(beta1), beta2_(beta2), epsilon_(epsilon), @@ -198,13 +199,10 @@ struct SparseAdamFunctor { param_out_(param_out), rows_(rows), row_numel_(row_numel), - row_count_(row_count) {} - - inline HOSTDEVICE void operator()(size_t i) const { - auto row_idx = - math::BinarySearch(rows_, row_count_, i / row_numel_); - T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; + row_count_(row_count), + sparse_mode_(sparse_mode) {} + inline HOSTDEVICE void sparse_update(size_t i, T g) const { // The following code is the same as dense T mom1 = moment1_[i]; T mom2 = moment2_[i]; @@ -225,6 +223,13 @@ struct SparseAdamFunctor { moment2_out_[i] = mom2; param_out_[i] = p; } + + inline HOSTDEVICE void operator()(size_t i) const { + auto row_idx = + math::BinarySearch(rows_, row_count_, i / row_numel_); + T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; + sparse_update(i, g); + } }; template @@ -240,6 +245,7 @@ class AdamOpKernel : public framework::OpKernel { using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; + bool sparse_mode = ctx.Attr("sparse_mode"); T beta1 = static_cast(ctx.Attr("beta1")); T beta2 = static_cast(ctx.Attr("beta2")); T epsilon = static_cast(ctx.Attr("epsilon")); @@ -351,11 +357,22 @@ class AdamOpKernel : public framework::OpKernel { mom2_out.template mutable_data(ctx.GetPlace()), lr.template data(), grad_data, param.template data(), param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, - grad_merge.rows().size()); - platform::ForRange for_range( - static_cast(ctx.device_context()), - param.numel()); - for_range(functor); + grad_merge.rows().size(), sparse_mode); + if (sparse_mode) { + size_t row_count = grad_merge.rows().size(); + for (size_t row_index = 0; row_index < row_count; ++row_index) { + for (size_t offset = 0; offset < row_numel; ++offset) { + size_t i = rows[row_index] * row_numel + offset; + T g = grad_data[row_index * row_numel + offset]; + functor.sparse_update(i, g); + } + } + } else { + platform::ForRange for_range( + static_cast(ctx.device_context()), + param.numel()); + for_range(functor); + } } else { PADDLE_THROW("Variable type not supported by adam_op"); } diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index da92826d41..9c7482bc40 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -663,7 +663,8 @@ class AdamOptimizer(Optimizer): beta2=0.999, epsilon=1e-8, regularization=None, - name=None): + name=None, + sparse_mode=False): assert learning_rate is not None assert beta1 is not None assert beta2 is not None @@ -676,6 +677,7 @@ class AdamOptimizer(Optimizer): self._beta1 = beta1 self._beta2 = beta2 self._epsilon = epsilon + self._sparse_mode = sparse_mode def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) @@ -729,7 +731,8 @@ class AdamOptimizer(Optimizer): attrs={ "beta1": self._beta1, "beta2": self._beta2, - "epsilon": self._epsilon + "epsilon": self._epsilon, + "sparse_mode": self._sparse_mode }) return adam_op diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 5318d2f976..da91875a14 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -194,7 +194,8 @@ def adam_step(inputs, attributes): return param_out, moment1_out, moment2_out -def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): +def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad, + sparse_mode): ''' Simulate one step of the adam optimizer :param inputs: dict of inputs @@ -230,7 +231,7 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): class TestSparseAdamOp(unittest.TestCase): - def setup(self, scope, place): + def setup(self, scope, place, sparse_mode): beta1 = 0.78 beta2 = 0.836 epsilon = 1e-4 @@ -262,19 +263,21 @@ class TestSparseAdamOp(unittest.TestCase): self.sparse_inputs = ["Grad"] - param_out, mom1, mom2 = adam_step_sparse( - self.dense_inputs, self.attrs, height, rows, row_numel, np_array) + param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs, + height, rows, row_numel, + np_array, sparse_mode) self.outputs = { "ParamOut": param_out, "Moment1Out": mom1, "Moment2Out": mom2 } - def check_with_place(self, place): + def check_with_place(self, place, sparse_mode): scope = core.Scope() - self.setup(scope, place) + self.setup(scope, place, sparse_mode) op_args = dict() + op_args['sparse_mode'] = sparse_mode for key, np_array in self.dense_inputs.items(): var = scope.var(key).get_tensor() var.set(np_array, place) @@ -305,12 +308,13 @@ class TestSparseAdamOp(unittest.TestCase): 0.00001) j += 1 - def test_sparse_sgd(self): + def test_sparse_adam(self): places = [core.CPUPlace()] if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: - self.check_with_place(place) + for sparse_mode in (True, False): + self.check_with_place(place, sparse_mode) if __name__ == "__main__": From f0df62f136396794556a121344a719e4c6fb62ef Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 13 Dec 2018 09:44:40 +0000 Subject: [PATCH 299/719] add more unittest case test=develop --- paddle/fluid/operators/py_func_op.cc | 33 +++++++++++------- paddle/fluid/pybind/pybind.cc | 2 +- python/paddle/fluid/layers/nn.py | 34 +++++++++++++------ .../fluid/tests/unittests/test_py_func_op.py | 17 +++++++++- 4 files changed, 60 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 5d1aa7d7e6..1bee3d9351 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -35,6 +35,9 @@ size_t AppendPythonCallableObjectAndReturnId(const py::object &py_obj) { return g_py_callables.size() - 1; } +// Return py::object* instead of py::object +// Returning py::object would cause reference count increasing +// but without GIL, reference count in Python may not be safe static py::object *GetPythonCallableObject(size_t i) { PADDLE_ENFORCE_LT(i, g_py_callables.size(), "Invalid python callable id"); return &g_py_callables[i]; @@ -47,7 +50,7 @@ static std::string PythonObjectToString(const py::object &py_callable) { static void CallPythonFunc(py::object *callable, const std::vector &ins, - std::vector *out) { + std::vector *outs) { py::gil_scoped_acquire guard; py::tuple in_args(ins.size()); for (size_t i = 0; i < ins.size(); ++i) { @@ -57,8 +60,8 @@ static void CallPythonFunc(py::object *callable, auto ret = (*callable)(*in_args); auto ret_tuple = py::cast(ret); size_t ret_num = py::len(ret_tuple); - size_t out_num = out->size(); - if (ret_num != out_num) { + size_t out_num = outs->size(); + if (UNLIKELY(ret_num != out_num)) { // Python function has no return values or returns None // In this case, ret_num = 1 && ret[0] == None && out_num should be 0 // Otherwise, ret_num must be equal to out_num @@ -69,17 +72,18 @@ static void CallPythonFunc(py::object *callable, } for (size_t i = 0; i < out_num; ++i) { - if ((*out)[i] == nullptr) { + auto *out = (*outs)[i]; + if (out == nullptr) { continue; } try { - auto *out_tensor = py::cast(ret_tuple[i]); - PADDLE_ENFORCE_NOT_NULL(out_tensor, + auto *py_out_tensor = py::cast(ret_tuple[i]); + PADDLE_ENFORCE_NOT_NULL(py_out_tensor, "Output tensor %d should not be nullptr", i); - (*out)[i]->set_lod(out_tensor->lod()); - (*out)[i]->ShareDataWith(*out_tensor); + out->set_lod(py_out_tensor->lod()); + out->ShareDataWith(*py_out_tensor); } catch (py::cast_error &) { - PADDLE_THROW("Output %d is not LoDTensor", i); + PADDLE_THROW("The %d-th output must be LoDTensor", i); } } } @@ -94,6 +98,10 @@ class PyFuncOpShapeInference : public framework::InferShapeBase { PADDLE_ENFORCE_GE(ctx->Attrs().Get(kForwardPythonCallableId), 0, "Function id cannot be less than 0"); + // Transverse all outputs + // If name of any output ends with @GRAD, + // set its shape, dtype, lod_level, type to be the same as + // the correponding forward variable auto *op = boost::get(ctx->GetOp()); auto *block = op->Block(); const std::string kGradVarSuffix = framework::kGradVarSuffix; @@ -115,7 +123,7 @@ class PyFuncOpShapeInference : public framework::InferShapeBase { auto *in_var_desc = block->FindVarRecursive(fwd_var_name); PADDLE_ENFORCE_NOT_NULL(in_var_desc, "Forward variable %s not found", fwd_var_name); - VLOG(10) << "Infer shape of Out(" << out_name << ") as Input(" + VLOG(10) << "Infer shape of Output(" << out_name << ") as Input(" << in_var_desc->Name() << ")"; out_var_desc->SetShape(in_var_desc->GetShape()); out_var_desc->SetDataType(in_var_desc->GetDataType()); @@ -135,7 +143,7 @@ class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker { "Index of registered forward Python function.") .SetDefault(0); AddAttr(kBackwardPythonCallableId, - "Index of registered backward Python function") + "Index of registered backward Python function.") .SetDefault(-1); AddAttr>(kPyFuncBackwardSkipVars, "Unused forward in/out in backward op") @@ -170,8 +178,7 @@ class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase { auto fwd_outs = Output("Out"); // For memory reused, some inputs/output in forward part may be not needed - // in backward part - // Just skip these vars + // in backward part. Skipping these vars helps to save memory auto &backward_skip_var_list = boost::get>( fwd_attrs.at(kPyFuncBackwardSkipVars)); std::unordered_set backward_skip_var_set( diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 348a073915..208efbea4a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -104,7 +104,7 @@ PYBIND11_MODULE(core, m) { BindException(&m); m.def( - "append_python_callable_object_and_return_id", + "_append_python_callable_object_and_return_id", [](py::object py_obj) -> size_t { return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj); }); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index db7ec9d021..3cd0a2887e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9137,8 +9137,13 @@ class PyFuncRegistry(object): self._func = func # find named args using reflection - self._named_args = inspect.getargspec(self._func)[0] - self._id = core.append_python_callable_object_and_return_id(self) + args = inspect.getargspec(self._func) + if len(args[0]) == 0 and args[1] is None and args[2] is None: + # Function with no inputs + self._named_args = None + else: + self._named_args = args[0] + self._id = core._append_python_callable_object_and_return_id(self) ''' Why record self here? @@ -9168,13 +9173,16 @@ class PyFuncRegistry(object): return self._id def __call__(self, *args): - kwargs = dict() - idx = 0 - for arg in self._named_args: - kwargs[arg] = args[idx] - idx += 1 + if self._named_args is None: + func_ret = self._func() + else: + kwargs = dict() + idx = 0 + for arg in self._named_args: + kwargs[arg] = args[idx] + idx += 1 + func_ret = self._func(*args[idx:], **kwargs) - func_ret = self._func(*args[idx:], **kwargs) if not isinstance(func_ret, (list, tuple)): func_ret = (func_ret, ) @@ -9207,14 +9215,18 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): User should set the right data type and shape of :code:`out` before calling this function. However, data types and shapes of gradients of - :code:`out` and :code:`x` would be infered automatically. + :code:`out` and :code:`x` would be inferred automatically. - The orders of inputs of :code:`backward_func` would be: forward input - :code:`x`, forward output :code:`out` and backward input gradient of + Input orders of :code:`backward_func` would be: forward inputs + :code:`x`, forward outputs :code:`out` and backward input gradients of :code:`out`. If some variables of :code:`out` have no gradient, the input tensor would be None in Python side. If some variables of :code:`in` have no gradient, users should return None. + This function can also be used to debug the running network. User can + add a :code:`py_func` operator without output, and print input + :code:`x` inside :code:`func`. + Args: func (callable): forward Python function. x (Variable|list(Variable)|tuple(Variable)): inputs of :code:`func`. diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 491bbc2190..943ad3ed22 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -25,6 +25,14 @@ if fluid.core.is_compiled_with_cuda(): os.environ['CPU_NUM'] = str(dev_cnt) +def dummy_func_with_no_input(): + return float(1.0) + + +def dummy_func_with_no_output(x): + pass + + def tanh(x): return np.tanh(x) @@ -86,13 +94,20 @@ def simple_fc_net(img, label, use_py_func_op): else: loss = fluid.default_main_program().current_block().create_var( name='loss', dtype='float32', shape=[-1, 1]) - fluid.layers.py_func( + loss = fluid.layers.py_func( func=cross_entropy, x=[prediction, label], out=loss, backward_func=cross_entropy_grad, skip_vars_in_backward_input=loss) + dummy_var = fluid.default_main_program().current_block().create_var( + name='test_tmp_var', dtype='float32', shape=[1]) + fluid.layers.py_func( + func=dummy_func_with_no_input, x=None, out=dummy_var) + + fluid.layers.py_func(func=dummy_func_with_no_output, x=loss, out=None) + loss = fluid.layers.mean(loss) return loss From ad6ae0b071041c1f69c66c7c173733bfe7cb2752 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 13 Dec 2018 18:39:46 +0800 Subject: [PATCH 300/719] 1. Add SpinLock 2. Seperate the lock of kids and vars in Scope test=develop --- CMakeLists.txt | 1 + cmake/external/robin_map.cmake | 31 +++++++ .../framework/details/execution_strategy.h | 2 +- .../scope_buffered_ssa_graph_executor.cc | 9 +- paddle/fluid/framework/operator.cc | 6 +- paddle/fluid/framework/rw_lock.h | 91 +++++-------------- paddle/fluid/framework/scope.cc | 58 ++++++------ paddle/fluid/framework/scope.h | 15 ++- paddle/fluid/framework/spin_lock.h | 71 +++++++++++++++ paddle/fluid/operators/optimizers/adam_op.h | 17 ---- paddle/fluid/pybind/pybind.cc | 2 +- python/paddle/fluid/optimizer.py | 43 +++++---- 12 files changed, 201 insertions(+), 145 deletions(-) create mode 100644 cmake/external/robin_map.cmake create mode 100644 paddle/fluid/framework/spin_lock.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e59aca2d9..2abbcef41a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,6 +215,7 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream +include(external/robin_map) # download tsl::robin_map if (NOT WIN32) # there is no official support of warpctc, nccl, cupti in windows diff --git a/cmake/external/robin_map.cmake b/cmake/external/robin_map.cmake new file mode 100644 index 0000000000..ddaf59536c --- /dev/null +++ b/cmake/external/robin_map.cmake @@ -0,0 +1,31 @@ +include(ExternalProject) + +set(ROBIN_MAP_SOURCE_DIR ${THIRD_PARTY_PATH}/robin_map) +set(ROBIN_MAP_INCLUDE_DIR ${ROBIN_MAP_SOURCE_DIR}/src/extern_robin_map/include) + +include_directories(${ROBIN_MAP_INCLUDE_DIR}) + +ExternalProject_Add( + extern_robin_map + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/Tessil/robin-map.git" + GIT_TAG "v0.5.0" + PREFIX ${ROBIN_MAP_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/robin_map_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(robin_map STATIC ${dummyfile}) +else() + add_library(robin_map INTERFACE) +endif() + +add_dependencies(robin_map extern_robin_map) + +LIST(APPEND externl_project_dependencies robin_map) diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 15c496130c..37b07e5736 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,7 +25,7 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; - size_t num_iteration_per_drop_scope_{100}; + size_t num_iteration_per_drop_scope_{1}; ExecutorType type_{kDefault}; bool dry_run_{false}; }; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 499246a985..9ded0266a9 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -76,9 +76,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( : nullptr; #endif - if (!fetch_tensors.empty() || - drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { - drop_scope_counter_ = 0; + if (!fetch_tensors.empty()) { // Wait All computational streams for (auto p : places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); @@ -91,12 +89,17 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } #endif } + } + + if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { + drop_scope_counter_ = 0; for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } } + if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c6f3254e9f..58e5926f54 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -163,11 +163,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { } bool OperatorBase::HasInputs(const std::string& name) const { - if (inputs_.find(name) != inputs_.end()) { - return true; - } else { - return false; - } + return inputs_.find(name) != inputs_.end(); } std::string OperatorBase::Input(const std::string& name) const { diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index dd918fcdfa..75e6bef9bf 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -31,17 +31,17 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } - void RDLock() { + inline void RDLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed"); } - void WRLock() { + inline void WRLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed"); } - void UNLock() { + inline void UNLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); } @@ -54,86 +54,43 @@ struct RWLock { // In windows, rw_lock seems like a hack. Use empty object and do nothing. struct RWLock { // FIXME(minqiyang): use mutex here to do fake lock - void RDLock() { mutex_.lock(); } + inline void RDLock() { mutex_.lock(); } - void WRLock() { mutex_.lock(); } + inline void WRLock() { mutex_.lock(); } - void UNLock() { mutex_.unlock(); } + inline void UNLock() { mutex_.unlock(); } private: std::mutex mutex_; }; #endif -class RWLockGuard { +class AutoWRLock { public: - enum Status { kUnLock, kWRLock, kRDLock }; - - RWLockGuard(RWLock* rw_lock, Status init_status) - : lock_(rw_lock), status_(Status::kUnLock) { - switch (init_status) { - case Status::kRDLock: { - RDLock(); - break; - } - case Status::kWRLock: { - WRLock(); - break; - } - case Status::kUnLock: { - break; - } - } - } + explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - void WRLock() { - switch (status_) { - case Status::kUnLock: { - lock_->WRLock(); - status_ = Status::kWRLock; - break; - } - case Status::kWRLock: { - break; - } - case Status::kRDLock: { - PADDLE_THROW( - "Please unlock read lock first before invoking write lock."); - break; - } - } - } + inline void Lock() { lock_->WRLock(); } - void RDLock() { - switch (status_) { - case Status::kUnLock: { - lock_->RDLock(); - status_ = Status::kRDLock; - break; - } - case Status::kRDLock: { - break; - } - case Status::kWRLock: { - PADDLE_THROW( - "Please unlock write lock first before invoking read lock."); - break; - } - } - } + inline void UnLock() { lock_->UNLock(); } - void UnLock() { - if (status_ != Status::kUnLock) { - lock_->UNLock(); - status_ = Status::kUnLock; - } - } + ~AutoWRLock() { UnLock(); } + + private: + RWLock* lock_; +}; + +class AutoRDLock { + public: + explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + + inline void Lock() { lock_->RDLock(); } + + inline void UnLock() { lock_->UNLock(); } - ~RWLockGuard() { UnLock(); } + ~AutoRDLock() { UnLock(); } private: RWLock* lock_; - Status status_; }; } // namespace framework diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 190a057d9e..f05208c5ec 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include #include #include "glog/logging.h" -#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(benchmark, false, @@ -43,13 +42,15 @@ DEFINE_double( // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. #ifdef PADDLE_ON_INFERENCE -#define SCOPE_READER_LOCK -#define SCOPE_WRITER_LOCK +#define SCOPE_KIDS_READER_LOCK +#define SCOPE_KIDS_WRITER_LOCK +#define SCOPE_VARS_READER_LOCK +#define SCOPE_VARS_WRITER_LOCK #else -// TODO(minqiyang): use rwlock in all platforms, now rwlock is a fake one -// in _WIN32 platform -#define SCOPE_READER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kRDLock); -#define SCOPE_WRITER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kWRLock); +#define SCOPE_KIDS_READER_LOCK AutoRDLock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK AutoWRLock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK AutoRDLock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock(&vars_lock_); #endif namespace paddle { @@ -65,64 +66,69 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - SCOPE_WRITER_LOCK - kids_.push_back(new Scope(this)); - return *kids_.back(); + Scope* child = new Scope(this); + { + SCOPE_KIDS_WRITER_LOCK + kids_.push_back(child); + } + return *child; } Variable* Scope::Var(const std::string& name) { - SCOPE_WRITER_LOCK + SCOPE_VARS_WRITER_LOCK return VarInternal(name); } Variable* Scope::Var(std::string* name) { - SCOPE_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; } + SCOPE_VARS_WRITER_LOCK return VarInternal(new_name); } Variable* Scope::FindVar(const std::string& name) const { - SCOPE_READER_LOCK + SCOPE_VARS_READER_LOCK return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - SCOPE_READER_LOCK + SCOPE_VARS_READER_LOCK return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - SCOPE_READER_LOCK + SCOPE_VARS_READER_LOCK return FindScopeInternal(var); } void Scope::DropKids() { - SCOPE_WRITER_LOCK + SCOPE_KIDS_WRITER_LOCK for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - SCOPE_READER_LOCK + SCOPE_KIDS_READER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - SCOPE_READER_LOCK std::vector known_vars; - known_vars.reserve(this->vars_.size()); - for (auto& p : vars_) { - known_vars.emplace_back(p.first); + { + SCOPE_VARS_READER_LOCK + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); + } } return known_vars; } void Scope::DeleteScope(Scope* scope) const { - SCOPE_WRITER_LOCK + SCOPE_KIDS_WRITER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", this, scope); @@ -136,8 +142,8 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - SCOPE_WRITER_LOCK std::set var_set(var_names.begin(), var_names.end()); + SCOPE_VARS_WRITER_LOCK for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { it = vars_.erase(it); @@ -149,12 +155,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - SCOPE_WRITER_LOCK + SCOPE_VARS_WRITER_LOCK RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - SCOPE_WRITER_LOCK + SCOPE_VARS_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; @@ -188,7 +194,7 @@ void Scope::RenameInternal(const std::string& origin_name, auto new_it = vars_.find(new_name); PADDLE_ENFORCE(new_it == vars_.end(), "The variable with name %s is already in the scope", new_name); - vars_[new_name].reset(origin_it->second.release()); + vars_[new_name].reset(origin_it.value().release()); vars_.erase(origin_it); } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index c140212c3e..78ad8be500 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -14,11 +14,15 @@ limitations under the License. */ #pragma once +#include #include +#include #include -#include +#include #include +#include // NOLINT + #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -94,7 +98,11 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable std::unordered_map> vars_; + mutable tsl::robin_map< + std::string, std::unique_ptr, std::hash, + std::equal_to, + std::allocator>>, true> + vars_; private: // Call Scope::NewScope for a sub-scope. @@ -123,7 +131,8 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); private: - mutable RWLock rw_lock_; + mutable RWLock kids_lock_; + mutable RWLock vars_lock_; }; // Generate some debug string about the inherience structure of scope, quite diff --git a/paddle/fluid/framework/spin_lock.h b/paddle/fluid/framework/spin_lock.h new file mode 100644 index 0000000000..11a763d655 --- /dev/null +++ b/paddle/fluid/framework/spin_lock.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if !defined(_WIN32) +#include +#else +#include // NOLINT +#endif // !_WIN32 + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +#if !defined(_WIN32) +struct SpinLock { + SpinLock() { pthread_spin_init(&lock_, PTHREAD_PROCESS_PRIVATE); } + + ~SpinLock() { pthread_spin_destroy(&lock_); } + + void Lock() { + PADDLE_ENFORCE_EQ(pthread_spin_lock(&lock_), 0, "acquire spin lock failed"); + } + + void Unlock() { + PADDLE_ENFORCE_EQ(pthread_spin_unlock(&lock_), 0, + "release spin lock failed"); + } + + private: + pthread_spinlock_t lock_; +}; +#else +// FIXME(minqiyang): use mutex here to do fake spin lock +struct SpinLock { + void Lock() { mutex_.lock(); } + + void Unlock() { mutex_.lock(); } + + private: + std::mutex mutex_; +}; +#endif + +class AutoSpinLock { + public: + explicit SpinLockGuard(SpinLock* spin_lock) : lock_(spin_lock) { + lock_->Lock(); + } + + ~SpinLockGuard() { lock_->Unlock(); } + + private: + SpinLock* lock_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 2205f473f2..3455d1ee54 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -292,23 +292,6 @@ class AdamOpKernel : public framework::OpKernel { static_cast(ctx.device_context()), param.numel()); for_range(functor); - - auto& dev = - *ctx.template device_context().eigen_device(); - - const LoDTensor* beta1_pow_ptr = ctx.Input("Beta1Pow"); - auto eigen_in_beta1_pow = - framework::EigenVector::Flatten(*beta1_pow_ptr); - auto eigen_out_beta1_pow = framework::EigenVector::Flatten( - *(const_cast(beta1_pow_ptr))); - eigen_out_beta1_pow.device(dev) = beta1 * eigen_in_beta1_pow; - - const LoDTensor* beta2_pow_ptr = ctx.Input("Beta2Pow"); - auto eigen_in_beta2_pow = - framework::EigenVector::Flatten(*beta2_pow_ptr); - auto eigen_out_beta2_pow = framework::EigenVector::Flatten( - *(const_cast(beta2_pow_ptr))); - eigen_out_beta2_pow.device(dev) = beta2 * eigen_in_beta2_pow; } } else if (grad_var->IsType()) { auto& grad = diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 58ef3da0b2..f831f2313e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -765,7 +765,7 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is INT, num_iteration_per_drop_scope indicates how many iterations to clean up the temp variables which is generated during execution. It may make the execution faster, - because the temp variable's shape maybe the same between two iterations. Default 100. + because the temp variable's shape maybe the same between two iterations. Default 1. NOTES: 1. If you fetch data when calling the 'run', the ParallelExecutor diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 1930ac106b..da92826d41 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -477,7 +477,7 @@ class LarsMomentumOptimizer(Optimizer): regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. name: A optional name prefix. - + Examples: .. code-block:: python @@ -739,27 +739,26 @@ class AdamOptimizer(Optimizer): """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - # for param, grad in param_and_grads: - - # if grad is None: - # continue - # with param.block.program._optimized_guard( - # [param, grad]), name_scope("optimizer"): - # beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - # param) - # beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - # param) - # main_block.append_op( - # type="scale", - # inputs={"X": beta1_pow_acc}, - # outputs={"Out": beta1_pow_acc}, - # attrs={"scale": self._beta1}) - - # main_block.append_op( - # type="scale", - # inputs={"X": beta2_pow_acc}, - # outputs={"Out": beta2_pow_acc}, - # attrs={"scale": self._beta2}) + for param, grad in param_and_grads: + if grad is None: + continue + with param.block.program._optimized_guard( + [param, grad]), name_scope("optimizer"): + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param) + main_block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}) + + main_block.append_op( + type="scale", + inputs={"X": beta2_pow_acc}, + outputs={"Out": beta2_pow_acc}, + attrs={"scale": self._beta2}) class AdamaxOptimizer(Optimizer): From a81495d6f4a71980b51cc3099f8cd76885cdcb13 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 13 Dec 2018 18:45:20 +0800 Subject: [PATCH 301/719] Fix code --- paddle/fluid/framework/scope.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index f05208c5ec..d2856a07a1 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include "glog/logging.h" +#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(benchmark, false, @@ -47,10 +48,10 @@ DEFINE_double( #define SCOPE_VARS_READER_LOCK #define SCOPE_VARS_WRITER_LOCK #else -#define SCOPE_KIDS_READER_LOCK AutoRDLock(&kids_lock_); -#define SCOPE_KIDS_WRITER_LOCK AutoWRLock(&kids_lock_); -#define SCOPE_VARS_READER_LOCK AutoRDLock(&vars_lock_); -#define SCOPE_VARS_WRITER_LOCK AutoWRLock(&vars_lock_); +#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); #endif namespace paddle { From 19a798018f82b9eaa31aa8d84f8aa4306bbf8973 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 13 Dec 2018 18:51:28 +0800 Subject: [PATCH 302/719] Remove dup cmake test=develop --- CMakeLists.txt | 6 ------ 1 file changed, 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bf724e8aa9..1b2e0ecf6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,12 +81,6 @@ option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) -if (WITH_PROFILER) - find_package(Gperftools REQUIRED) - include_directories(${GPERFTOOLS_INCLUDE_DIR}) - add_definitions(-DWITH_GPERFTOOLS) -endif() - # PY_VERSION if(NOT PY_VERSION) set(PY_VERSION 2.7) From d839bd0dd4ffecaa061aed32684a1a0b09f28d30 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 19:14:35 +0800 Subject: [PATCH 303/719] simple commit --- paddle/fluid/framework/async_executor.h | 52 +++++++++++-------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 1264212641..a82e941559 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -17,13 +17,13 @@ limitations under the License. */ #include #include #include -#include // NOLINT +#include // NOLINT +#include // local_random_engine #include #include #include // NOLINT #include #include -#include // local_random_engine #include "paddle/fluid/framework/data_feed.pb.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor_thread_worker.h" @@ -34,24 +34,23 @@ namespace paddle { namespace framework { inline double current_realtime() { - struct timespec tp; - clock_gettime(CLOCK_REALTIME, &tp); - return tp.tv_sec + tp.tv_nsec * 1e-9; + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + return tp.tv_sec + tp.tv_nsec * 1e-9; } inline std::default_random_engine& local_random_engine() { - struct engine_wrapper_t { - std::default_random_engine engine; - engine_wrapper_t() { - static std::atomic x(0); - std::seed_seq sseq = {x++, x++, x++, - static_cast( - current_realtime() * 1000)}; - engine.seed(sseq); - } - }; - thread_local engine_wrapper_t r; - return r.engine; + struct engine_wrapper_t { + std::default_random_engine engine; + engine_wrapper_t() { + static std::atomic x(0); + std::seed_seq sseq = {x++, x++, x++, + static_cast(current_realtime() * 1000)}; + engine.seed(sseq); + } + }; + thread_local engine_wrapper_t r; + return r.engine; } class AsyncExecutor { @@ -63,14 +62,12 @@ class AsyncExecutor { const std::vector& filelist, const int thread_num, const std::vector& fetch_names, - const std::string& mode, - const bool debug = false); + const std::string& mode, const bool debug = false); #ifdef PADDLE_WITH_PSLIB void InitServer(const std::string& dist_desc, int index); - void InitWorker( - const std::string& dist_desc, - const std::vector& host_sign_list, - int node_num, int index); + void InitWorker(const std::string& dist_desc, + const std::vector& host_sign_list, int node_num, + int index); uint64_t StartServer(); void StopServer(); void GatherServers(const std::vector& host_sign_list, int node_num); @@ -92,19 +89,16 @@ class AsyncExecutor { public: #ifdef PADDLE_WITH_PSLIB - std::shared_ptr _pslib_ptr; - std::shared_ptr _pull_dense_thread; + std::shared_ptr _pslib_ptr; + std::shared_ptr _pull_dense_thread; AsyncWorkerParamConfig _param_config; #endif Scope* root_scope_; platform::Place place_; - + private: int actual_thread_num; - }; - - } // namespace framework } // namespace paddle From f6c30863295de7d3a989c21d7d8e1427c888c301 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 19:44:46 +0800 Subject: [PATCH 304/719] add copy rigth checker to ps_pb2.py --- python/paddle/fluid/distributed/ps_pb2.py | 3490 +++++++++++++-------- 1 file changed, 2140 insertions(+), 1350 deletions(-) diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py index 978b18d0d5..0d226c4d59 100644 --- a/python/paddle/fluid/distributed/ps_pb2.py +++ b/python/paddle/fluid/distributed/ps_pb2.py @@ -1,8 +1,20 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # Generated by the protocol buffer compiler. DO NOT EDIT! # source: ps.proto import sys -_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode('latin1')) from google.protobuf.internal import enum_type_wrapper from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message @@ -13,104 +25,115 @@ from google.protobuf import descriptor_pb2 _sym_db = _symbol_database.Default() - - - DESCRIPTOR = _descriptor.FileDescriptor( - name='ps.proto', - package='paddle', - syntax='proto2', - serialized_pb=_b('\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xce\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01') -) + name='ps.proto', + package='paddle', + syntax='proto2', + serialized_pb=_b( + '\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xce\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01' + )) _sym_db.RegisterFileDescriptor(DESCRIPTOR) _TABLETYPE = _descriptor.EnumDescriptor( - name='TableType', - full_name='paddle.TableType', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='PS_SPARSE_TABLE', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_DENSE_TABLE', index=1, number=1, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=3286, - serialized_end=3338, -) + name='TableType', + full_name='paddle.TableType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='PS_SPARSE_TABLE', index=0, number=0, options=None, type=None), + _descriptor.EnumValueDescriptor( + name='PS_DENSE_TABLE', index=1, number=1, options=None, type=None), + ], + containing_type=None, + options=None, + serialized_start=3286, + serialized_end=3338, ) _sym_db.RegisterEnumDescriptor(_TABLETYPE) TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE) _PSCMDID = _descriptor.EnumDescriptor( - name='PsCmdID', - full_name='paddle.PsCmdID', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='PS_PULL_DENSE_TABLE', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_PUSH_DENSE_TABLE', index=1, number=1, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_PULL_SPARSE_TABLE', index=2, number=2, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_PUSH_SPARSE_TABLE', index=3, number=3, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_SHRINK_TABLE', index=4, number=4, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_SAVE_ONE_TABLE', index=5, number=5, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_SAVE_ALL_TABLE', index=6, number=6, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_LOAD_ONE_TABLE', index=7, number=7, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_LOAD_ALL_TABLE', index=8, number=8, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_CLEAR_ONE_TABLE', index=9, number=9, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_CLEAR_ALL_TABLE', index=10, number=10, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_PUSH_DENSE_PARAM', index=11, number=11, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PS_STOP_SERVER', index=12, number=12, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=3341, - serialized_end=3658, -) + name='PsCmdID', + full_name='paddle.PsCmdID', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='PS_PULL_DENSE_TABLE', + index=0, + number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_DENSE_TABLE', + index=1, + number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PULL_SPARSE_TABLE', + index=2, + number=2, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_SPARSE_TABLE', + index=3, + number=3, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SHRINK_TABLE', index=4, number=4, options=None, type=None), + _descriptor.EnumValueDescriptor( + name='PS_SAVE_ONE_TABLE', + index=5, + number=5, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SAVE_ALL_TABLE', + index=6, + number=6, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_LOAD_ONE_TABLE', + index=7, + number=7, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_LOAD_ALL_TABLE', + index=8, + number=8, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_CLEAR_ONE_TABLE', + index=9, + number=9, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_CLEAR_ALL_TABLE', + index=10, + number=10, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_DENSE_PARAM', + index=11, + number=11, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_STOP_SERVER', index=12, number=12, options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=3341, + serialized_end=3658, ) _sym_db.RegisterEnumDescriptor(_PSCMDID) PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID) @@ -130,1377 +153,2144 @@ PS_CLEAR_ALL_TABLE = 10 PS_PUSH_DENSE_PARAM = 11 PS_STOP_SERVER = 12 - _FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( - name='FsApiType', - full_name='paddle.FsClientParameter.FsApiType', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='HDFS', index=0, number=0, - options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='AFS', index=1, number=1, - options=None, - type=None), - ], - containing_type=None, - options=None, - serialized_start=3254, - serialized_end=3284, -) + name='FsApiType', + full_name='paddle.FsClientParameter.FsApiType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='HDFS', index=0, number=0, options=None, type=None), + _descriptor.EnumValueDescriptor( + name='AFS', index=1, number=1, options=None, type=None), + ], + containing_type=None, + options=None, + serialized_start=3254, + serialized_end=3284, ) _sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) - _PSPARAMETER = _descriptor.Descriptor( - name='PSParameter', - full_name='paddle.PSParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='worker_class', full_name='paddle.PSParameter.worker_class', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='server_class', full_name='paddle.PSParameter.server_class', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='instance_class', full_name='paddle.PSParameter.instance_class', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='worker_param', full_name='paddle.PSParameter.worker_param', index=3, - number=101, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='server_param', full_name='paddle.PSParameter.server_param', index=4, - number=102, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='trainer_param', full_name='paddle.PSParameter.trainer_param', index=5, - number=301, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='fs_client_param', full_name='paddle.PSParameter.fs_client_param', index=6, - number=501, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=21, - serialized_end=307, -) - + name='PSParameter', + full_name='paddle.PSParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='worker_class', + full_name='paddle.PSParameter.worker_class', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_class', + full_name='paddle.PSParameter.server_class', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='instance_class', + full_name='paddle.PSParameter.instance_class', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='worker_param', + full_name='paddle.PSParameter.worker_param', + index=3, + number=101, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_param', + full_name='paddle.PSParameter.server_param', + index=4, + number=102, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='trainer_param', + full_name='paddle.PSParameter.trainer_param', + index=5, + number=301, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fs_client_param', + full_name='paddle.PSParameter.fs_client_param', + index=6, + number=501, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=21, + serialized_end=307, ) _WORKERPARAMETER = _descriptor.Descriptor( - name='WorkerParameter', - full_name='paddle.WorkerParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='downpour_worker_param', full_name='paddle.WorkerParameter.downpour_worker_param', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=309, - serialized_end=390, -) - + name='WorkerParameter', + full_name='paddle.WorkerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_worker_param', + full_name='paddle.WorkerParameter.downpour_worker_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=309, + serialized_end=390, ) _SERVERPARAMETER = _descriptor.Descriptor( - name='ServerParameter', - full_name='paddle.ServerParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='downpour_server_param', full_name='paddle.ServerParameter.downpour_server_param', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=392, - serialized_end=473, -) - + name='ServerParameter', + full_name='paddle.ServerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_server_param', + full_name='paddle.ServerParameter.downpour_server_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=392, + serialized_end=473, ) _DOWNPOURWORKERPARAMETER = _descriptor.Descriptor( - name='DownpourWorkerParameter', - full_name='paddle.DownpourWorkerParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='downpour_table_param', full_name='paddle.DownpourWorkerParameter.downpour_table_param', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=475, - serialized_end=554, -) - + name='DownpourWorkerParameter', + full_name='paddle.DownpourWorkerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_table_param', + full_name='paddle.DownpourWorkerParameter.downpour_table_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=475, + serialized_end=554, ) _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( - name='DownpourTrainerParameter', - full_name='paddle.DownpourTrainerParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='dense_table', full_name='paddle.DownpourTrainerParameter.dense_table', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='sparse_table', full_name='paddle.DownpourTrainerParameter.sparse_table', index=1, - number=2, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='push_sparse_per_batch', full_name='paddle.DownpourTrainerParameter.push_sparse_per_batch', index=2, - number=3, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='push_dense_per_batch', full_name='paddle.DownpourTrainerParameter.push_dense_per_batch', index=3, - number=4, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='skip_op', full_name='paddle.DownpourTrainerParameter.skip_op', index=4, - number=5, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=557, - serialized_end=763, -) - + name='DownpourTrainerParameter', + full_name='paddle.DownpourTrainerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='dense_table', + full_name='paddle.DownpourTrainerParameter.dense_table', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='sparse_table', + full_name='paddle.DownpourTrainerParameter.sparse_table', + index=1, + number=2, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='push_sparse_per_batch', + full_name='paddle.DownpourTrainerParameter.push_sparse_per_batch', + index=2, + number=3, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='push_dense_per_batch', + full_name='paddle.DownpourTrainerParameter.push_dense_per_batch', + index=3, + number=4, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='skip_op', + full_name='paddle.DownpourTrainerParameter.skip_op', + index=4, + number=5, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=557, + serialized_end=763, ) _DENSETABLEPARAMETER = _descriptor.Descriptor( - name='DenseTableParameter', - full_name='paddle.DenseTableParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='table_id', full_name='paddle.DenseTableParameter.table_id', index=0, - number=1, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='dense_variable_name', full_name='paddle.DenseTableParameter.dense_variable_name', index=1, - number=2, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='dense_gradient_variable_name', full_name='paddle.DenseTableParameter.dense_gradient_variable_name', index=2, - number=3, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='fea_dim', full_name='paddle.DenseTableParameter.fea_dim', index=3, - number=4, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=765, - serialized_end=888, -) - + name='DenseTableParameter', + full_name='paddle.DenseTableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.DenseTableParameter.table_id', + index=0, + number=1, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_variable_name', + full_name='paddle.DenseTableParameter.dense_variable_name', + index=1, + number=2, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_gradient_variable_name', + full_name='paddle.DenseTableParameter.dense_gradient_variable_name', + index=2, + number=3, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fea_dim', + full_name='paddle.DenseTableParameter.fea_dim', + index=3, + number=4, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=765, + serialized_end=888, ) _SPARSETABLEPARAMETER = _descriptor.Descriptor( - name='SparseTableParameter', - full_name='paddle.SparseTableParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='table_id', full_name='paddle.SparseTableParameter.table_id', index=0, - number=1, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='feature_dim', full_name='paddle.SparseTableParameter.feature_dim', index=1, - number=2, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='slot_key', full_name='paddle.SparseTableParameter.slot_key', index=2, - number=3, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='slot_value', full_name='paddle.SparseTableParameter.slot_value', index=3, - number=4, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='slot_gradient', full_name='paddle.SparseTableParameter.slot_gradient', index=4, - number=5, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=890, - serialized_end=1012, -) - + name='SparseTableParameter', + full_name='paddle.SparseTableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.SparseTableParameter.table_id', + index=0, + number=1, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='feature_dim', + full_name='paddle.SparseTableParameter.feature_dim', + index=1, + number=2, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_key', + full_name='paddle.SparseTableParameter.slot_key', + index=2, + number=3, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_value', + full_name='paddle.SparseTableParameter.slot_value', + index=3, + number=4, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_gradient', + full_name='paddle.SparseTableParameter.slot_gradient', + index=4, + number=5, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=890, + serialized_end=1012, ) _DOWNPOURSERVERPARAMETER = _descriptor.Descriptor( - name='DownpourServerParameter', - full_name='paddle.DownpourServerParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='downpour_table_param', full_name='paddle.DownpourServerParameter.downpour_table_param', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='service_param', full_name='paddle.DownpourServerParameter.service_param', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1015, - serialized_end=1149, -) - + name='DownpourServerParameter', + full_name='paddle.DownpourServerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_table_param', + full_name='paddle.DownpourServerParameter.downpour_table_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='service_param', + full_name='paddle.DownpourServerParameter.service_param', + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1015, + serialized_end=1149, ) _SERVERSERVICEPARAMETER = _descriptor.Descriptor( - name='ServerServiceParameter', - full_name='paddle.ServerServiceParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='server_class', full_name='paddle.ServerServiceParameter.server_class', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("DownpourBrpcPsServer").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='client_class', full_name='paddle.ServerServiceParameter.client_class', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("DownpourBrpcPsClient").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='service_class', full_name='paddle.ServerServiceParameter.service_class', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=_b("DownpourPsService").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='start_server_port', full_name='paddle.ServerServiceParameter.start_server_port', index=3, - number=4, type=13, cpp_type=3, label=1, - has_default_value=True, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='server_thread_num', full_name='paddle.ServerServiceParameter.server_thread_num', index=4, - number=5, type=13, cpp_type=3, label=1, - has_default_value=True, default_value=12, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1152, - serialized_end=1367, -) - + name='ServerServiceParameter', + full_name='paddle.ServerServiceParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='server_class', + full_name='paddle.ServerServiceParameter.server_class', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("DownpourBrpcPsServer").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='client_class', + full_name='paddle.ServerServiceParameter.client_class', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("DownpourBrpcPsClient").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='service_class', + full_name='paddle.ServerServiceParameter.service_class', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("DownpourPsService").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='start_server_port', + full_name='paddle.ServerServiceParameter.start_server_port', + index=3, + number=4, + type=13, + cpp_type=3, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_thread_num', + full_name='paddle.ServerServiceParameter.server_thread_num', + index=4, + number=5, + type=13, + cpp_type=3, + label=1, + has_default_value=True, + default_value=12, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1152, + serialized_end=1367, ) _TABLEPARAMETER = _descriptor.Descriptor( - name='TableParameter', - full_name='paddle.TableParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='table_id', full_name='paddle.TableParameter.table_id', index=0, - number=1, type=4, cpp_type=4, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='table_class', full_name='paddle.TableParameter.table_class', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='shared_num', full_name='paddle.TableParameter.shared_num', index=2, - number=3, type=4, cpp_type=4, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='accessor', full_name='paddle.TableParameter.accessor', index=3, - number=4, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='type', full_name='paddle.TableParameter.type', index=4, - number=5, type=14, cpp_type=8, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='compress_in_save', full_name='paddle.TableParameter.compress_in_save', index=5, - number=6, type=8, cpp_type=7, label=1, - has_default_value=True, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1370, - serialized_end=1561, -) - + name='TableParameter', + full_name='paddle.TableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.TableParameter.table_id', + index=0, + number=1, + type=4, + cpp_type=4, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_class', + full_name='paddle.TableParameter.table_class', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='shared_num', + full_name='paddle.TableParameter.shared_num', + index=2, + number=3, + type=4, + cpp_type=4, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='accessor', + full_name='paddle.TableParameter.accessor', + index=3, + number=4, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='type', + full_name='paddle.TableParameter.type', + index=4, + number=5, + type=14, + cpp_type=8, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='compress_in_save', + full_name='paddle.TableParameter.compress_in_save', + index=5, + number=6, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1370, + serialized_end=1561, ) _TABLEACCESSORPARAMETER = _descriptor.Descriptor( - name='TableAccessorParameter', - full_name='paddle.TableAccessorParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='accessor_class', full_name='paddle.TableAccessorParameter.accessor_class', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='sparse_sgd_param', full_name='paddle.TableAccessorParameter.sparse_sgd_param', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='dense_sgd_param', full_name='paddle.TableAccessorParameter.dense_sgd_param', index=2, - number=3, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='fea_dim', full_name='paddle.TableAccessorParameter.fea_dim', index=3, - number=4, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='embedx_dim', full_name='paddle.TableAccessorParameter.embedx_dim', index=4, - number=5, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='embedx_threshold', full_name='paddle.TableAccessorParameter.embedx_threshold', index=5, - number=6, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='downpour_accessor_param', full_name='paddle.TableAccessorParameter.downpour_accessor_param', index=6, - number=7, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='table_accessor_save_param', full_name='paddle.TableAccessorParameter.table_accessor_save_param', index=7, - number=8, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1564, - serialized_end=1933, -) - + name='TableAccessorParameter', + full_name='paddle.TableAccessorParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='accessor_class', + full_name='paddle.TableAccessorParameter.accessor_class', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='sparse_sgd_param', + full_name='paddle.TableAccessorParameter.sparse_sgd_param', + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_sgd_param', + full_name='paddle.TableAccessorParameter.dense_sgd_param', + index=2, + number=3, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fea_dim', + full_name='paddle.TableAccessorParameter.fea_dim', + index=3, + number=4, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='embedx_dim', + full_name='paddle.TableAccessorParameter.embedx_dim', + index=4, + number=5, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='embedx_threshold', + full_name='paddle.TableAccessorParameter.embedx_threshold', + index=5, + number=6, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='downpour_accessor_param', + full_name='paddle.TableAccessorParameter.downpour_accessor_param', + index=6, + number=7, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_accessor_save_param', + full_name='paddle.TableAccessorParameter.table_accessor_save_param', + index=7, + number=8, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1564, + serialized_end=1933, ) _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( - name='DownpourTableAccessorParameter', - full_name='paddle.DownpourTableAccessorParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='nonclk_coeff', full_name='paddle.DownpourTableAccessorParameter.nonclk_coeff', index=0, - number=1, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='click_coeff', full_name='paddle.DownpourTableAccessorParameter.click_coeff', index=1, - number=2, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='base_threshold', full_name='paddle.DownpourTableAccessorParameter.base_threshold', index=2, - number=3, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='delta_threshold', full_name='paddle.DownpourTableAccessorParameter.delta_threshold', index=3, - number=4, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='delta_keep_days', full_name='paddle.DownpourTableAccessorParameter.delta_keep_days', index=4, - number=5, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='show_click_decay_rate', full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate', index=5, - number=6, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='delete_threshold', full_name='paddle.DownpourTableAccessorParameter.delete_threshold', index=6, - number=7, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1936, - serialized_end=2142, -) - + name='DownpourTableAccessorParameter', + full_name='paddle.DownpourTableAccessorParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='nonclk_coeff', + full_name='paddle.DownpourTableAccessorParameter.nonclk_coeff', + index=0, + number=1, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='click_coeff', + full_name='paddle.DownpourTableAccessorParameter.click_coeff', + index=1, + number=2, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='base_threshold', + full_name='paddle.DownpourTableAccessorParameter.base_threshold', + index=2, + number=3, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delta_threshold', + full_name='paddle.DownpourTableAccessorParameter.delta_threshold', + index=3, + number=4, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delta_keep_days', + full_name='paddle.DownpourTableAccessorParameter.delta_keep_days', + index=4, + number=5, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='show_click_decay_rate', + full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate', + index=5, + number=6, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delete_threshold', + full_name='paddle.DownpourTableAccessorParameter.delete_threshold', + index=6, + number=7, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1936, + serialized_end=2142, ) _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( - name='TableAccessorSaveParameter', - full_name='paddle.TableAccessorSaveParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='param', full_name='paddle.TableAccessorSaveParameter.param', index=0, - number=1, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='converter', full_name='paddle.TableAccessorSaveParameter.converter', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='deconverter', full_name='paddle.TableAccessorSaveParameter.deconverter', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2144, - serialized_end=2227, -) - + name='TableAccessorSaveParameter', + full_name='paddle.TableAccessorSaveParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='param', + full_name='paddle.TableAccessorSaveParameter.param', + index=0, + number=1, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='converter', + full_name='paddle.TableAccessorSaveParameter.converter', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='deconverter', + full_name='paddle.TableAccessorSaveParameter.deconverter', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2144, + serialized_end=2227, ) _PSREQUESTMESSAGE = _descriptor.Descriptor( - name='PsRequestMessage', - full_name='paddle.PsRequestMessage', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='cmd_id', full_name='paddle.PsRequestMessage.cmd_id', index=0, - number=1, type=13, cpp_type=3, label=2, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='table_id', full_name='paddle.PsRequestMessage.table_id', index=1, - number=2, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='params', full_name='paddle.PsRequestMessage.params', index=2, - number=3, type=12, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='client_id', full_name='paddle.PsRequestMessage.client_id', index=3, - number=4, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='data', full_name='paddle.PsRequestMessage.data', index=4, - number=5, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=_b(""), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2229, - serialized_end=2330, -) - + name='PsRequestMessage', + full_name='paddle.PsRequestMessage', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='cmd_id', + full_name='paddle.PsRequestMessage.cmd_id', + index=0, + number=1, + type=13, + cpp_type=3, + label=2, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.PsRequestMessage.table_id', + index=1, + number=2, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='params', + full_name='paddle.PsRequestMessage.params', + index=2, + number=3, + type=12, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='client_id', + full_name='paddle.PsRequestMessage.client_id', + index=3, + number=4, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='data', + full_name='paddle.PsRequestMessage.data', + index=4, + number=5, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b(""), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2229, + serialized_end=2330, ) _SPARSESGDRULEPARAMETER = _descriptor.Descriptor( - name='SparseSGDRuleParameter', - full_name='paddle.SparseSGDRuleParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='learning_rate', full_name='paddle.SparseSGDRuleParameter.learning_rate', index=0, - number=1, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='initial_g2sum', full_name='paddle.SparseSGDRuleParameter.initial_g2sum', index=1, - number=2, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='initial_range', full_name='paddle.SparseSGDRuleParameter.initial_range', index=2, - number=3, type=1, cpp_type=5, label=1, - has_default_value=True, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='weight_bounds', full_name='paddle.SparseSGDRuleParameter.weight_bounds', index=3, - number=4, type=2, cpp_type=6, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2332, - serialized_end=2451, -) - + name='SparseSGDRuleParameter', + full_name='paddle.SparseSGDRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', + full_name='paddle.SparseSGDRuleParameter.learning_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='initial_g2sum', + full_name='paddle.SparseSGDRuleParameter.initial_g2sum', + index=1, + number=2, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='initial_range', + full_name='paddle.SparseSGDRuleParameter.initial_range', + index=2, + number=3, + type=1, + cpp_type=5, + label=1, + has_default_value=True, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='weight_bounds', + full_name='paddle.SparseSGDRuleParameter.weight_bounds', + index=3, + number=4, + type=2, + cpp_type=6, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2332, + serialized_end=2451, ) _DENSESGDRULEPARAMETER = _descriptor.Descriptor( - name='DenseSGDRuleParameter', - full_name='paddle.DenseSGDRuleParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='paddle.DenseSGDRuleParameter.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='adam', full_name='paddle.DenseSGDRuleParameter.adam', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='naive', full_name='paddle.DenseSGDRuleParameter.naive', index=2, - number=3, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='summary', full_name='paddle.DenseSGDRuleParameter.summary', index=3, - number=4, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='moving_average', full_name='paddle.DenseSGDRuleParameter.moving_average', index=4, - number=5, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2454, - serialized_end=2679, -) - + name='DenseSGDRuleParameter', + full_name='paddle.DenseSGDRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', + full_name='paddle.DenseSGDRuleParameter.name', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='adam', + full_name='paddle.DenseSGDRuleParameter.adam', + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='naive', + full_name='paddle.DenseSGDRuleParameter.naive', + index=2, + number=3, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='summary', + full_name='paddle.DenseSGDRuleParameter.summary', + index=3, + number=4, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='moving_average', + full_name='paddle.DenseSGDRuleParameter.moving_average', + index=4, + number=5, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2454, + serialized_end=2679, ) _ADAMSGDPARAMETER = _descriptor.Descriptor( - name='AdamSGDParameter', - full_name='paddle.AdamSGDParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='learning_rate', full_name='paddle.AdamSGDParameter.learning_rate', index=0, - number=1, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='avg_decay_rate', full_name='paddle.AdamSGDParameter.avg_decay_rate', index=1, - number=2, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='ada_decay_rate', full_name='paddle.AdamSGDParameter.ada_decay_rate', index=2, - number=3, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='ada_epsilon', full_name='paddle.AdamSGDParameter.ada_epsilon', index=3, - number=4, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='mom_decay_rate', full_name='paddle.AdamSGDParameter.mom_decay_rate', index=4, - number=5, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2682, - serialized_end=2816, -) - + name='AdamSGDParameter', + full_name='paddle.AdamSGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', + full_name='paddle.AdamSGDParameter.learning_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='avg_decay_rate', + full_name='paddle.AdamSGDParameter.avg_decay_rate', + index=1, + number=2, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ada_decay_rate', + full_name='paddle.AdamSGDParameter.ada_decay_rate', + index=2, + number=3, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ada_epsilon', + full_name='paddle.AdamSGDParameter.ada_epsilon', + index=3, + number=4, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='mom_decay_rate', + full_name='paddle.AdamSGDParameter.mom_decay_rate', + index=4, + number=5, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2682, + serialized_end=2816, ) _NAIVESGDPARAMETER = _descriptor.Descriptor( - name='NaiveSGDParameter', - full_name='paddle.NaiveSGDParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='learning_rate', full_name='paddle.NaiveSGDParameter.learning_rate', index=0, - number=1, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='avg_decay_rate', full_name='paddle.NaiveSGDParameter.avg_decay_rate', index=1, - number=2, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2818, - serialized_end=2884, -) - + name='NaiveSGDParameter', + full_name='paddle.NaiveSGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', + full_name='paddle.NaiveSGDParameter.learning_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='avg_decay_rate', + full_name='paddle.NaiveSGDParameter.avg_decay_rate', + index=1, + number=2, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2818, + serialized_end=2884, ) _SUMMARYSGDPARAMETER = _descriptor.Descriptor( - name='SummarySGDParameter', - full_name='paddle.SummarySGDParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='summary_decay_rate', full_name='paddle.SummarySGDParameter.summary_decay_rate', index=0, - number=1, type=1, cpp_type=5, label=1, - has_default_value=True, default_value=float(0.999999), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2886, - serialized_end=2945, -) - + name='SummarySGDParameter', + full_name='paddle.SummarySGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='summary_decay_rate', + full_name='paddle.SummarySGDParameter.summary_decay_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=True, + default_value=float(0.999999), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2886, + serialized_end=2945, ) _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( - name='MovingAverageRuleParameter', - full_name='paddle.MovingAverageRuleParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='momentum', full_name='paddle.MovingAverageRuleParameter.momentum', index=0, - number=1, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2947, - serialized_end=2993, -) - + name='MovingAverageRuleParameter', + full_name='paddle.MovingAverageRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='momentum', + full_name='paddle.MovingAverageRuleParameter.momentum', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2947, + serialized_end=2993, ) _PSRESPONSEMESSAGE = _descriptor.Descriptor( - name='PsResponseMessage', - full_name='paddle.PsResponseMessage', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='err_code', full_name='paddle.PsResponseMessage.err_code', index=0, - number=1, type=5, cpp_type=1, label=2, - has_default_value=True, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='err_msg', full_name='paddle.PsResponseMessage.err_msg', index=1, - number=2, type=9, cpp_type=9, label=2, - has_default_value=True, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='data', full_name='paddle.PsResponseMessage.data', index=2, - number=3, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=_b(""), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2995, - serialized_end=3068, -) - + name='PsResponseMessage', + full_name='paddle.PsResponseMessage', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='err_code', + full_name='paddle.PsResponseMessage.err_code', + index=0, + number=1, + type=5, + cpp_type=1, + label=2, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='err_msg', + full_name='paddle.PsResponseMessage.err_msg', + index=1, + number=2, + type=9, + cpp_type=9, + label=2, + has_default_value=True, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='data', + full_name='paddle.PsResponseMessage.data', + index=2, + number=3, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b(""), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2995, + serialized_end=3068, ) _FSCLIENTPARAMETER = _descriptor.Descriptor( - name='FsClientParameter', - full_name='paddle.FsClientParameter', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='fs_type', full_name='paddle.FsClientParameter.fs_type', index=0, - number=1, type=14, cpp_type=8, label=1, - has_default_value=True, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='uri', full_name='paddle.FsClientParameter.uri', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='user', full_name='paddle.FsClientParameter.user', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='passwd', full_name='paddle.FsClientParameter.passwd', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='buffer_size', full_name='paddle.FsClientParameter.buffer_size', index=4, - number=5, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='hadoop_bin', full_name='paddle.FsClientParameter.hadoop_bin', index=5, - number=51, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - _descriptor.FieldDescriptor( - name='afs_conf', full_name='paddle.FsClientParameter.afs_conf', index=6, - number=101, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - options=None), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - _FSCLIENTPARAMETER_FSAPITYPE, - ], - options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=3071, - serialized_end=3284, -) + name='FsClientParameter', + full_name='paddle.FsClientParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='fs_type', + full_name='paddle.FsClientParameter.fs_type', + index=0, + number=1, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='uri', + full_name='paddle.FsClientParameter.uri', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='user', + full_name='paddle.FsClientParameter.user', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='passwd', + full_name='paddle.FsClientParameter.passwd', + index=3, + number=4, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='buffer_size', + full_name='paddle.FsClientParameter.buffer_size', + index=4, + number=5, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='hadoop_bin', + full_name='paddle.FsClientParameter.hadoop_bin', + index=5, + number=51, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='afs_conf', + full_name='paddle.FsClientParameter.afs_conf', + index=6, + number=101, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[_FSCLIENTPARAMETER_FSAPITYPE, ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=3071, + serialized_end=3284, ) _PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER _PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER -_PSPARAMETER.fields_by_name['trainer_param'].message_type = _DOWNPOURTRAINERPARAMETER +_PSPARAMETER.fields_by_name[ + 'trainer_param'].message_type = _DOWNPOURTRAINERPARAMETER _PSPARAMETER.fields_by_name['fs_client_param'].message_type = _FSCLIENTPARAMETER -_WORKERPARAMETER.fields_by_name['downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER -_SERVERPARAMETER.fields_by_name['downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER -_DOWNPOURWORKERPARAMETER.fields_by_name['downpour_table_param'].message_type = _TABLEPARAMETER -_DOWNPOURTRAINERPARAMETER.fields_by_name['dense_table'].message_type = _DENSETABLEPARAMETER -_DOWNPOURTRAINERPARAMETER.fields_by_name['sparse_table'].message_type = _SPARSETABLEPARAMETER -_DOWNPOURSERVERPARAMETER.fields_by_name['downpour_table_param'].message_type = _TABLEPARAMETER -_DOWNPOURSERVERPARAMETER.fields_by_name['service_param'].message_type = _SERVERSERVICEPARAMETER -_TABLEPARAMETER.fields_by_name['accessor'].message_type = _TABLEACCESSORPARAMETER +_WORKERPARAMETER.fields_by_name[ + 'downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER +_SERVERPARAMETER.fields_by_name[ + 'downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER +_DOWNPOURWORKERPARAMETER.fields_by_name[ + 'downpour_table_param'].message_type = _TABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name[ + 'dense_table'].message_type = _DENSETABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name[ + 'sparse_table'].message_type = _SPARSETABLEPARAMETER +_DOWNPOURSERVERPARAMETER.fields_by_name[ + 'downpour_table_param'].message_type = _TABLEPARAMETER +_DOWNPOURSERVERPARAMETER.fields_by_name[ + 'service_param'].message_type = _SERVERSERVICEPARAMETER +_TABLEPARAMETER.fields_by_name[ + 'accessor'].message_type = _TABLEACCESSORPARAMETER _TABLEPARAMETER.fields_by_name['type'].enum_type = _TABLETYPE -_TABLEACCESSORPARAMETER.fields_by_name['sparse_sgd_param'].message_type = _SPARSESGDRULEPARAMETER -_TABLEACCESSORPARAMETER.fields_by_name['dense_sgd_param'].message_type = _DENSESGDRULEPARAMETER -_TABLEACCESSORPARAMETER.fields_by_name['downpour_accessor_param'].message_type = _DOWNPOURTABLEACCESSORPARAMETER -_TABLEACCESSORPARAMETER.fields_by_name['table_accessor_save_param'].message_type = _TABLEACCESSORSAVEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'sparse_sgd_param'].message_type = _SPARSESGDRULEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'dense_sgd_param'].message_type = _DENSESGDRULEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'downpour_accessor_param'].message_type = _DOWNPOURTABLEACCESSORPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'table_accessor_save_param'].message_type = _TABLEACCESSORSAVEPARAMETER _DENSESGDRULEPARAMETER.fields_by_name['adam'].message_type = _ADAMSGDPARAMETER _DENSESGDRULEPARAMETER.fields_by_name['naive'].message_type = _NAIVESGDPARAMETER -_DENSESGDRULEPARAMETER.fields_by_name['summary'].message_type = _SUMMARYSGDPARAMETER -_DENSESGDRULEPARAMETER.fields_by_name['moving_average'].message_type = _MOVINGAVERAGERULEPARAMETER -_FSCLIENTPARAMETER.fields_by_name['fs_type'].enum_type = _FSCLIENTPARAMETER_FSAPITYPE +_DENSESGDRULEPARAMETER.fields_by_name[ + 'summary'].message_type = _SUMMARYSGDPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name[ + 'moving_average'].message_type = _MOVINGAVERAGERULEPARAMETER +_FSCLIENTPARAMETER.fields_by_name[ + 'fs_type'].enum_type = _FSCLIENTPARAMETER_FSAPITYPE _FSCLIENTPARAMETER_FSAPITYPE.containing_type = _FSCLIENTPARAMETER DESCRIPTOR.message_types_by_name['PSParameter'] = _PSPARAMETER DESCRIPTOR.message_types_by_name['WorkerParameter'] = _WORKERPARAMETER DESCRIPTOR.message_types_by_name['ServerParameter'] = _SERVERPARAMETER -DESCRIPTOR.message_types_by_name['DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER -DESCRIPTOR.message_types_by_name['DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER DESCRIPTOR.message_types_by_name['DenseTableParameter'] = _DENSETABLEPARAMETER DESCRIPTOR.message_types_by_name['SparseTableParameter'] = _SPARSETABLEPARAMETER -DESCRIPTOR.message_types_by_name['DownpourServerParameter'] = _DOWNPOURSERVERPARAMETER -DESCRIPTOR.message_types_by_name['ServerServiceParameter'] = _SERVERSERVICEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourServerParameter'] = _DOWNPOURSERVERPARAMETER +DESCRIPTOR.message_types_by_name[ + 'ServerServiceParameter'] = _SERVERSERVICEPARAMETER DESCRIPTOR.message_types_by_name['TableParameter'] = _TABLEPARAMETER -DESCRIPTOR.message_types_by_name['TableAccessorParameter'] = _TABLEACCESSORPARAMETER -DESCRIPTOR.message_types_by_name['DownpourTableAccessorParameter'] = _DOWNPOURTABLEACCESSORPARAMETER -DESCRIPTOR.message_types_by_name['TableAccessorSaveParameter'] = _TABLEACCESSORSAVEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'TableAccessorParameter'] = _TABLEACCESSORPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourTableAccessorParameter'] = _DOWNPOURTABLEACCESSORPARAMETER +DESCRIPTOR.message_types_by_name[ + 'TableAccessorSaveParameter'] = _TABLEACCESSORSAVEPARAMETER DESCRIPTOR.message_types_by_name['PsRequestMessage'] = _PSREQUESTMESSAGE -DESCRIPTOR.message_types_by_name['SparseSGDRuleParameter'] = _SPARSESGDRULEPARAMETER -DESCRIPTOR.message_types_by_name['DenseSGDRuleParameter'] = _DENSESGDRULEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'SparseSGDRuleParameter'] = _SPARSESGDRULEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DenseSGDRuleParameter'] = _DENSESGDRULEPARAMETER DESCRIPTOR.message_types_by_name['AdamSGDParameter'] = _ADAMSGDPARAMETER DESCRIPTOR.message_types_by_name['NaiveSGDParameter'] = _NAIVESGDPARAMETER DESCRIPTOR.message_types_by_name['SummarySGDParameter'] = _SUMMARYSGDPARAMETER -DESCRIPTOR.message_types_by_name['MovingAverageRuleParameter'] = _MOVINGAVERAGERULEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'MovingAverageRuleParameter'] = _MOVINGAVERAGERULEPARAMETER DESCRIPTOR.message_types_by_name['PsResponseMessage'] = _PSRESPONSEMESSAGE DESCRIPTOR.message_types_by_name['FsClientParameter'] = _FSCLIENTPARAMETER DESCRIPTOR.enum_types_by_name['TableType'] = _TABLETYPE DESCRIPTOR.enum_types_by_name['PsCmdID'] = _PSCMDID -PSParameter = _reflection.GeneratedProtocolMessageType('PSParameter', (_message.Message,), dict( - DESCRIPTOR = _PSPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.PSParameter) - )) +PSParameter = _reflection.GeneratedProtocolMessageType( + 'PSParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_PSPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PSParameter) + )) _sym_db.RegisterMessage(PSParameter) -WorkerParameter = _reflection.GeneratedProtocolMessageType('WorkerParameter', (_message.Message,), dict( - DESCRIPTOR = _WORKERPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.WorkerParameter) - )) +WorkerParameter = _reflection.GeneratedProtocolMessageType( + 'WorkerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_WORKERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.WorkerParameter) + )) _sym_db.RegisterMessage(WorkerParameter) -ServerParameter = _reflection.GeneratedProtocolMessageType('ServerParameter', (_message.Message,), dict( - DESCRIPTOR = _SERVERPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.ServerParameter) - )) +ServerParameter = _reflection.GeneratedProtocolMessageType( + 'ServerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SERVERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ServerParameter) + )) _sym_db.RegisterMessage(ServerParameter) -DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType('DownpourWorkerParameter', (_message.Message,), dict( - DESCRIPTOR = _DOWNPOURWORKERPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter) - )) +DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourWorkerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURWORKERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter) + )) _sym_db.RegisterMessage(DownpourWorkerParameter) -DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType('DownpourTrainerParameter', (_message.Message,), dict( - DESCRIPTOR = _DOWNPOURTRAINERPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter) - )) +DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourTrainerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURTRAINERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter) + )) _sym_db.RegisterMessage(DownpourTrainerParameter) -DenseTableParameter = _reflection.GeneratedProtocolMessageType('DenseTableParameter', (_message.Message,), dict( - DESCRIPTOR = _DENSETABLEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter) - )) +DenseTableParameter = _reflection.GeneratedProtocolMessageType( + 'DenseTableParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DENSETABLEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter) + )) _sym_db.RegisterMessage(DenseTableParameter) -SparseTableParameter = _reflection.GeneratedProtocolMessageType('SparseTableParameter', (_message.Message,), dict( - DESCRIPTOR = _SPARSETABLEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter) - )) +SparseTableParameter = _reflection.GeneratedProtocolMessageType( + 'SparseTableParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SPARSETABLEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter) + )) _sym_db.RegisterMessage(SparseTableParameter) -DownpourServerParameter = _reflection.GeneratedProtocolMessageType('DownpourServerParameter', (_message.Message,), dict( - DESCRIPTOR = _DOWNPOURSERVERPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter) - )) +DownpourServerParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourServerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURSERVERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter) + )) _sym_db.RegisterMessage(DownpourServerParameter) -ServerServiceParameter = _reflection.GeneratedProtocolMessageType('ServerServiceParameter', (_message.Message,), dict( - DESCRIPTOR = _SERVERSERVICEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter) - )) +ServerServiceParameter = _reflection.GeneratedProtocolMessageType( + 'ServerServiceParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SERVERSERVICEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter) + )) _sym_db.RegisterMessage(ServerServiceParameter) -TableParameter = _reflection.GeneratedProtocolMessageType('TableParameter', (_message.Message,), dict( - DESCRIPTOR = _TABLEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.TableParameter) - )) +TableParameter = _reflection.GeneratedProtocolMessageType( + 'TableParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_TABLEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableParameter) + )) _sym_db.RegisterMessage(TableParameter) -TableAccessorParameter = _reflection.GeneratedProtocolMessageType('TableAccessorParameter', (_message.Message,), dict( - DESCRIPTOR = _TABLEACCESSORPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter) - )) +TableAccessorParameter = _reflection.GeneratedProtocolMessageType( + 'TableAccessorParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_TABLEACCESSORPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter) + )) _sym_db.RegisterMessage(TableAccessorParameter) -DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType('DownpourTableAccessorParameter', (_message.Message,), dict( - DESCRIPTOR = _DOWNPOURTABLEACCESSORPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.DownpourTableAccessorParameter) - )) +DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourTableAccessorParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURTABLEACCESSORPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourTableAccessorParameter) + )) _sym_db.RegisterMessage(DownpourTableAccessorParameter) -TableAccessorSaveParameter = _reflection.GeneratedProtocolMessageType('TableAccessorSaveParameter', (_message.Message,), dict( - DESCRIPTOR = _TABLEACCESSORSAVEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.TableAccessorSaveParameter) - )) +TableAccessorSaveParameter = _reflection.GeneratedProtocolMessageType( + 'TableAccessorSaveParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_TABLEACCESSORSAVEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableAccessorSaveParameter) + )) _sym_db.RegisterMessage(TableAccessorSaveParameter) -PsRequestMessage = _reflection.GeneratedProtocolMessageType('PsRequestMessage', (_message.Message,), dict( - DESCRIPTOR = _PSREQUESTMESSAGE, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage) - )) +PsRequestMessage = _reflection.GeneratedProtocolMessageType( + 'PsRequestMessage', + (_message.Message, ), + dict( + DESCRIPTOR=_PSREQUESTMESSAGE, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage) + )) _sym_db.RegisterMessage(PsRequestMessage) -SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType('SparseSGDRuleParameter', (_message.Message,), dict( - DESCRIPTOR = _SPARSESGDRULEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter) - )) +SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType( + 'SparseSGDRuleParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SPARSESGDRULEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter) + )) _sym_db.RegisterMessage(SparseSGDRuleParameter) -DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType('DenseSGDRuleParameter', (_message.Message,), dict( - DESCRIPTOR = _DENSESGDRULEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter) - )) +DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType( + 'DenseSGDRuleParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DENSESGDRULEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter) + )) _sym_db.RegisterMessage(DenseSGDRuleParameter) -AdamSGDParameter = _reflection.GeneratedProtocolMessageType('AdamSGDParameter', (_message.Message,), dict( - DESCRIPTOR = _ADAMSGDPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter) - )) +AdamSGDParameter = _reflection.GeneratedProtocolMessageType( + 'AdamSGDParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_ADAMSGDPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter) + )) _sym_db.RegisterMessage(AdamSGDParameter) -NaiveSGDParameter = _reflection.GeneratedProtocolMessageType('NaiveSGDParameter', (_message.Message,), dict( - DESCRIPTOR = _NAIVESGDPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter) - )) +NaiveSGDParameter = _reflection.GeneratedProtocolMessageType( + 'NaiveSGDParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_NAIVESGDPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter) + )) _sym_db.RegisterMessage(NaiveSGDParameter) -SummarySGDParameter = _reflection.GeneratedProtocolMessageType('SummarySGDParameter', (_message.Message,), dict( - DESCRIPTOR = _SUMMARYSGDPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter) - )) +SummarySGDParameter = _reflection.GeneratedProtocolMessageType( + 'SummarySGDParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SUMMARYSGDPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter) + )) _sym_db.RegisterMessage(SummarySGDParameter) -MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType('MovingAverageRuleParameter', (_message.Message,), dict( - DESCRIPTOR = _MOVINGAVERAGERULEPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.MovingAverageRuleParameter) - )) +MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType( + 'MovingAverageRuleParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_MOVINGAVERAGERULEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.MovingAverageRuleParameter) + )) _sym_db.RegisterMessage(MovingAverageRuleParameter) -PsResponseMessage = _reflection.GeneratedProtocolMessageType('PsResponseMessage', (_message.Message,), dict( - DESCRIPTOR = _PSRESPONSEMESSAGE, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage) - )) +PsResponseMessage = _reflection.GeneratedProtocolMessageType( + 'PsResponseMessage', + (_message.Message, ), + dict( + DESCRIPTOR=_PSRESPONSEMESSAGE, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage) + )) _sym_db.RegisterMessage(PsResponseMessage) -FsClientParameter = _reflection.GeneratedProtocolMessageType('FsClientParameter', (_message.Message,), dict( - DESCRIPTOR = _FSCLIENTPARAMETER, - __module__ = 'ps_pb2' - # @@protoc_insertion_point(class_scope:paddle.FsClientParameter) - )) +FsClientParameter = _reflection.GeneratedProtocolMessageType( + 'FsClientParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_FSCLIENTPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.FsClientParameter) + )) _sym_db.RegisterMessage(FsClientParameter) - DESCRIPTOR.has_options = True -DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\200\001\001')) +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), + _b('\200\001\001')) # @@protoc_insertion_point(module_scope) From 4f304eaa6fcdc5af93a4878a09387f4d7fbd5aed Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 13 Dec 2018 19:46:35 +0800 Subject: [PATCH 305/719] fix unittest test=develop --- paddle/fluid/framework/parallel_executor.cc | 14 +++++++++++--- .../unittests/test_parallel_executor_mnist.py | 2 ++ .../test_parallel_executor_transformer.py | 4 ---- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 81d1024cb6..2604e41045 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -197,9 +197,17 @@ ParallelExecutor::ParallelExecutor( PADDLE_ENFORCE(places.size() > 1, "If you set build_strategy.reduce with 'Reduce'," "the number of places must be greater than 1."); - PADDLE_ENFORCE(exec_strategy.type_ != ExecutionStrategy::kParallelGraph, - "You should set build_strategy.reduce with 'AllReduce' for " - "the ParallelGraph executor type"); + } + + if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + PADDLE_ENFORCE( + member_->use_all_reduce_, + "build_strategy.reduce should be `AllReduce` if you want to use" + "ParallelGraph executor."); + PADDLE_ENFORCE( + member_->use_cuda_, + "execution_strategy.use_cuda should be True if you want to use" + "ParallelGraph executor."); } // Step 1. Bcast the params to devs. diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 3dddff0d99..0ff079b4e2 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -166,6 +166,8 @@ class TestMNIST(TestParallelExecutorBase): def check_batchnorm_fc_convergence(self, use_cuda, exec_type): if use_cuda and not core.is_compiled_with_cuda(): return + if not use_cuda and exec_type == ExecutorType.ParallelGraph: + return img, label = self._init_data() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index b5ee72a24e..8a1a3ab3ca 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -173,10 +173,6 @@ class TestTransformer(TestParallelExecutorBase): def test_main(self): if core.is_compiled_with_cuda(): self.check_network_convergence(transformer, use_cuda=True) - self.check_network_convergence( - transformer, - use_cuda=True, - exec_type=ExecutorType.ParallelGraph) self.check_network_convergence( transformer, use_cuda=True, enable_sequential_execution=True) self.check_network_convergence(transformer, use_cuda=False, iter=5) From e52bb816e36b5df53c1608f3aada655b21d11ab5 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 20:16:47 +0800 Subject: [PATCH 306/719] add copyright to __init__.py in distributed folder --- paddle/fluid/pybind/async_executor_py.cc | 16 ++++++++-------- python/paddle/fluid/distributed/__init__.py | 12 ++++++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc index 71a0e256e4..222c128c66 100644 --- a/paddle/fluid/pybind/async_executor_py.cc +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -49,13 +49,13 @@ void BindAsyncExecutor(py::module* m) { new framework::AsyncExecutor(scope, place)); })) .def("run_from_files", &framework::AsyncExecutor::RunFromFile) - .def("init_server", &framework::AsyncExecutor::InitServer) - .def("init_worker", &framework::AsyncExecutor::InitWorker) - .def("start_server", &framework::AsyncExecutor::StartServer) - .def("stop_server", &framework::AsyncExecutor::StopServer) - .def("gather_servers", &framework::AsyncExecutor::GatherServers) - .def("init_model", &framework::AsyncExecutor::InitModel) - .def("save_model", &framework::AsyncExecutor::SaveModel); + .def("init_server", &framework::AsyncExecutor::InitServer) + .def("init_worker", &framework::AsyncExecutor::InitWorker) + .def("start_server", &framework::AsyncExecutor::StartServer) + .def("stop_server", &framework::AsyncExecutor::StopServer) + .def("gather_servers", &framework::AsyncExecutor::GatherServers) + .def("init_model", &framework::AsyncExecutor::InitModel) + .def("save_model", &framework::AsyncExecutor::SaveModel); } // end BindAsyncExecutor #else void BindAsyncExecutor(py::module* m) { @@ -64,7 +64,7 @@ void BindAsyncExecutor(py::module* m) { return std::unique_ptr( new framework::AsyncExecutor(scope, place)); })) - .def("run_from_files", &framework::AsyncExecutor::RunFromFile) + .def("run_from_files", &framework::AsyncExecutor::RunFromFile); } // end BindAsyncExecutor #endif } // end namespace pybind diff --git a/python/paddle/fluid/distributed/__init__.py b/python/paddle/fluid/distributed/__init__.py index e69de29bb2..cd609c5040 100644 --- a/python/paddle/fluid/distributed/__init__.py +++ b/python/paddle/fluid/distributed/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and From bf9302f95015db6cadf3e814cfc4f21ef8434a3d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 13 Dec 2018 10:18:22 +0000 Subject: [PATCH 307/719] add lstm, peephole refer and test --- paddle/fluid/operators/jit/gen_base.cc | 5 - paddle/fluid/operators/jit/gen_base.h | 4 - paddle/fluid/operators/jit/helper.cc | 20 +++ paddle/fluid/operators/jit/helper.h | 4 +- paddle/fluid/operators/jit/kernel_base.h | 54 ++++++- paddle/fluid/operators/jit/kernel_key.cc | 38 +++++ paddle/fluid/operators/jit/kernel_key.h | 4 + .../fluid/operators/jit/refer/CMakeLists.txt | 2 + paddle/fluid/operators/jit/refer/refer.cc | 3 + paddle/fluid/operators/jit/refer/refer.h | 89 ++++++++++++ paddle/fluid/operators/jit/test.cc | 137 ++++++++++++++++++ paddle/fluid/operators/math/jit_kernel_impl.h | 39 ----- .../fluid/operators/math/jit_kernel_refer.h | 85 ----------- 13 files changed, 346 insertions(+), 138 deletions(-) create mode 100644 paddle/fluid/operators/jit/kernel_key.cc diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index a8bf902963..310da0c76f 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -23,11 +23,6 @@ namespace paddle { namespace operators { namespace jit { -template <> -size_t JitCodeKey(int d) { - return d; -} - // refer do not need useme, it would be the last one. void GenBase::dumpCode(const unsigned char* code) const { if (code) { diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index 586f4389c0..48855abd26 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -43,10 +43,6 @@ class GenBase : public Kernel { void dumpCode(const unsigned char* code) const; }; -// Every JitCode should have a method to get the key from attribution -template -size_t JitCodeKey(Attr attr); - // Creator is used to creat the jitcode and save in pool. // Every JitCode should have one creator. class GenCreator { diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index c010b64c9c..d6fa4891e3 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/helper.h" +#include // tolower #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -36,6 +37,8 @@ const char* to_string(KernelType kt) { ONE_CASE(vexp); ONE_CASE(vsigmoid); ONE_CASE(vtanh); + ONE_CASE(lstmctht); + ONE_CASE(lstmc1h1); default: PADDLE_THROW("Not support type: %d", kt); return "NOT JITKernel"; @@ -44,6 +47,23 @@ const char* to_string(KernelType kt) { } #undef ONE_CASE +KernelType to_kerneltype(const std::string& act) { + std::string lower = act; + std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower); + if (lower == "relu" || lower == "vrelu") { + return vrelu; + } else if (lower == "identity" || lower == "videntity" || lower == "") { + return videntity; + } else if (lower == "exp" || lower == "vexp") { + return vexp; + } else if (lower == "sigmoid" || lower == "vsigmoid") { + return vsigmoid; + } else if (lower == "tanh" || lower == "vtanh") { + return vtanh; + } + return non_kernel; +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 053e5ed079..302e70caa7 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -14,9 +14,7 @@ #pragma once -#include // for unique_ptr #include -#include #include #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/kernel_base.h" @@ -124,6 +122,8 @@ typename KernelTuples::func_type Get(typename KernelTuples::attr_type attr) { const char* to_string(KernelType kt); +KernelType to_kerneltype(const std::string& act); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 29b881b754..3ab0194ce2 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -20,8 +20,9 @@ namespace operators { namespace jit { typedef enum { - vmul = 0, - vadd = 1, + non_kernel = 0, + vmul = 1, + vadd = 2, vaddrelu, vsub, vscal, @@ -30,7 +31,9 @@ typedef enum { videntity, vexp, vsigmoid, - vtanh + vtanh, + lstmctht, + lstmc1h1 } KernelType; template @@ -50,6 +53,51 @@ struct XYNTuples { typedef void (*func_type)(const T*, T*, int); }; +typedef struct { + void* gates; // gates: x_ch, x_ih, x_fh, x_oh + const void* ct_1; + void* ct; + void* ht; + /* weight_peephole and checked data are only used in peephole*/ + const void* wp{nullptr}; // W_ic, W_fc, W_oc + void* checked{nullptr}; // size: 2 * d +} lstm_t; + +typedef struct { + void* gates; // gates: {x_update, x_reset; x_state} + const void* ht_1; + void* ht; +} gru_t; + +struct rnn_attr_s { + int d; + KernelType act_gate, act_cand; + rnn_attr_s() = default; + rnn_attr_s(int _d, KernelType _act_gate, KernelType _act_cand) + : d(_d), act_gate(_act_gate), act_cand(_act_cand) {} +}; + +struct lstm_attr_s : public rnn_attr_s { + bool use_peephole; + KernelType act_cell; + lstm_attr_s() = default; + lstm_attr_s(int _d, KernelType _act_gate, KernelType _act_cand, + KernelType _act_cell, bool _use_peephole = false) + : rnn_attr_s(_d, _act_gate, _act_cand), + use_peephole(_use_peephole), + act_cell(_act_cell) {} +}; + +typedef struct rnn_attr_s gru_attr_t; +typedef struct lstm_attr_s lstm_attr_t; + +template +struct LSTMTuples { + typedef T data_type; + typedef lstm_attr_t attr_type; + typedef void (*func_type)(lstm_t*, const lstm_attr_t*); +}; + // Just for adding to kernel pool without template class Kernel { public: diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc new file mode 100644 index 0000000000..7a9ae81f89 --- /dev/null +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/kernel_key.h" + +namespace paddle { +namespace operators { +namespace jit { + +template <> +size_t JitCodeKey(const int& d) { + return d; +} + +template <> +size_t JitCodeKey(const lstm_attr_t& attr) { + constexpr int act_type_shift = 3; // suppot 2^3 act types + size_t key = attr.d; + int gate_key = static_cast(attr.act_gate) << 1; + int cand_key = static_cast(attr.act_cand) << (1 + act_type_shift); + int cell_key = static_cast(attr.act_cell) << (1 + act_type_shift * 2); + return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key + + attr.use_peephole; +} +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_key.h b/paddle/fluid/operators/jit/kernel_key.h index af9df77337..611a0210d6 100644 --- a/paddle/fluid/operators/jit/kernel_key.h +++ b/paddle/fluid/operators/jit/kernel_key.h @@ -44,6 +44,10 @@ struct KernelKey { bool operator!=(const KernelKey& o) const { return !(*this == o); } }; +// Every JitCode should have a method to get the key from attribution +template +size_t JitCodeKey(const Attr& attr); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index dc07ddb914..e30923c4fd 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -18,3 +18,5 @@ USE_JITKERNEL_REFER(videntity) USE_JITKERNEL_REFER(vexp) USE_JITKERNEL_REFER(vsigmoid) USE_JITKERNEL_REFER(vtanh) +USE_JITKERNEL_REFER(lstmctht) +USE_JITKERNEL_REFER(lstmc1h1) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index f716ca89c5..59b3ce5248 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -35,4 +35,7 @@ REGISTER_REFER_KERNEL(vexp, VExp); REGISTER_REFER_KERNEL(vsigmoid, VSigmoid); REGISTER_REFER_KERNEL(vtanh, VTanh); +REGISTER_REFER_KERNEL(lstmctht, LSTMCtHt); +REGISTER_REFER_KERNEL(lstmc1h1, LSTMC1H1); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 7ef60a2d53..a93123df9d 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -110,6 +110,91 @@ void VTanh(const T* x, T* y, int n) { } } +template +void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT + if (type == vsigmoid) { + return VSigmoid; + } else if (type == vrelu) { + return VRelu; + } else if (type == vtanh) { + return VTanh; + } else if (type == videntity) { + return VIdentity; + } + PADDLE_THROW("Not support type: %s", type); + return nullptr; +} + +// compute ct and ht +template +void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + const T* ct_1 = reinterpret_cast(step->ct_1); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + const T* wp = reinterpret_cast(step->wp); + T* checked = reinterpret_cast(step->checked); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + // gates: W_ch, W_ih, W_fh, W_oh + if (attr->use_peephole) { + VMul(wp, ct_1, checked, d); + VMul(wp + d, ct_1, checked + d, d); + VAdd(checked, gates + d, gates + d, d2); + act_gate(gates + d, gates + d, d2); + } else { + act_gate(gates + d, gates + d, d3); + } + + // C_t = C_t-1 * fgated + cand_gated * igated + act_cand(gates, gates, d); + VMul(gates, gates + d, gates + d, d); + VMul(ct_1, gates + d2, gates + d2, d); + VAdd(gates + d, gates + d2, ct, d); + + if (attr->use_peephole) { + // get ogated + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + act_gate(gates + d3, gates + d3, d); + } + // H_t = act_cell(C_t) * ogated + act_cell(ct, gates + d2, d); + VMul(gates + d2, gates + d3, ht, d); +} + +// compute c1 and h1 without c0 or h0 +template +void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + /* C_t = igated * cgated*/ + act_gate(gates + d, gates + d, d); + act_cand(gates, gates, d); + VMul(gates, gates + d, ct, d); + if (attr->use_peephole) { + // get outgated, put W_oc * C_t on igated + const T* wp = reinterpret_cast(step->wp); + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + } + /* H_t = act_cell(C_t) * ogated */ + act_gate(gates + d3, gates + d3, d); + act_cell(ct, gates + d2, d); + VMul(gates + d2, gates + d3, ht, d); +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -134,6 +219,10 @@ DECLARE_REFER_KERNEL(VExp, XYNTuples); DECLARE_REFER_KERNEL(VSigmoid, XYNTuples); DECLARE_REFER_KERNEL(VTanh, XYNTuples); +// lstm_t* , const lstm_attr_t* +DECLARE_REFER_KERNEL(LSTMCtHt, LSTMTuples); +DECLARE_REFER_KERNEL(LSTMC1H1, LSTMTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 4c9b853b6e..03e56416b2 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -350,6 +350,143 @@ TEST(JITKernel, vtanh) { TestXYNKernel(); } +template +void TestLSTMFunc(const typename KernelTuples::func_type tgt, + const std::vector& xsrc, const std::vector& wp, + const std::vector& ct_1, const std::vector& ct_ref, + const std::vector& ht_ref, + const paddle::operators::jit::lstm_attr_t& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ct_ref.size(), ht_ref.size()); + EXPECT_EQ(ct_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); + EXPECT_EQ(wp.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size()); + std::vector checked(2 * d); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + const T* ct_ref_data = ct_ref.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ct_data = ct.data(); + T* ht_data = ht.data(); + T* checked_data = checked.data(); + + paddle::operators::jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_data; + step.ht = ht_data; + if (attr.use_peephole) { + step.wp = wp_data; + step.checked = checked_data; + } + + tgt(&step, &attr); + ExpectEQ(ct_data, ct_ref_data, d); + ExpectEQ(ht_data, ht_ref_data, d); +} + +template +void TestLSTMKernel() { + namespace jit = paddle::operators::jit; + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; + for (int d : TestSizes()) { + for (bool use_peephole : {true, false}) { + for (auto& act_gate : all_acts) { + for (auto& act_cand : all_acts) { + for (auto& act_cell : all_acts) { + std::string info = act_gate + act_cand + act_cell + + (use_peephole ? "peephole_" : "") + "size_" + + std::to_string(d); + const jit::lstm_attr_t attr( + d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand), + jit::to_kerneltype(act_cell), use_peephole); + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector xsrc(4 * d), wp(3 * d), ct_1(d); + std::vector ct_ref(d), ht_ref(d), checked(2 * d); + RandomVec(4 * d, xsrc.data(), -2.f, 2.f); + RandomVec(3 * d, wp.data(), -2.f, 2.f); + RandomVec(d, ct_1.data(), -2.f, 2.f); + // x could be changed after compute, so copy to save src + std::vector x(xsrc.size()); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + T* x_data = x.data(); + T* checked_data = checked.data(); + T* ct_ref_data = ct_ref.data(); + T* ht_ref_data = ht_ref.data(); + jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_ref_data; + step.ht = ht_ref_data; + if (use_peephole) { + step.wp = wp_data; + step.checked = checked_data; + } + ref(&step, &attr); + + // test jitcode + auto jitcode = + jit::GetJitCode, PlaceType>(attr); + if (jitcode) { + VLOG(10) << "Test Jitcode Kernel " << info; + TestLSTMFunc>(jitcode, xsrc, wp, ct_1, + ct_ref, ht_ref, attr); + } + + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = + dynamic_cast>*>( + impl.get()); + if (i && i->UseMe(attr)) { + auto more = i->GetFunc(); + VLOG(10) << "Test More Kernel " << info; + TestLSTMFunc>(more, xsrc, wp, ct_1, + ct_ref, ht_ref, attr); + } + } + } + // Test result from Get function + auto tgt = jit::Get, PlaceType>(attr); + TestLSTMFunc>(tgt, xsrc, wp, ct_1, ct_ref, + ht_ref, attr); + } + } + } + } + } +} + +TEST(JITKernel, lstmctht) { + namespace jit = paddle::operators::jit; + TestLSTMKernel(); + TestLSTMKernel(); +} + +TEST(JITKernel, lstmc1h1) { + namespace jit = paddle::operators::jit; + TestLSTMKernel(); + TestLSTMKernel(); +} + +// TODO(TJ): refine the tests template + TEST(JITKernel, pool) { // TODO(TJ): add some test } diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h index ba5f20e533..025343dfad 100644 --- a/paddle/fluid/operators/math/jit_kernel_impl.h +++ b/paddle/fluid/operators/math/jit_kernel_impl.h @@ -28,45 +28,6 @@ namespace jitkernel { #define YMM_FLOAT_BLOCK 8 #define ZMM_FLOAT_BLOCK 16 -typedef struct { - void* gates; // gates: W_ch, W_ih, W_fh, W_oh - const void* ct_1; - void* ct; - void* ht; - /* weight_peephole and checked data are only used in peephole*/ - const void* wp{nullptr}; - void* checked{nullptr}; -} lstm_t; - -typedef struct { - void* gates; // gates: {W_update, W_reset; W_state} - const void* ht_1; - void* ht; -} gru_t; - -struct rnn_attr_s { - int d; - std::string act_gate, act_cand; - rnn_attr_s() = default; - rnn_attr_s(int _d, const std::string& _act_gate, const std::string& _act_cand) - : d(_d), act_gate(_act_gate), act_cand(_act_cand) {} -}; - -struct lstm_attr_s : public rnn_attr_s { - bool use_peephole; - std::string act_cell; - lstm_attr_s() = default; - lstm_attr_s(int _d, const std::string& _act_gate, - const std::string& _act_cand, const std::string& _act_cell, - bool _use_peephole = false) - : rnn_attr_s(_d, _act_gate, _act_cand), - use_peephole(_use_peephole), - act_cell(_act_cell) {} -}; - -typedef struct rnn_attr_s gru_attr_t; -typedef struct lstm_attr_s lstm_attr_t; - } // namespace jitkernel } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index a03e851de5..122cbcb0d6 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -24,91 +24,6 @@ namespace math { namespace jitkernel { namespace refer { -template -void (*getActFunc(const std::string& type))(const T*, T*, int) { // NOLINT - if (type == "sigmoid") { - return VSigmoid; - } else if (type == "relu") { - return VRelu; - } else if (type == "tanh") { - return VTanh; - } else if (type == "identity" || type == "") { - return VIdentity; - } - PADDLE_THROW("Not support type: %s", type); - return nullptr; -} - -// compute ct and ht -template -void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - const T* ct_1 = reinterpret_cast(step->ct_1); - T* ct = reinterpret_cast(step->ct); - T* ht = reinterpret_cast(step->ht); - const T* wp = reinterpret_cast(step->wp); - T* checked = reinterpret_cast(step->checked); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - auto act_cell = getActFunc(attr->act_cell); - int d = attr->d; - int d2 = d * 2; - int d3 = d * 3; - // gates: W_ch, W_ih, W_fh, W_oh - if (attr->use_peephole) { - VMul(wp, ct_1, checked, d); - VMul(wp + d, ct_1, checked + d, d); - VAdd(checked, gates + d, gates + d, d2); - act_gate(gates + d, gates + d, d2); - } else { - act_gate(gates + d, gates + d, d3); - } - - // C_t = C_t-1 * fgated + cand_gated * igated - act_cand(gates, gates, d); - VMul(gates, gates + d, gates + d, d); - VMul(ct_1, gates + d2, gates + d2, d); - VAdd(gates + d, gates + d2, ct, d); - - if (attr->use_peephole) { - // get ogated - VMul(wp + d2, ct, gates + d, d); - VAdd(gates + d, gates + d3, gates + d3, d); - act_gate(gates + d3, gates + d3, d); - } - // H_t = act_cell(C_t) * ogated - act_cell(ct, gates + d2, d); - VMul(gates + d2, gates + d3, ht, d); -} - -// compute c1 and h1 without c0 or h0 -template -void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ct = reinterpret_cast(step->ct); - T* ht = reinterpret_cast(step->ht); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - auto act_cell = getActFunc(attr->act_cell); - int d = attr->d; - int d2 = d * 2; - int d3 = d * 3; - /* C_t = igated * cgated*/ - act_gate(gates + d, gates + d, d); - act_cand(gates, gates, d); - VMul(gates, gates + d, ct, d); - if (attr->use_peephole) { - // get outgated, put W_oc * C_t on igated - const T* wp = reinterpret_cast(step->wp); - VMul(wp + d2, ct, gates + d, d); - VAdd(gates + d, gates + d3, gates + d3, d); - } - /* H_t = act_cell(C_t) * ogated */ - act_gate(gates + d3, gates + d3, d); - act_cell(ct, gates + d2, d); - VMul(gates + d2, gates + d3, ht, d); -} - // compute h1 without h0 template void GRUH1(gru_t* step, const gru_attr_t* attr) { From e2130502234f042a6381939f80c640bbebe2e1c6 Mon Sep 17 00:00:00 2001 From: Wang Guibao Date: Thu, 13 Dec 2018 20:49:50 +0800 Subject: [PATCH 308/719] Fix multi-threading bug with WItH_MKL=ON (#14882) fixes #14884 --- paddle/fluid/framework/executor_thread_worker.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 3d53511615..5fc5aeb662 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" namespace paddle { @@ -174,6 +175,8 @@ void print_fetch_var(Scope* scope, std::string var_name) { } void ExecutorThreadWorker::TrainFiles() { + platform::SetNumThreads(1); + // todo: configurable SetDevice(); From 6eec46172560e9b217eab8c33d9b45aaa3ac5f1b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 13 Dec 2018 13:04:29 +0000 Subject: [PATCH 309/719] add lstm peephole benchmark --- paddle/fluid/operators/jit/benchmark.cc | 97 +++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index a7e5eb6cf4..01467e324c 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -272,6 +272,98 @@ void BenchXYNKernel() { } } +// return this function avg time +template +double BenchLSTMFunc(const typename KernelTuples::func_type tgt, + const paddle::operators::jit::lstm_attr_t* attr, + paddle::operators::jit::lstm_t* step) { + for (int i = 0; i < FLAGS_burning; ++i) { + tgt(step, attr); + } + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeat; ++i) { + tgt(step, attr); + } + auto end = GetCurrentUS(); + return (end - start) / FLAGS_repeat; +} + +template +void BenchLSTMKernel() { + namespace jit = paddle::operators::jit; + for (bool use_peephole : {true, false}) { + for (int d : TestSizes()) { + const jit::lstm_attr_t attr(d, jit::vsigmoid, jit::vtanh, jit::vtanh, + use_peephole); + std::vector> infos; + std::vector x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d); + RandomVec(4 * d, x.data(), -2.f, 2.f); + RandomVec(3 * d, wp.data(), -2.f, 2.f); + RandomVec(d, ct_1.data(), -2.f, 2.f); + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + T* x_data = x.data(); + T* checked_data = checked.data(); + T* ct_data = ct.data(); + T* ht_data = ht.data(); + jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_data; + step.ht = ht_data; + if (use_peephole) { + step.wp = wp_data; + step.checked = checked_data; + } + + // test refer + auto refer = jit::GetRefer>(); + if (refer) { + auto res = BenchLSTMFunc>(refer, &attr, &step); + infos.push_back(std::make_pair("Refer", res)); + } + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(attr); + if (jitcode) { + auto res = BenchLSTMFunc>(jitcode, &attr, &step); + infos.push_back(std::make_pair("JitCode", res)); + } + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast>*>( + impl.get()); + if (i && i->UseMe(attr)) { + auto more = i->GetFunc(); + auto res = BenchLSTMFunc>(more, &attr, &step); + infos.push_back(std::make_pair("More", res)); + } + } + } + // Test result from Get function + auto tgt = jit::Get, PlaceType>(attr); + if (!tgt) { + LOG(ERROR) << "Target can not be empty!"; + } + auto res = BenchLSTMFunc>(tgt, &attr, &step); + infos.push_back(std::make_pair("Target", res)); + // print + std::ostringstream loginfos; + loginfos << "Kernel Type: " << jit::to_string(KT) + << ", Sigmoid,Tanh,Tanh, " << (use_peephole ? "Peephole_" : "") + << " size " << d << ": "; + for (auto pair : infos) { + loginfos << pair.first << " takes " << pair.second << " us; "; + } + LOG(INFO) << loginfos.str(); + } + } +} + // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] // Options: @@ -294,9 +386,14 @@ int main(int argc, char* argv[]) { BenchAXYNKernel(); BenchAXYNKernel(); + // act BenchXYNKernel(); BenchXYNKernel(); BenchXYNKernel(); BenchXYNKernel(); BenchXYNKernel(); + + // lstm and peephole + BenchLSTMKernel(); + BenchLSTMKernel(); } From 3dc29b390537cca68f43f21f44c2c2fde84fa297 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 13 Dec 2018 22:02:55 +0800 Subject: [PATCH 310/719] change sparse_update to adam_update --- paddle/fluid/operators/optimizers/adam_op.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index ca5454ef04..25e23c5f9d 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -202,7 +202,7 @@ struct SparseAdamFunctor { row_count_(row_count), sparse_mode_(sparse_mode) {} - inline HOSTDEVICE void sparse_update(size_t i, T g) const { + inline HOSTDEVICE void adam_update(size_t i, T g) const { // The following code is the same as dense T mom1 = moment1_[i]; T mom2 = moment2_[i]; @@ -228,7 +228,7 @@ struct SparseAdamFunctor { auto row_idx = math::BinarySearch(rows_, row_count_, i / row_numel_); T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; - sparse_update(i, g); + adam_update(i, g); } }; @@ -364,7 +364,7 @@ class AdamOpKernel : public framework::OpKernel { for (size_t offset = 0; offset < row_numel; ++offset) { size_t i = rows[row_index] * row_numel + offset; T g = grad_data[row_index * row_numel + offset]; - functor.sparse_update(i, g); + functor.adam_update(i, g); } } } else { From 00d3afbcc9959cae839b134e0c0743a335b9f2a0 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 13 Dec 2018 14:12:20 +0000 Subject: [PATCH 311/719] add gru refer functions, test and benchmark --- paddle/fluid/operators/jit/README.md | 4 +- paddle/fluid/operators/jit/benchmark.cc | 84 +++++++++++++++ paddle/fluid/operators/jit/helper.cc | 3 + paddle/fluid/operators/jit/kernel_base.h | 12 ++- paddle/fluid/operators/jit/kernel_key.cc | 11 +- .../fluid/operators/jit/refer/CMakeLists.txt | 3 + paddle/fluid/operators/jit/refer/refer.cc | 4 + paddle/fluid/operators/jit/refer/refer.h | 54 +++++++++- paddle/fluid/operators/jit/test.cc | 102 ++++++++++++++++++ .../fluid/operators/math/jit_kernel_refer.h | 49 +-------- 10 files changed, 274 insertions(+), 52 deletions(-) diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index 2d72aa4d56..6b2f2b2848 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -45,4 +45,6 @@ PaddlePaddle/Paddle/paddle/fluid/ - 在`KernelType` 中添加 `your_key` . - 实现Reference 的逻辑,每个jitkernel的Reference 实现是必须的。不要依赖任何第三方库。并在`refer/CmakeLists.txt`中`USE_JITKERNEL_REFER(your_key)`. -- 必要时可以添加新的`KernelTuples`,可以参考`XYZNTuples`. +- 必要时可以添加新的`KernelTuples`,可以参考`XYZNTuples`,新加的Attr类型需要特例化`JitCodeKey`方法。 +- 添加unit test,需要测试float和double +- 添加benchmark确保get得到的速度是最快。 diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 01467e324c..ca636b020c 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -364,6 +364,85 @@ void BenchLSTMKernel() { } } +// return this function avg time +template +double BenchGRUFunc(const typename KernelTuples::func_type tgt, + const paddle::operators::jit::gru_attr_t* attr, + paddle::operators::jit::gru_t* step) { + for (int i = 0; i < FLAGS_burning; ++i) { + tgt(step, attr); + } + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeat; ++i) { + tgt(step, attr); + } + auto end = GetCurrentUS(); + return (end - start) / FLAGS_repeat; +} + +template +void BenchGRUKernel() { + namespace jit = paddle::operators::jit; + for (int d : TestSizes()) { + const jit::gru_attr_t attr(d, jit::vsigmoid, jit::vtanh); + std::vector> infos; + std::vector x(3 * d), ht_1(d), ht(d); + RandomVec(3 * d, x.data(), -2.f, 2.f); + RandomVec(d, ht_1.data(), -2.f, 2.f); + const T* ht_1_data = ht_1.data(); + T* x_data = x.data(); + T* ht_data = ht.data(); + jit::gru_t step; + step.gates = x_data; + step.ht_1 = ht_1_data; + step.ht = ht_data; + + // test refer + auto refer = jit::GetRefer>(); + if (refer) { + auto res = BenchGRUFunc>(refer, &attr, &step); + infos.push_back(std::make_pair("Refer", res)); + } + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(attr); + if (jitcode) { + auto res = BenchGRUFunc>(jitcode, &attr, &step); + infos.push_back(std::make_pair("JitCode", res)); + } + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = + dynamic_cast>*>(impl.get()); + if (i && i->UseMe(attr)) { + auto more = i->GetFunc(); + auto res = BenchGRUFunc>(more, &attr, &step); + infos.push_back(std::make_pair("More", res)); + } + } + } + // Test result from Get function + auto tgt = jit::Get, PlaceType>(attr); + if (!tgt) { + LOG(ERROR) << "Target can not be empty!"; + } + auto res = BenchGRUFunc>(tgt, &attr, &step); + infos.push_back(std::make_pair("Target", res)); + // print + std::ostringstream loginfos; + loginfos << "Kernel Type: " << jit::to_string(KT) << ", Sigmoid,Tanh, size " + << d << ": "; + for (auto pair : infos) { + loginfos << pair.first << " takes " << pair.second << " us; "; + } + LOG(INFO) << loginfos.str(); + } +} + // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] // Options: @@ -396,4 +475,9 @@ int main(int argc, char* argv[]) { // lstm and peephole BenchLSTMKernel(); BenchLSTMKernel(); + + // gru functions + BenchGRUKernel(); + BenchGRUKernel(); + BenchGRUKernel(); } diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index d6fa4891e3..0543b0743c 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -39,6 +39,9 @@ const char* to_string(KernelType kt) { ONE_CASE(vtanh); ONE_CASE(lstmctht); ONE_CASE(lstmc1h1); + ONE_CASE(gruh1); + ONE_CASE(gruhtpart1); + ONE_CASE(gruhtpart2); default: PADDLE_THROW("Not support type: %d", kt); return "NOT JITKernel"; diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 3ab0194ce2..00d583c60b 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -33,7 +33,10 @@ typedef enum { vsigmoid, vtanh, lstmctht, - lstmc1h1 + lstmc1h1, + gruh1, + gruhtpart1, + gruhtpart2 } KernelType; template @@ -98,6 +101,13 @@ struct LSTMTuples { typedef void (*func_type)(lstm_t*, const lstm_attr_t*); }; +template +struct GRUTuples { + typedef T data_type; + typedef gru_attr_t attr_type; + typedef void (*func_type)(gru_t*, const gru_attr_t*); +}; + // Just for adding to kernel pool without template class Kernel { public: diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 7a9ae81f89..4e6a19f04f 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -23,9 +23,10 @@ size_t JitCodeKey(const int& d) { return d; } +constexpr int act_type_shift = 3; // suppot 2^3 act types + template <> size_t JitCodeKey(const lstm_attr_t& attr) { - constexpr int act_type_shift = 3; // suppot 2^3 act types size_t key = attr.d; int gate_key = static_cast(attr.act_gate) << 1; int cand_key = static_cast(attr.act_cand) << (1 + act_type_shift); @@ -33,6 +34,14 @@ size_t JitCodeKey(const lstm_attr_t& attr) { return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key + attr.use_peephole; } + +template <> +size_t JitCodeKey(const gru_attr_t& attr) { + size_t key = attr.d; + return (key << (act_type_shift * 2)) + static_cast(attr.act_gate) + + (static_cast(attr.act_cand) << act_type_shift); +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index e30923c4fd..78d1cb8f9a 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -20,3 +20,6 @@ USE_JITKERNEL_REFER(vsigmoid) USE_JITKERNEL_REFER(vtanh) USE_JITKERNEL_REFER(lstmctht) USE_JITKERNEL_REFER(lstmc1h1) +USE_JITKERNEL_REFER(gruh1) +USE_JITKERNEL_REFER(gruhtpart1) +USE_JITKERNEL_REFER(gruhtpart2) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 59b3ce5248..c99174a66f 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -38,4 +38,8 @@ REGISTER_REFER_KERNEL(vtanh, VTanh); REGISTER_REFER_KERNEL(lstmctht, LSTMCtHt); REGISTER_REFER_KERNEL(lstmc1h1, LSTMC1H1); +REGISTER_REFER_KERNEL(gruh1, GRUH1); +REGISTER_REFER_KERNEL(gruhtpart1, GRUHtPart1); +REGISTER_REFER_KERNEL(gruhtpart2, GRUHtPart2); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index a93123df9d..a9a6ffbccd 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -125,6 +125,8 @@ void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT return nullptr; } +// TODO(TJ): add refer gemm and make LSTM kernels combine as same GRU kernels + // compute ct and ht template void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { @@ -195,6 +197,51 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { VMul(gates + d2, gates + d3, ht, d); } +// compute h1 without h0 +template +void GRUH1(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + int d2 = d * 2; + act_gate(gates, gates, d); + act_cand(gates + d2, gates + d2, d); + VMul(gates, gates + d2, ht, d); +} + +// compute the first part of GRU: ht = act_gate(r) * ht_1 +template +void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { + // W: {W_update, W_reset; W_state} + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); + act_gate(gates + attr->d, gates + attr->d, attr->d); + VMul(ht_1, gates + attr->d, ht, attr->d); +} + +// compute the second part of GRU: +// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 +template +void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + T* y = gates + d * 2; + act_gate(gates, gates, d); + act_cand(y, y, d); + // out = zt*ht~ + (1-zt)*ht_1 + for (int i = 0; i < d; ++i) { + ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -219,10 +266,15 @@ DECLARE_REFER_KERNEL(VExp, XYNTuples); DECLARE_REFER_KERNEL(VSigmoid, XYNTuples); DECLARE_REFER_KERNEL(VTanh, XYNTuples); -// lstm_t* , const lstm_attr_t* +// lstm_t*, const lstm_attr_t* DECLARE_REFER_KERNEL(LSTMCtHt, LSTMTuples); DECLARE_REFER_KERNEL(LSTMC1H1, LSTMTuples); +// gru_t*, const gru_attr_t* +DECLARE_REFER_KERNEL(GRUH1, GRUTuples); +DECLARE_REFER_KERNEL(GRUHtPart1, GRUTuples); +DECLARE_REFER_KERNEL(GRUHtPart2, GRUTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 03e56416b2..d994a11f97 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -485,6 +485,108 @@ TEST(JITKernel, lstmc1h1) { TestLSTMKernel(); } +template +void TestGRUFunc(const typename KernelTuples::func_type tgt, + const std::vector& xsrc, const std::vector& ht_1, + const std::vector& ht_ref, + const paddle::operators::jit::gru_attr_t& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ht_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ht(ht_ref.size()); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + const T* ht_1_data = ht_1.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ht_data = ht.data(); + paddle::operators::jit::gru_t step; + step.gates = x_data; + step.ht_1 = ht_1_data; + step.ht = ht_data; + tgt(&step, &attr); + ExpectEQ(ht_data, ht_ref_data, d); +} + +template +void TestGRUKernel() { + namespace jit = paddle::operators::jit; + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; + for (int d : TestSizes()) { + for (auto& act_gate : all_acts) { + for (auto& act_cand : all_acts) { + std::string info = act_gate + act_cand + "size_" + std::to_string(d); + const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate), + jit::to_kerneltype(act_cand)); + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector xsrc(3 * d), ht_1(d), ht_ref(d); + RandomVec(3 * d, xsrc.data(), -2.f, 2.f); + RandomVec(d, ht_1.data(), -2.f, 2.f); + // x could be changed after compute, so copy to save src + std::vector x(xsrc.size()); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + const T* ht_1_data = ht_1.data(); + T* x_data = x.data(); + T* ht_ref_data = ht_ref.data(); + jit::gru_t step; + step.gates = x_data; + step.ht_1 = ht_1_data; + step.ht = ht_ref_data; + ref(&step, &attr); + + // test jitcode + auto jitcode = jit::GetJitCode, PlaceType>(attr); + if (jitcode) { + VLOG(10) << "Test Jitcode Kernel " << info; + TestGRUFunc>(jitcode, xsrc, ht_1, ht_ref, attr); + } + + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast>*>( + impl.get()); + if (i && i->UseMe(attr)) { + auto more = i->GetFunc(); + VLOG(10) << "Test More Kernel " << info; + TestGRUFunc>(more, xsrc, ht_1, ht_ref, attr); + } + } + } + // Test result from Get function + auto tgt = jit::Get, PlaceType>(attr); + TestGRUFunc>(tgt, xsrc, ht_1, ht_ref, attr); + } + } + } +} + +TEST(JITKernel, gruh1) { + namespace jit = paddle::operators::jit; + TestGRUKernel(); + TestGRUKernel(); +} + +TEST(JITKernel, gruhtpart1) { + namespace jit = paddle::operators::jit; + TestGRUKernel(); + TestGRUKernel(); +} + +TEST(JITKernel, gruhtpart2) { + namespace jit = paddle::operators::jit; + TestGRUKernel(); + TestGRUKernel(); +} + // TODO(TJ): refine the tests template TEST(JITKernel, pool) { diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index 122cbcb0d6..d49fc935dc 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -22,54 +22,7 @@ namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace refer { - -// compute h1 without h0 -template -void GRUH1(gru_t* step, const gru_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - int d = attr->d; - int d2 = d * 2; - act_gate(gates, gates, d); - act_cand(gates + d2, gates + d2, d); - VMul(gates, gates + d2, ht, d); -} - -// compute the first part of GRU: ht = act_gate(r) * ht_1 -template -void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { - // W: {W_update, W_reset; W_state} - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - const T* ht_1 = reinterpret_cast(step->ht_1); - auto act_gate = getActFunc(attr->act_gate); - act_gate(gates + attr->d, gates + attr->d, attr->d); - VMul(ht_1, gates + attr->d, ht, attr->d); -} - -// compute the second part of GRU: -// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 -template -void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - const T* ht_1 = reinterpret_cast(step->ht_1); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - int d = attr->d; - T* y = gates + d * 2; - act_gate(gates, gates, d); - act_cand(y, y, d); - // out = zt*ht~ + (1-zt)*ht_1 - for (int i = 0; i < d; ++i) { - ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; - } -} - -} // namespace refer +namespace refer {} // namespace refer } // namespace jitkernel } // namespace math } // namespace operators From 854ee964e81b2907ca15f201c60e941703f7a909 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 22:41:08 +0800 Subject: [PATCH 312/719] add doc string for async_executor.py --- python/paddle/fluid/async_executor.py | 150 +++++++++++++++++++------- 1 file changed, 111 insertions(+), 39 deletions(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 099805ac1b..fe2e9b8f12 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -89,8 +89,14 @@ class AsyncExecutor(object): self.executor = core.AsyncExecutor(scope, p) self.instance = None - - def run(self, program, data_feed, filelist, thread_num, fetch, mode="", debug=False): + def run(self, + program, + data_feed, + filelist, + thread_num, + fetch, + mode="", + debug=False): """ Run program by this AsyncExecutor. Training dataset will be in filelist. Users can also inspect certain variables by naming them in parameter @@ -110,6 +116,7 @@ class AsyncExecutor(object): thread_num(int): number of concurrent training threads. See :code:`Note` for how to set this properly fetch(str|list): the var name or a list of var names to inspect + mode(str): run mode of this interface debug(bool): When set to True, fetch vars will be printed to standard output after each minibatch @@ -154,83 +161,148 @@ class AsyncExecutor(object): data_feed.desc(), filelist, thread_num, fetch_var_names, mode, debug) - def download_data(self, afs_path, local_path, fs_default_name, ugi, file_cnt, hadoop_home="$HADOOP_HOME", process_num=12): + def download_data(self, + afs_path, + local_path, + fs_default_name, + ugi, + file_cnt, + hadoop_home="$HADOOP_HOME", + process_num=12): + """ + download_data is a default download method for distributed training + a user download data without this method + + Example: + >>> exe = fluid.AsyncExecutor() + >>> exe.download_data("/xxx/xxx/xx/", + >>> "./data", "afs:// + >>> xxx.xxx.xxx.xxx:9901", "xxx,yyy") + Args: + afs_path(str): afs_path defined by users + local_path(str): download data path + fs_default_name(str): file system server address + ugi(str): hadoop ugi + file_cn(int): a user can specify file number for debugging + hadoop_home(str): hadoop home path + process_num(int): download process num + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') - - configs = { - "fs.default.name": fs_default_name, - "hadoop.job.ugi": ugi - } + raise ValueError('instance is None, please run' + 'config_distributed_nodes init instance') + + configs = {"fs.default.name": fs_default_name, "hadoop.job.ugi": ugi} client = hdfs.HDFSClient(hadoop_home, configs) downloads = hdfs.multi_download( client, - afs_path, - local_path, + afs_path, + local_path, self.instance.get_worker_index(), self.instance.get_node_cnt() / 2, file_cnt, multi_processes=process_num) - #self.instance.barrier_all() #wait for download_data #TODO only barriere worker - self.instance.barrier_worker() #wait for download_data #TODO only barriere worker - - def config_distributed_nodes(self): - self.instance = ps_instance.PaddlePSInstance(1, 2) - return self.instance - - # get total rank - # get rank index - # get iplists - # get hadoop info - pass + self.instance.barrier_worker() #wait for download_data def get_instance(self): + """ + get current node's instance so that user can do operations + in distributed setting + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) + return self.instance + + def config_distributed_nodes(self): + """ + if a user needs to run distributed async executor + he or she needs to do a global configuration so that + information of current process can be obtained + """ + self.instance = ps_instance.PaddlePSInstance(1, 2) return self.instance def stop(self): + """ + at the end of process, users should call stop to servers + and barrier all workers + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') - self.instance.barrier_worker() #worker do all things + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) + self.instance.barrier_worker() #worker do all things if self.instance.is_first_worker(): self.executor.stop_server() - self.instance.barrier_worker() #sync + self.instance.barrier_worker() #sync def init_server(self, dist_desc): + """ + initialize server of current node if current process is a server + Args: + dist_desc(str): a protobuf string that describes + how to init a worker and a server + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) self.executor.init_server(dist_desc, self.instance._rankid) ip = self.executor.start_server() self.instance.set_ip(ip) - self.instance.barrier_all() #wait all server start + self.instance.barrier_all() #wait all server start ips = self.instance.gather_ips() self.executor.gather_servers(ips, self.instance.get_node_cnt()) - self.instance.barrier_all() #wait all worker start + self.instance.barrier_all() #wait all worker start def init_worker(self, dist_desc, startup_program): + """ + initialize worker of current node if current process is a worker + Args: + dist_desc(str): a protobuf string that describes + how to init a worker and a server + startup_program(fluid.Program): startup program of current process + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) place = core.CPUPlace() executor = Executor(place) executor.run(startup_program) - self.instance.barrier_all() #wait all server start + self.instance.barrier_all() #wait all server start ips = self.instance.gather_ips() - self.executor.init_worker(dist_desc, ips, self.instance.get_node_cnt(), self.instance._rankid) - self.instance.barrier_all() #wait all worker start + self.executor.init_worker(dist_desc, ips, + self.instance.get_node_cnt(), + self.instance._rankid) + self.instance.barrier_all() #wait all worker start if self.instance.is_first_worker(): self.executor.init_model() - self.instance.barrier_worker() #wait init model - + self.instance.barrier_worker() #wait init model + def init_model(self): + """ + init_model command that can be invoked from one of the worker + model parameters are initialized in servers + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) self.executor.init_model() def save_model(self, save_path): + """ + save_model command that can be invoked from one of the worker + model parameters are saved in servers and upload to save_path of file system + Args: + save_path(str): path to file system + """ if self.instance is None: - raise ValueError('instance is None, please run config_distributed_nodes init instance') + raise ValueError( + 'instance is None, please run config_distributed_nodes init instance' + ) self.executor.save_model(save_path) - From 3759600019f206794d5852bbbc74fd959337cf3d Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Thu, 13 Dec 2018 23:01:53 +0800 Subject: [PATCH 313/719] add doc string for downpour.py and distribute_lookup_table.py --- .../paddle/fluid/distribute_lookup_table.py | 32 ++++++++++++--- python/paddle/fluid/distributed/downpour.py | 41 ++++++++++++++----- 2 files changed, 57 insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/distribute_lookup_table.py b/python/paddle/fluid/distribute_lookup_table.py index 243d806c41..74824f6832 100644 --- a/python/paddle/fluid/distribute_lookup_table.py +++ b/python/paddle/fluid/distribute_lookup_table.py @@ -16,31 +16,51 @@ LOOKUP_TABLE_TYPE = "lookup_table" def find_distributed_lookup_table_inputs(program, table_name): + """ + Find input variable of distribute lookup table in program. + We only support one distribute table now. + Args: + program(Program): given program, locate distributed lookup table + table_name(str): given table name that is found beforehand + Returns: + inputs + """ local_vars = program.current_block().vars inputs = [] for op in program.global_block().ops: if op.type == LOOKUP_TABLE_TYPE: if table_name == op.input("W")[0]: - inputs.extend( - [local_vars[name] for name in op.input("Ids")]) + inputs.extend([local_vars[name] for name in op.input("Ids")]) return inputs + def find_distributed_lookup_table_outputs(program, table_name): + """ + Find output variable of distribute lookup table in program. + We only support one distribute table now. + Args: + program(Program): given program, locate distributed lookup table + table_name(str): given table name that is found beforehand + Returns: + outputs + """ local_vars = program.current_block().vars outputs = [] for op in program.global_block().ops: if op.type == LOOKUP_TABLE_TYPE: if table_name == op.input("W")[0]: - outputs.extend( - [local_vars[name] for name in op.output("Out")]) + outputs.extend([local_vars[name] for name in op.output("Out")]) return outputs + def find_distributed_lookup_table(program): """ Find distribute lookup table in program. We only support one distribute table now. - :param program: - :return: table_name or None + Args: + program(Program): given program, locate distributed lookup table + Returns: + table_name or None """ table_name = None diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 9ef9e14ccc..87dfab92c5 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -20,6 +20,7 @@ from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_i from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_outputs from google.protobuf import text_format + class DownpourSGD(object): """ Distributed optimizer of downpour stochastic gradient descent @@ -35,17 +36,38 @@ class DownpourSGD(object): downpour_sgd = fluid.distributed.DownpourSGD(learning_rate=0.2) downpour_sgd.minimize(cost) """ + def __init__(self, learning_rate=0.001, window=1): # todo(guru4elephant): add more optimizers here as argument # todo(guru4elephant): make learning_rate as a variable self.learning_rate_ = learning_rate self.window_ = window self.type = "downpour" - - def minimize(self, loss, startup_program=None, - parameter_list=None, no_grad_set=None): - params_grads = sorted(append_backward( - loss, parameter_list, no_grad_set), key=lambda x:x[0].name) + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + """ + DownpounSGD is a distributed optimizer so + that user can call minimize to generate backward + operators and optimization operators within minmize function + Args: + loss(Variable): loss variable defined by user + startup_program(Program): startup program that defined by user + parameter_list(str list): parameter names defined by users + no_grad_set(set): a set of variables that is defined by users + so that these variables do not need gradient computation + Returns: + [ps_param, worker_skipped_ops] + ps_param: parameter server protobuf desc + worker_skipped_ops: operator names that need + to be skipped during execution + """ + params_grads = sorted( + append_backward(loss, parameter_list, no_grad_set), + key=lambda x: x[0].name) table_name = find_distributed_lookup_table(loss.block.program) prefetch_slots = find_distributed_lookup_table_inputs( loss.block.program, table_name) @@ -67,12 +89,12 @@ class DownpourSGD(object): grads.append(i[1]) server.add_sparse_table(sparse_table_index, self.learning_rate_, prefetch_slots, prefetch_slots_emb) - server.add_dense_table(dense_table_index, self.learning_rate_, - params, grads) + server.add_dense_table(dense_table_index, self.learning_rate_, params, + grads) worker.add_sparse_table(sparse_table_index, self.learning_rate_, prefetch_slots, prefetch_slots_emb) - worker.add_dense_table(dense_table_index, self.learning_rate_, - params, grads) + worker.add_dense_table(dense_table_index, self.learning_rate_, params, + grads) ps_param = pslib.PSParameter() ps_param.server_param.CopyFrom(server.get_desc()) ps_param.trainer_param.CopyFrom(worker.get_desc()) @@ -80,5 +102,4 @@ class DownpourSGD(object): # currently only support lookup_table worker_skipped_ops = ["lookup_table", "lookup_table_grad"] ps_param.trainer_param.skip_op.extend(worker_skipped_ops) - ps_param_str = text_format.MessageToString(ps_param) return [ps_param, worker_skipped_ops] From dc2ff42e20d72a449b200da0522b55a53b28091d Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Thu, 13 Dec 2018 23:14:10 +0800 Subject: [PATCH 314/719] add math in python examples. test=develop --- python/paddle/fluid/layers/nn.py | 40 +++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 07fc4ccc6b..4a557ce247 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2536,16 +2536,28 @@ def adaptive_pool2d(input, ValueError: 'pool_size' should be a list or tuple with length as 2. Examples: - .. code-block:: python + # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], + # output shape is [N, C, m, n], adaptive pool divide H and W dimentions + # of input data into m * n grids averagely and performs poolings in each + # grid to get output. + # adaptive average pool performs calculations as follow: + # + # for i in range(m): + # for j in range(n): + # hstart = floor(i * H / m) + # hend = ceil((i + 1) * H / m) + # wstart = floor(i * W / n) + # wend = ceil((i + 1) * W / n) + # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) + # data = fluid.layers.data( name='data', shape=[3, 32, 32], dtype='float32') pool_out = fluid.layers.adaptive_pool2d( input=data, pool_size=[3, 3], - pool_type='max', - require_index=False) + pool_type='avg') """ if pool_type not in ["max", "avg"]: raise ValueError( @@ -2632,16 +2644,32 @@ def adaptive_pool3d(input, ValueError: 'pool_size' should be a list or tuple with length as 2. Examples: - .. code-block:: python + # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n], + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimentions + # of input data into l * m * n grids averagely and performs poolings in each + # grid to get output. + # adaptive average pool performs calculations as follow: + # + # for i in range(l): + # for j in range(m): + # for k in range(n): + # dstart = floor(i * D / l) + # dend = ceil((i + 1) * D / l) + # hstart = floor(j * H / m) + # hend = ceil((j + 1) * H / m) + # wstart = floor(k * W / n) + # wend = ceil((k + 1) * W / n) + # output[:, :, i, j, k] = + # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) + # data = fluid.layers.data( name='data', shape=[3, 32, 32], dtype='float32') pool_out, mask = fluid.layers.adaptive_pool3d( input=data, pool_size=[3, 3], - pool_type='max', - require_index=True) + pool_type='avg') """ if pool_type not in ["max", "avg"]: raise ValueError( From c624417c6f5f1d61ab539aa9c88e95b929a19054 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 14 Dec 2018 11:27:14 +0800 Subject: [PATCH 315/719] change sparse mode to lazy mode --- paddle/fluid/operators/optimizers/adam_op.cc | 2 +- paddle/fluid/operators/optimizers/adam_op.h | 12 ++++++------ python/paddle/fluid/optimizer.py | 6 +++--- .../paddle/fluid/tests/unittests/test_adam_op.py | 16 ++++++++-------- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index b2c2e5c325..7993224327 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -111,7 +111,7 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { "Constant for numerical stability") .SetDefault(1.0e-8f); AddAttr( - "sparse_mode", + "lazy_mode", "(bool, default false) " "only update the parameter that has gradient in sparse update") .SetDefault(false); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 25e23c5f9d..5870557bb7 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -177,13 +177,13 @@ struct SparseAdamFunctor { const int64_t* rows_; int64_t row_numel_; int64_t row_count_; - bool sparse_mode_; + bool lazy_mode_; SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, T* mom2_out, const T* lr, const T* grad, const T* param, T* param_out, const int64_t* rows, - int64_t row_numel, int64_t row_count, bool sparse_mode) + int64_t row_numel, int64_t row_count, bool lazy_mode) : beta1_(beta1), beta2_(beta2), epsilon_(epsilon), @@ -200,7 +200,7 @@ struct SparseAdamFunctor { rows_(rows), row_numel_(row_numel), row_count_(row_count), - sparse_mode_(sparse_mode) {} + lazy_mode_(lazy_mode) {} inline HOSTDEVICE void adam_update(size_t i, T g) const { // The following code is the same as dense @@ -245,7 +245,7 @@ class AdamOpKernel : public framework::OpKernel { using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; - bool sparse_mode = ctx.Attr("sparse_mode"); + bool lazy_mode = ctx.Attr("lazy_mode"); T beta1 = static_cast(ctx.Attr("beta1")); T beta2 = static_cast(ctx.Attr("beta2")); T epsilon = static_cast(ctx.Attr("epsilon")); @@ -357,8 +357,8 @@ class AdamOpKernel : public framework::OpKernel { mom2_out.template mutable_data(ctx.GetPlace()), lr.template data(), grad_data, param.template data(), param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, - grad_merge.rows().size(), sparse_mode); - if (sparse_mode) { + grad_merge.rows().size(), lazy_mode); + if (lazy_mode) { size_t row_count = grad_merge.rows().size(); for (size_t row_index = 0; row_index < row_count; ++row_index) { for (size_t offset = 0; offset < row_numel; ++offset) { diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 9c7482bc40..c53bf4913a 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -664,7 +664,7 @@ class AdamOptimizer(Optimizer): epsilon=1e-8, regularization=None, name=None, - sparse_mode=False): + lazy_mode=False): assert learning_rate is not None assert beta1 is not None assert beta2 is not None @@ -677,7 +677,7 @@ class AdamOptimizer(Optimizer): self._beta1 = beta1 self._beta2 = beta2 self._epsilon = epsilon - self._sparse_mode = sparse_mode + self._lazy_mode = lazy_mode def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) @@ -732,7 +732,7 @@ class AdamOptimizer(Optimizer): "beta1": self._beta1, "beta2": self._beta2, "epsilon": self._epsilon, - "sparse_mode": self._sparse_mode + "lazy_mode": self._lazy_mode }) return adam_op diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index da91875a14..461196689c 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -195,7 +195,7 @@ def adam_step(inputs, attributes): def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad, - sparse_mode): + lazy_mode): ''' Simulate one step of the adam optimizer :param inputs: dict of inputs @@ -231,7 +231,7 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad, class TestSparseAdamOp(unittest.TestCase): - def setup(self, scope, place, sparse_mode): + def setup(self, scope, place, lazy_mode): beta1 = 0.78 beta2 = 0.836 epsilon = 1e-4 @@ -265,19 +265,19 @@ class TestSparseAdamOp(unittest.TestCase): param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs, height, rows, row_numel, - np_array, sparse_mode) + np_array, lazy_mode) self.outputs = { "ParamOut": param_out, "Moment1Out": mom1, "Moment2Out": mom2 } - def check_with_place(self, place, sparse_mode): + def check_with_place(self, place, lazy_mode): scope = core.Scope() - self.setup(scope, place, sparse_mode) + self.setup(scope, place, lazy_mode) op_args = dict() - op_args['sparse_mode'] = sparse_mode + op_args['lazy_mode'] = lazy_mode for key, np_array in self.dense_inputs.items(): var = scope.var(key).get_tensor() var.set(np_array, place) @@ -313,8 +313,8 @@ class TestSparseAdamOp(unittest.TestCase): if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: - for sparse_mode in (True, False): - self.check_with_place(place, sparse_mode) + for lazy_mode in (True, False): + self.check_with_place(place, lazy_mode) if __name__ == "__main__": From eb5d427d3940cd53500fc4003c66ad37ef1738db Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 14 Dec 2018 11:37:39 +0800 Subject: [PATCH 316/719] add comment for lazy_mode adam optimizer --- python/paddle/fluid/optimizer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index c53bf4913a..59c22d4e49 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -641,9 +641,14 @@ class AdamOptimizer(Optimizer): beta1 (float): The exponential decay rate for the 1st moment estimates. beta2 (float): The exponential decay rate for the 2nd moment estimates. epsilon (float): a small float value for numerical stability. - regularization: A Regularizer, such as - fluid.regularizer.L2DecayRegularizer. + regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. name: A optional name prefix. + lazy_mode(bool: false): The official Adam algorithm has two moving-average accumulators + the accumulators are updated at every step. Every element of the two moving-average is updated + in both dense mode and sparse mode. If the size of parameter is very large, then the update + may be very slow. The lazy mode only update the element that has gradient is the current + mini-batch, so it will be much more faster. But this mode has different semantics with the + original Adam algorithm and may lead to different result. Examples: .. code-block:: python From caa6b596775380b568ff934c24d4c641652e8fcc Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Fri, 14 Dec 2018 11:43:23 +0800 Subject: [PATCH 317/719] add hdfs_utils & helper & node doc --- .../paddle/fluid/contrib/utils/hdfs_utils.py | 163 +++++++++++++----- python/paddle/fluid/distributed/helper.py | 34 +++- python/paddle/fluid/distributed/node.py | 113 +++++++++--- 3 files changed, 238 insertions(+), 72 deletions(-) diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py index 42b4d7feab..baea57ccce 100644 --- a/python/paddle/fluid/contrib/utils/hdfs_utils.py +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -32,6 +32,28 @@ _logger.setLevel(logging.INFO) class HDFSClient(object): + """ + A tool of HDFS + + Args: + hadoop_home (string): hadoop_home + configs (dict): hadoop config, it is a dict, please contain \ + key "fs.default.name" and "hadoop.job.ugi" + Can be a float value + Examples: + hadoop_home = "/home/client/hadoop-client/hadoop/" + + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + + client.ls("/user/com/train-25") + files = client.lsr("/user/com/train-25/models") + """ + def __init__(self, hadoop_home, configs): self.pre_commands = [] hadoop_bin = '%s/bin/hadoop' % hadoop_home @@ -55,7 +77,10 @@ class HDFSClient(object): whole_commands = " ".join(whole_commands) for x in range(retry_times + 1): proc = subprocess.Popen( - whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + whole_commands, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) (output, errors) = proc.communicate() ret_code, ret_out, ret_err = proc.returncode, output, errors if ret_code: @@ -69,10 +94,12 @@ class HDFSClient(object): def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5): """ upload the local file to hdfs - args: - local_file_path: the local file path - remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) - return: + Args: + hdfs_path: hdfs path, target path + local_path: local file path, source path + overwrite: will overwrite the original file + retry_times: max times retry to upload + Returns: True or False """ assert hdfs_path is not None @@ -115,10 +142,12 @@ class HDFSClient(object): def download(self, hdfs_path, local_path, overwrite=False, unzip=False): """ download from hdfs - args: - local_file_path: the local file path - remote_file_path: remote dir on hdfs - return: + Args: + hdfs_path: hdfs path, target path + local_path: local file path, source path + overwrite: will remove original file and overwrite it. + unzip: ignore this param + Returns True or False """ _logger.info('Downloading %r to %r.', hdfs_path, local_path) @@ -160,11 +189,11 @@ class HDFSClient(object): def is_exist(self, hdfs_path=None): """ whether the remote hdfs path exists? - args: - remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) + Args: + hdfs_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) fs_name: The default values are the same as in the job configuration fs_ugi: The default values are the same as in the job configuration - return: + Returns: True or False """ exist_cmd = ['-test', '-e', hdfs_path] @@ -183,11 +212,11 @@ class HDFSClient(object): def is_dir(self, hdfs_path=None): """ whether the remote hdfs path exists? - args: + Args: remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) fs_name: The default values are the same as in the job configuration fs_ugi: The default values are the same as in the job configuration - return: + Returns: True or False """ @@ -207,15 +236,17 @@ class HDFSClient(object): return True def delete(self, hdfs_path): - """Remove a file or directory from HDFS. - - :param hdfs_path: HDFS path. - :param recursive: Recursively delete files and directories. By default, - this method will raise an :class:`HdfsError` if trying to delete a - non-empty directory. + """ + Remove a file or directory from HDFS. - This function returns `True` if the deletion was successful and `False` if - no file or directory previously existed at `hdfs_path`. + Args: + param hdfs_path: HDFS path. + param recursive: Recursively delete files and directories. By default, + this method will raise an :class:`HdfsError` if trying to delete a + non-empty directory. + Returns: + This function returns `True` if the deletion was successful and `False` if + no file or directory previously existed at `hdfs_path`. """ _logger.info('Deleting %r.', hdfs_path) @@ -241,14 +272,17 @@ class HDFSClient(object): return True def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False): - """Move a file or folder. - - :param hdfs_src_path: Source path. - :param hdfs_dst_path: Destination path. If the path already exists and is - a directory, the source will be moved into it. If the path exists and is - a file, or if a parent destination directory is missing, this method will - raise an :class:`HdfsError`. - + """ + Rename a file or folder. + Args: + :param hdfs_src_path: Source path. + :param hdfs_dst_path: Destination path. If the path already exists and is + a directory, the source will be moved into it. If the path exists and is + a file, or if a parent destination directory is missing, this method will + raise an :class:`HdfsError`. + Returns: + This function returns `True` if the rename was successful and `False` if + rename was faild. """ assert hdfs_src_path is not None assert hdfs_dst_path is not None @@ -274,6 +308,11 @@ class HDFSClient(object): @staticmethod def make_local_dirs(local_path): + """ + create a directiory local, is same to mkdir + Args: + local_path: local path that wants to create a directiory. + """ try: os.makedirs(local_path) except OSError as e: @@ -282,9 +321,11 @@ class HDFSClient(object): def makedirs(self, hdfs_path): """Create a remote directory, recursively if necessary. - - :param hdfs_path: Remote path. Intermediate directories will be created - appropriately. + Args: + :param hdfs_path: Remote path. Intermediate directories will be created + appropriately. + Returns: + True if make a directories was successful, False when make a directiries was failed. """ _logger.info('Creating directories to %r.', hdfs_path) assert hdfs_path is not None @@ -304,6 +345,13 @@ class HDFSClient(object): return True def ls(self, hdfs_path): + """ + ls a hdfs_path. + Args: + :param hdfs_path: hdfs_path will be ls. + Returns: + This function returns a `list` that contaion all files in the hdfs_path. + """ assert hdfs_path is not None if not self.is_exist(hdfs_path): @@ -329,6 +377,14 @@ class HDFSClient(object): return ret_lines def lsr(self, hdfs_path, only_file=True, sort=True): + """ + ls a hdfs_path sort by time. + Args: + :param hdfs_path: hdfs_path will be ls. + Returns: + This function returns a `list` that contaion all files sorted by time in the hdfs_path. + """ + def sort_by_time(v1, v2): v1_time = datetime.strptime(v1[1], '%Y-%m-%d %H:%M') v2_time = datetime.strptime(v2[1], '%Y-%m-%d %H:%M') @@ -372,12 +428,15 @@ def multi_upload(client, multi_processes=5, overwrite=False): """ - :param overwrite: will overwrite hdfs file or not - :param multi_processes: the upload data process at the same time, default=5 - :param client: instance of HDFSClient - :param hdfs_path: path on hdfs - :param local_path: path on local - :return: + Upload file to hdfs. + Args: + :param overwrite: will overwrite hdfs file or not + :param multi_processes: the upload data process at the same time, default=5 + :param client: instance of HDFSClient + :param hdfs_path: path on hdfs + :param local_path: path on local + Returns: + """ def __subprocess_upload(datas): @@ -387,6 +446,13 @@ def multi_upload(client, client.upload(hdfs_re_path, data, overwrite, retry_times=5) def get_local_files(path): + """ + Get all local files + Args: + path: local file path + Returns: + A list that contation all files in the path. + """ rlist = [] if not os.path.isdir(path): @@ -431,14 +497,17 @@ def multi_download(client, multi_processes=5): """ multi_download - :param client: instance of HDFSClient - :param hdfs_path: path on hdfs - :param local_path: path on local - :param trainer_id: current trainer id - :param trainers: all trainers number - :param file_cnt: all file number - :param multi_processes: the download data process at the same time, default=5 - :return: None + Args: + :param client: instance of HDFSClient + :param hdfs_path: path on hdfs + :param local_path: path on local + :param trainer_id: current trainer id + :param trainers: all trainers number + :param file_cnt: all file number + :param multi_processes: the download data process at the same time, default=5 + :return: None + Returns: + A list that be downloaded. """ def __subprocess_download(datas): diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 986525e5d8..ca6dd5dabf 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -15,13 +15,26 @@ from mpi4py import MPI import ps_pb2 as pslib + class FileSystem(object): - def __init__(self, fs_type="afs", + """ + A file system that support async_executor hadoop client desc. + + Args: + fs_type (string): fs_type, for example is "afs" + user (string): hadoop param + passwd (string): hadoop param + hadoop bin (string): hadoop param + Examples: + fs = FileSystm() + """ + + def __init__(self, + fs_type="afs", uri="afs://tianqi.afs.baidu.com:9902", user=None, passwd=None, - hadoop_bin="", - afs_conf=None): + hadoop_bin=""): assert user != None assert passwd != None assert hadoop_bin != None @@ -38,9 +51,22 @@ class FileSystem(object): #self.fs_client.afs_conf = afs_conf if not afs_conf else "" def get_desc(self): + """ + get hadoop desc. + """ return self.fs_client + class MPIHelper(object): + """ + MPIHelper is a wrapper of mpi4py, supprot get_rank get_size etc. + Args: + No params + Examples: + mh = MPIHelper() + mh.get_ip() + """ + def __init__(self): self.comm = MPI.COMM_WORLD @@ -61,5 +87,3 @@ class MPIHelper(object): def finalize(self): MPI.Finalize() - - diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index 8755323006..117da9cff8 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -13,17 +13,34 @@ import ps_pb2 as pslib + class Server(object): + """ + A Server basic class. + """ + def __init__(self): pass class Worker(object): + """ + A Worker basic class. + """ + def __init__(self): pass class DownpourServer(Server): + """ + DownpourServer class is used to generate server program_desc + Args: + server: it is pslib.ServerParameter() + Examples: + server = DownpourServer() + """ + def __init__(self): self.server_ = pslib.ServerParameter() self.server_.downpour_server_param.service_param.start_server_port = 0 @@ -33,8 +50,18 @@ class DownpourServer(Server): self.server_.downpour_server_param.service_param.start_server_port = 0 self.server_.downpour_server_param.service_param.server_thread_num = 12 - def add_sparse_table(self, table_id, learning_rate, - slot_key_vars, slot_value_var): + def add_sparse_table(self, table_id, learning_rate, slot_key_vars, + slot_value_var): + """ + Args: + table_id(int): id of sparse params table + learning_rate(float): the learning rate used to update parameters. \ + Can be a float value + slot_key_vars(string): slot key id + slot_value_var(string): slot key value after embedding + Returns: + return None + """ table = self.server_.downpour_server_param.downpour_table_param.add() table.table_id = table_id table.table_class = "DownpourSparseTable" @@ -44,10 +71,10 @@ class DownpourServer(Server): table.accessor.sparse_sgd_param.initial_g2sum = 3 table.accessor.sparse_sgd_param.initial_range = 1e-4 table.accessor.sparse_sgd_param.weight_bounds.extend([-10, 10]) - + table.accessor.embedx_dim = 8 table.accessor.embedx_threshold = 5 - table.accessor.fea_dim = 11 + table.accessor.fea_dim = 11 #table.accessor.fea_dim = abs(reduce(lambda x, y: x * y, # slot_value_var[0].shape, 1)) table.accessor.downpour_accessor_param.nonclk_coeff = 0.1 @@ -58,53 +85,99 @@ class DownpourServer(Server): table.accessor.downpour_accessor_param.show_click_decay_rate = 0.999 table.accessor.downpour_accessor_param.delete_threshold = 0.8 - def add_dense_table(self, table_id, learning_rate, - param_var, grad_var): + def add_dense_table(self, table_id, learning_rate, param_var, grad_var): + """ + Args: + table_id(int): id of sparse params table + learning_rate(float): the learning rate used to update parameters. \ + Can be a float value + param_var(list): all dense param. it is a list. + grad_var(list): all dense grad parm it is a list. + Returns: + return None + """ table = self.server_.downpour_server_param.downpour_table_param.add() table.table_id = table_id table.table_class = "DownpourDenseTable" table.type = pslib.PS_DENSE_TABLE table.accessor.accessor_class = "DownpourDenseValueAccessor" - table.accessor.dense_sgd_param.name = "adam" + table.accessor.dense_sgd_param.name = "adam" table.accessor.dense_sgd_param.adam.learning_rate = learning_rate - table.accessor.dense_sgd_param.adam.avg_decay_rate = 0.999993 - table.accessor.dense_sgd_param.adam.ada_decay_rate = 0.9999 + table.accessor.dense_sgd_param.adam.avg_decay_rate = 0.999993 + table.accessor.dense_sgd_param.adam.ada_decay_rate = 0.9999 table.accessor.dense_sgd_param.adam.ada_epsilon = 1e-8 table.accessor.dense_sgd_param.adam.mom_decay_rate = 0.99 table.accessor.dense_sgd_param.naive.learning_rate = 0.0002 fea_dim = 0 - for param in filter(lambda x: x.name.find("embedding") == -1, param_var): + for param in filter(lambda x: x.name.find("embedding") == -1, + param_var): fea_dim += reduce(lambda x, y: x * y, param.shape, 1) table.accessor.fea_dim = fea_dim def get_desc(self): + """ + Return downpour server program_desc + """ return self.server_ class DownpourWorker(Worker): + """ + DownpourWorker class is used to generate worker program_desc + Args: + window (int): push params frequency + worker: it is pslib.DownpourTrainerParameter + Examples: + worker = DownpourWorker(1) + """ + def __init__(self, window): self.window = window self.worker_ = pslib.DownpourTrainerParameter() #self.worker_.pull_dense_per_batch = window #self.worker_.push_dense_per_batch = window - def add_sparse_table(self, table_id, learning_rate, - slot_key_vars, slot_value_vars): + def add_sparse_table(self, table_id, learning_rate, slot_key_vars, + slot_value_vars): + """ + Args: + table_id(int): id of sparse params table + learning_rate(float): the learning rate used to update parameters. \ + Can be a float value + slot_key_vars(string): slot key id + slot_value_var(string): slot key value after embedding + Returns: + return None + """ table = self.worker_.sparse_table.add() table.table_id = table_id - table.slot_key.extend( - [var.name for var in slot_key_vars]) - table.slot_value.extend( - [var.name for var in slot_value_vars]) + table.slot_key.extend([var.name for var in slot_key_vars]) + table.slot_value.extend([var.name for var in slot_value_vars]) table.slot_gradient.extend( [var.name + "@GRAD" for var in slot_value_vars]) - def add_dense_table(self, table_id, learning_rate, - param_vars, grad_vars): + def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars): + """ + Args: + table_id(int): id of sparse params table + learning_rate(float): the learning rate used to update parameters. \ + Can be a float value + param_var(list): all dense param. it is a list. + grad_var(list): all dense grad parm it is a list. + Returns: + return None + """ table = self.worker_.dense_table.add() table.table_id = table_id - table.dense_variable_name.extend(filter(lambda x: x.find("embedding") == -1, [p.name for p in param_vars])) - table.dense_gradient_variable_name.extend(filter(lambda x: x.find("embedding") == -1, [g.name for g in grad_vars])) + table.dense_variable_name.extend( + filter(lambda x: x.find("embedding") == -1, + [p.name for p in param_vars])) + table.dense_gradient_variable_name.extend( + filter(lambda x: x.find("embedding") == -1, + [g.name for g in grad_vars])) def get_desc(self): + """ + Return downpour worker program_desc + """ return self.worker_ From b17444c84c45ae7f04863964099412a8ad9bf8d0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 14 Dec 2018 12:33:05 +0800 Subject: [PATCH 318/719] Fix merge bug test=develop --- paddle/fluid/operators/psroi_pool_op.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index 6978d9c5dc..78989582b7 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -129,9 +129,8 @@ class PSROIPoolOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -150,9 +149,8 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; From 35ce6ac2e6f1d71da55da74e49212fdbb2a61e79 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Fri, 14 Dec 2018 13:18:28 +0800 Subject: [PATCH 319/719] add ps_instance doc --- .../paddle/fluid/distributed/ps_instance.py | 58 ++++++++++++++++--- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index b93da053a3..6b44d0cd16 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -14,27 +14,36 @@ import helper as dist_helper import sys + class PaddlePSInstance(object): + """ + PaddlePSInstance class is used to generate A instance of server or worker + Args: + server_worker_mode: is a value 0 or 1, default is 1 + proc_per_node: process per node, default is 2 + Examples: + instance = PaddlePSInstance(1, 2) + """ + def __init__(self, server_worker_mode, proc_per_node): self.dh = dist_helper.MPIHelper() self._rankid = self.dh.get_rank() self._server_worker_mode = server_worker_mode self._proc_per_node = proc_per_node self._nodes = self.dh.get_size() - + self._ip = 0 self._worker_num = self._nodes * self._proc_per_node / 2 self._server_num = self._nodes * self._proc_per_node / 2 self._total_server_worker = self._worker_num + self._server_num - self._node_type = None #IDLE=-1, WORKER=1, SERVER=0 + self._node_type = None #IDLE=-1, WORKER=1, SERVER=0 self._set_nodetype() self._comm = None self._split_comm() - def _set_nodetype(self): if self._server_worker_mode == 0: - if self._rankid < self._server_num: + if self._rankid < self._server_num: self._node_type = 1 elif self._rankid < self._total_server_worker: self._node_type = 0 @@ -46,13 +55,13 @@ class PaddlePSInstance(object): self._node_type = 0 else: self._node_type = 1 - else: - self._node_type = -1; + else: + self._node_type = -1 else: self._node_type = -1 - + #if self._rankid == 0: - #print "node type: ", self._node_type + #print "node type: ", self._node_type def _split_comm(self): if self.is_server(): @@ -62,45 +71,78 @@ class PaddlePSInstance(object): pass def get_worker_index(self): + """ + Return worker index + """ if self._server_worker_mode == 0: return self._rankid == self.server_num else: return self._rankid / self._proc_per_node def get_server_index(self): + """ + Return server index + """ if self._server_worker_mode == 0: return self.rank_id else: return self.rank_id / self._proc_per_node def is_worker(self): + """ + Return instance is worker or not + """ return self._node_type == 1 def is_server(self): + """ + Return instance is server or not + """ return self._node_type == 0 def is_first_worker(self): + """ + Return instance is first worker or not + """ return self.is_worker() and 0 == self.get_worker_index() def set_ip(self, ip): + """ + set server ip + """ self._ip = ip def gather_ips(self): + """ + Return all servers and workers ip throught mpi allgather + """ self._ips = self.dh.comm.allgather(self._ip) return self._ips def get_node_cnt(self): + """ + Return node cnt + """ return self._nodes def barrier_all(self): + """ + barrier workers and servers + """ self.dh.comm.barrier() def barrier_worker(self): + """ + barrier workers + """ if self.is_worker(): self._comm.barrier() pass def finalize(self): + """ + MPI finalize + """ self.dh.finalize() pass From f16aa394f61cd759562ecfb5d3553700932d71b1 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 14 Dec 2018 12:14:37 +0800 Subject: [PATCH 320/719] remove use_cudnn in python API. test=develop --- paddle/fluid/API.spec | 4 ++-- python/paddle/fluid/layers/nn.py | 22 ---------------------- 2 files changed, 2 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 845abe7d5b..d67363003a 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -77,8 +77,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, False, None)) -paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, False, None)) +paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) +paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4a557ce247..28a119906b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2506,7 +2506,6 @@ def adaptive_pool2d(input, pool_size, pool_type="max", require_index=False, - use_cudnn=False, name=None): """ ${comment} @@ -2521,7 +2520,6 @@ def adaptive_pool2d(input, pool_type: ${pooling_type_comment} require_index (bool): If true, the index of max pooling point along with outputs. it cannot be set in average pooling type. - use_cudnn (bool, default False): adaptive pool currently not supported in cudnn. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2530,8 +2528,6 @@ def adaptive_pool2d(input, Raises: ValueError: 'pool_type' is not 'max' nor 'avg'. - ValueError: 'use_cudnn' is not a bool value. - ValueError: adaptive pool currently not supported in cudnn. ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. ValueError: 'pool_size' should be a list or tuple with length as 2. @@ -2575,12 +2571,6 @@ def adaptive_pool2d(input, raise ValueError( "'pool_size' should be a list or tuple with length as 2.") - if not isinstance(use_cudnn, bool): - raise ValueError("use_cudnn should be True or False.") - - if use_cudnn: - raise ValueError("adaptive pool currently not supported in cudnn.") - if pool_type == "max": l_type = 'max_pool2d_with_index' else: @@ -2602,7 +2592,6 @@ def adaptive_pool2d(input, attrs={ "pooling_type": pool_type, "ksize": pool_size, - "use_cudnn": use_cudnn, "adaptive": True, }) @@ -2614,7 +2603,6 @@ def adaptive_pool3d(input, pool_size, pool_type="max", require_index=False, - use_cudnn=False, name=None): """ ${comment} @@ -2629,7 +2617,6 @@ def adaptive_pool3d(input, pool_type: ${pooling_type_comment} require_index (bool): If true, the index of max pooling point along with outputs. it cannot be set in average pooling type. - use_cudnn (bool, default False): adaptive pool currently not supported in cudnn. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2638,8 +2625,6 @@ def adaptive_pool3d(input, Raises: ValueError: 'pool_type' is not 'max' nor 'avg'. - ValueError: 'use_cudnn' is not a bool value. - ValueError: adaptive pool currently not supported in cudnn. ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. ValueError: 'pool_size' should be a list or tuple with length as 2. @@ -2687,12 +2672,6 @@ def adaptive_pool3d(input, raise ValueError( "'pool_size' should be a list or tuple with length as 3.") - if not isinstance(use_cudnn, bool): - raise ValueError("use_cudnn should be True or False.") - - if use_cudnn: - raise ValueError("adaptive pool currently not supported in cudnn.") - if pool_type == "max": l_type = 'max_pool3d_with_index' else: @@ -2714,7 +2693,6 @@ def adaptive_pool3d(input, attrs={ "pooling_type": pool_type, "ksize": pool_size, - "use_cudnn": use_cudnn, "adaptive": True, }) From da796dfe05eb48c3f5541a43d8fb193153333ca0 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 14 Dec 2018 13:29:02 +0800 Subject: [PATCH 321/719] Remove BinarySearch from Adam Op test=develop --- .../operators/math/selected_rows_functor.cc | 4 + .../operators/math/selected_rows_functor.h | 7 ++ paddle/fluid/operators/optimizers/adam_op.h | 119 +++++++++++++++--- 3 files changed, 114 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 3eba268cfa..0c2e6d4024 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include @@ -301,6 +302,9 @@ struct MergeAdd { } std::vector merge_rows(merged_row_set.begin(), merged_row_set.end()); + if (sorted_result_) { + std::sort(merge_rows); + } std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { rows_to_id[merge_rows[i]] = i; diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index 6d146d39d6..b7b19f130e 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -78,6 +78,10 @@ namespace scatter { // functors for manuplating SelectedRows data template struct MergeAdd { + MergeAdd() : sorted_result_(false) {} + + explicit MergeAdd(bool sorted_result) : sorted_result_(sorted_result) {} + // unary functor, merge by adding duplicated rows in // the input SelectedRows object. framework::SelectedRows operator()(const DeviceContext& context, @@ -88,6 +92,9 @@ struct MergeAdd { void operator()(const DeviceContext& context, const std::vector& inputs, framework::SelectedRows* output); + + private: + bool sorted_result_; }; enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 3455d1ee54..c2bf7040d7 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -158,7 +158,7 @@ struct AdamFunctor { }; template -struct SparseAdamFunctor { +struct SparseAdamFunctor { T beta1_; T beta2_; T epsilon_; @@ -227,6 +227,78 @@ struct SparseAdamFunctor { } }; +template +struct SparseAdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + const int64_t* rows_; + int64_t row_numel_; + int64_t row_count_; + + SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, + const T* mom2, T* mom2_out, const T* lr, const T* grad, + const T* param, T* param_out, const int64_t* rows, + int64_t row_numel, int64_t row_count) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count) {} + + inline void operator()(size_t numel) const { + // lr could be reuse + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + for (size_t i = 0U, j = 0U; i != numel; ++i) { + T mom1 = moment1_[i]; + T mom2 = moment2_[i]; + T p = param_[i]; + + // Calculation + if (i == *(rows_ + j)) { + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + ++j; + } else { + mom1 = beta1_ * mom1; + mom2 = beta2_ * mom2; + } + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = p; + } + } +}; + template class AdamOpKernel : public framework::OpKernel { public: @@ -316,7 +388,7 @@ class AdamOpKernel : public framework::OpKernel { } else { // merge duplicated rows if any. // The rows of grad_merge have been sorted inside MergeAdd functor - scatter::MergeAdd merge_func; + scatter::MergeAdd merge_func(true); auto* grad_merge_var = const_cast(ctx.scope()) .Var() ->GetMutable(); @@ -337,25 +409,40 @@ class AdamOpKernel : public framework::OpKernel { } else { #endif rows = grad_merge.rows().data(); - #if defined(PADDLE_WITH_CUDA) } #endif auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); - SparseAdamFunctor functor( - beta1, beta2, epsilon, beta1_pow.template data(), - beta2_pow.template data(), mom1.template data(), - mom1_out.template mutable_data(ctx.GetPlace()), - mom2.template data(), - mom2_out.template mutable_data(ctx.GetPlace()), - lr.template data(), grad_data, param.template data(), - param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, - grad_merge.rows().size()); - platform::ForRange for_range( - static_cast(ctx.device_context()), - param.numel()); - for_range(functor); + if (platform::is_cpu_place(ctx.GetPlace())) { + SparseAdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad_data, param.template data(), + param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, + grad_merge.rows().size()); + + functor(param.numel()); + } else if (platform::is_gpu_place(ctx.GetPlace())) { + SparseAdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad_data, param.template data(), + param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, + grad_merge.rows().size()); + + // FIXME(minqiyang): remove BinarySearch in GPU later + platform::ForRange for_range( + static_cast(ctx.device_context()), + param.numel()); + for_range(functor); + } } else { PADDLE_THROW("Variable type not supported by adam_op"); } From bafd823666bde1098cf07eb23d406bc9780c7b28 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Fri, 14 Dec 2018 13:18:28 +0800 Subject: [PATCH 322/719] test --- .../paddle/fluid/distributed/ps_instance.py | 58 ++++++++++++++++--- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index b93da053a3..6b44d0cd16 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -14,27 +14,36 @@ import helper as dist_helper import sys + class PaddlePSInstance(object): + """ + PaddlePSInstance class is used to generate A instance of server or worker + Args: + server_worker_mode: is a value 0 or 1, default is 1 + proc_per_node: process per node, default is 2 + Examples: + instance = PaddlePSInstance(1, 2) + """ + def __init__(self, server_worker_mode, proc_per_node): self.dh = dist_helper.MPIHelper() self._rankid = self.dh.get_rank() self._server_worker_mode = server_worker_mode self._proc_per_node = proc_per_node self._nodes = self.dh.get_size() - + self._ip = 0 self._worker_num = self._nodes * self._proc_per_node / 2 self._server_num = self._nodes * self._proc_per_node / 2 self._total_server_worker = self._worker_num + self._server_num - self._node_type = None #IDLE=-1, WORKER=1, SERVER=0 + self._node_type = None #IDLE=-1, WORKER=1, SERVER=0 self._set_nodetype() self._comm = None self._split_comm() - def _set_nodetype(self): if self._server_worker_mode == 0: - if self._rankid < self._server_num: + if self._rankid < self._server_num: self._node_type = 1 elif self._rankid < self._total_server_worker: self._node_type = 0 @@ -46,13 +55,13 @@ class PaddlePSInstance(object): self._node_type = 0 else: self._node_type = 1 - else: - self._node_type = -1; + else: + self._node_type = -1 else: self._node_type = -1 - + #if self._rankid == 0: - #print "node type: ", self._node_type + #print "node type: ", self._node_type def _split_comm(self): if self.is_server(): @@ -62,45 +71,78 @@ class PaddlePSInstance(object): pass def get_worker_index(self): + """ + Return worker index + """ if self._server_worker_mode == 0: return self._rankid == self.server_num else: return self._rankid / self._proc_per_node def get_server_index(self): + """ + Return server index + """ if self._server_worker_mode == 0: return self.rank_id else: return self.rank_id / self._proc_per_node def is_worker(self): + """ + Return instance is worker or not + """ return self._node_type == 1 def is_server(self): + """ + Return instance is server or not + """ return self._node_type == 0 def is_first_worker(self): + """ + Return instance is first worker or not + """ return self.is_worker() and 0 == self.get_worker_index() def set_ip(self, ip): + """ + set server ip + """ self._ip = ip def gather_ips(self): + """ + Return all servers and workers ip throught mpi allgather + """ self._ips = self.dh.comm.allgather(self._ip) return self._ips def get_node_cnt(self): + """ + Return node cnt + """ return self._nodes def barrier_all(self): + """ + barrier workers and servers + """ self.dh.comm.barrier() def barrier_worker(self): + """ + barrier workers + """ if self.is_worker(): self._comm.barrier() pass def finalize(self): + """ + MPI finalize + """ self.dh.finalize() pass From bd1c1724aabc2b1d2d30ae6ac159df297b6c7f54 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Fri, 14 Dec 2018 13:18:28 +0800 Subject: [PATCH 323/719] add ps_instance doc --- paddle/fluid/framework/CMakeLists.txt | 15 +- paddle/fluid/framework/async_executor.cc | 114 ++++++------ .../fluid/framework/executor_thread_worker.cc | 165 +++++++++--------- .../fluid/framework/executor_thread_worker.h | 124 ++++++------- 4 files changed, 199 insertions(+), 219 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index f3d66cd883..ab237f768a 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,6 +1,6 @@ -# windows treat symbolic file as a real file, which is different with unix -# We create a hidden file and compile it instead of origin source file. +#windows treat symbolic file as a real file, which is different with unix +#We create a hidden file and compile it instead of origin source file. function(windows_symbolic TARGET) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) @@ -11,7 +11,7 @@ function(windows_symbolic TARGET) message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") endif() - # only copy the xx.cu to .xx.cu when the content are modified +#only copy the xx.cu to.xx.cu when the content are modified set(copy_flag 1) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) @@ -32,7 +32,7 @@ endfunction() add_subdirectory(ir) add_subdirectory(details) -# ddim lib +#ddim lib proto_library(framework_proto SRCS framework.proto) proto_library(async_executor_proto SRCS data_feed.proto) @@ -89,8 +89,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu if(WITH_GPU) if (WIN32) - # windows treat symbolic file as a real file, which is different with unix - # We create a hidden file and compile it instead of origin source file. +#windows treat symbolic file as a real file, which is different with unix +#We create a hidden file and compile it instead of origin source file. windows_symbolic(hidden_file SRCS data_type_transform.cu) nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor) add_dependencies(data_type_transform hidden_file) @@ -137,7 +137,8 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) -# Generate an empty __init__.py to make framework_py_proto as a valid python module. +#Generate an empty \ + __init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) if (NOT WIN32) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 0fe7f3bd5c..e2756cafa2 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -30,7 +30,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" #ifdef PADDLE_WITH_PSLIB -#include "pslib.h" +#include #endif namespace paddle { @@ -70,50 +70,52 @@ void PrepareReaders(std::vector>& readers, // NOLINT #ifdef PADDLE_WITH_PSLIB void AsyncExecutor::InitServer(const std::string& dist_desc, int index) { - _pslib_ptr = - std::shared_ptr( - new paddle::distributed::PSlib()); - _pslib_ptr->init_server(dist_desc, index); - InitParamConfig(); + _pslib_ptr = std::shared_ptr( + new paddle::distributed::PSlib()); + _pslib_ptr->init_server(dist_desc, index); + InitParamConfig(); } void AsyncExecutor::InitWorker(const std::string& dist_desc, const std::vector& host_sign_list, int node_num, int index) { - _pslib_ptr = std::shared_ptr( - new paddle::distributed::PSlib()); - _pslib_ptr->init_worker( - dist_desc, (uint64_t*)(host_sign_list.data()), node_num, index); + _pslib_ptr = std::shared_ptr( + new paddle::distributed::PSlib()); + _pslib_ptr->init_worker(dist_desc, + static_cast(host_sign_list.data()), + node_num, index); - InitParamConfig(); + InitParamConfig(); } -uint64_t AsyncExecutor::StartServer() { - return _pslib_ptr->run_server(); -} +uint64_t AsyncExecutor::StartServer() { return _pslib_ptr->run_server(); } -void AsyncExecutor::StopServer() { - _pslib_ptr->stop_server(); -} +void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); } -void AsyncExecutor::GatherServers( - const std::vector& host_sign_list, int node_num) { - _pslib_ptr->gather_servers((uint64_t*)(host_sign_list.data()), node_num); +void AsyncExecutor::GatherServers(const std::vector& host_sign_list, + int node_num) { + _pslib_ptr->gather_servers(static_cast(host_sign_list.data()), + node_num); } void AsyncExecutor::InitParamConfig() { - for (int i = 0; i < - _pslib_ptr->get_param()->server_param(). \ - downpour_server_param(). \ - downpour_table_param_size(); + for (int i = 0; i < _pslib_ptr->get_param() + ->server_param() + .downpour_server_param() + .downpour_table_param_size(); ++i) { - if (_pslib_ptr->get_param()->server_param(). \ - downpour_server_param().downpour_table_param(i). \ - table_class().find("SparseTable") != -1) { - _param_config.fea_dim = _pslib_ptr->get_param()->server_param(). \ - downpour_server_param(). \ - downpour_table_param(i). \ - accessor().fea_dim(); + if (_pslib_ptr->get_param() + ->server_param() + .downpour_server_param() + .downpour_table_param(i) + .table_class() + .find("SparseTable") != -1) { + _param_config.fea_dim = _pslib_ptr->get_param() + ->server_param() + .downpour_server_param() + .downpour_table_param(i) + .accessor() + .fea_dim(); break; } } @@ -122,28 +124,24 @@ void AsyncExecutor::InitParamConfig() { _pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); _param_config.tmp_push_sparse_wait_times = static_cast( _pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); - - for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); + + for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); ++t) { _param_config.skip_op.push_back( _pslib_ptr->get_param()->trainer_param().skip_op(t)); } - + for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); - ++t) { + t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) { auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); std::vector tmp_sparse_variable_name; for (int i = 0u; i < table.slot_value_size(); ++i) { tmp_sparse_variable_name.push_back(table.slot_value(i)); - _param_config.slot_alias_to_table[table.slot_key(i)] = - table.table_id(); + _param_config.slot_alias_to_table[table.slot_key(i)] = table.table_id(); } std::vector tmp_sparse_gradient_variable_name; for (auto i = 0u; i < table.slot_gradient_size(); ++i) { - tmp_sparse_gradient_variable_name.push_back( - table.slot_gradient(i)); + tmp_sparse_gradient_variable_name.push_back(table.slot_gradient(i)); } _param_config.slot_input_vec[table.table_id()] = std::move(tmp_sparse_variable_name); @@ -151,10 +149,9 @@ void AsyncExecutor::InitParamConfig() { std::move(tmp_sparse_gradient_variable_name); _param_config.sparse_table_id.push_back(table.table_id()); } - + for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); - ++t) { + t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); ++t) { auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t); std::vector tmp_dense_variable_name; for (int i = 0u; i < table.dense_variable_name_size(); ++i) { @@ -181,26 +178,25 @@ void AsyncExecutor::InitModel() { Variable* var = root_scope_->FindVar(t); CHECK(var != nullptr) << "var[" << t << "] not found"; LoDTensor* tensor = var->GetMutable(); - + float* g = tensor->data(); CHECK(g != nullptr) << "var[" << t << "] value not initialized"; float init_range = 0.2; int rown = tensor->dims()[0]; init_range /= sqrt(rown); - + std::normal_distribution ndistr(0.0, 1.0); for (auto i = 0u; i < tensor->numel(); ++i) { g[i] = ndistr(local_random_engine()) * init_range; } - + paddle::ps::Region reg(g, tensor->numel()); regions.emplace_back(std::move(reg)); } - - auto push_status = - _pslib_ptr->_worker_ptr->push_dense_param( - regions.data(), regions.size(), table_id); + + auto push_status = _pslib_ptr->_worker_ptr->push_dense_param( + regions.data(), regions.size(), table_id); push_status.wait(); auto status = push_status.get(); if (status != 0) { @@ -225,14 +221,14 @@ void AsyncExecutor::SaveModel(const std::string& path) { void AsyncExecutor::PrepareDenseThread(const std::string& mode) { if (mode == "mpi") { DensePullThreadParam param; - param.ps_client = _pslib_ptr->_worker_ptr;; + param.ps_client = _pslib_ptr->_worker_ptr; param.threshold = 1; param.training_thread_num = actual_thread_num; param.root_scope = root_scope_; param.dense_params = &_param_config.dense_variable_name; - - _pull_dense_thread = std::shared_ptr( - new DensePullThread(param)); + + _pull_dense_thread = + std::shared_ptr(new DensePullThread(param)); _pull_dense_thread->start(); } } @@ -243,8 +239,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, const std::vector& filelist, const int thread_num, const std::vector& fetch_var_names, - const std::string& mode, - const bool debug) { + const std::string& mode, const bool debug) { std::vector threads; auto& block = main_program.Block(0); @@ -293,9 +288,9 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, for (auto& worker : workers) { #ifdef PADDLE_WITH_PSLIB if (mode == "mpi") { - worker.reset(new AsyncExecutorThreadWorker); + worker.reset(new AsyncExecutorThreadWorker); } else { - worker.reset(new ExecutorThreadWorker); + worker.reset(new ExecutorThreadWorker); } #else worker.reset(new ExecutorThreadWorker); @@ -308,7 +303,6 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, fetch_var_names, root_scope_, thidx, debug); } - // start executing ops in multiple threads for (int thidx = 0; thidx < actual_thread_num; ++thidx) { threads.push_back( diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 59679842bc..a945562926 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/executor_thread_worker.h" +#include #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/message.h" #include "google/protobuf/text_format.h" @@ -51,7 +52,7 @@ void DensePullThread::run() { if (_pull_dense_status.size() != 0) { wait_all(); } - + usleep(_sleep_time_ms * 1000); } } @@ -77,12 +78,12 @@ std::future DensePullThread::pull_dense(uint64_t table_id) { regions.clear(); auto& variables = _dense_variable_name[table_id]; regions.resize(variables.size()); - + for (auto i = 0u; i < variables.size(); ++i) { auto& t = variables[i]; Variable* var = _root_scope->FindVar(t); LoDTensor* tensor = var->GetMutable(); - + float* w = tensor->data(); paddle::ps::Region reg(w, tensor->numel()); regions[i] = std::move(reg); @@ -95,21 +96,20 @@ void DensePullThread::wait_all() { t.wait(); auto status = t.get(); if (status != 0) { - LOG(WARNING) << "pull dense failed times:" << - ++_pull_dense_fail_times; + LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times; } } - + if (_pull_dense_fail_times > 20) { LOG(FATAL) << "pull dense failed times more than 20 times"; exit(-1); } - + _pull_dense_status.resize(0); } -void DensePullThread::increase_thread_version( - int thread_id, uint64_t table_id) { +void DensePullThread::increase_thread_version(int thread_id, + uint64_t table_id) { std::lock_guard lock(_mutex_for_version); _training_versions[table_id][thread_id]++; } @@ -174,7 +174,6 @@ void ExecutorThreadWorker::SetFetchVarNames( fetch_var_names.end()); } - void ExecutorThreadWorker::SetDevice() { #if defined _WIN32 || defined __APPLE__ return; @@ -344,15 +343,14 @@ void AsyncExecutorThreadWorker::SetPullDenseThread( } void AsyncExecutorThreadWorker::TrainOneNetwork() { PrepareParams(); - + for (auto& op : ops_) { if (op->Type().find("sgd") != std::string::npos) { continue; } bool need_skip = false; for (auto t = 0u; t < _param_config->skip_op.size(); ++t) { - if (op->Type().find(_param_config->skip_op[t]) != - std::string::npos) { + if (op->Type().find(_param_config->skip_op[t]) != std::string::npos) { need_skip = true; break; } @@ -436,14 +434,13 @@ void AsyncExecutorThreadWorker::PushDense(int table_id) { paddle::ps::Region reg(g, count); regions.emplace_back(std::move(reg)); } - - auto status = _pslib_ptr->_worker_ptr->push_dense( - regions.data(), regions.size(), table_id); + + auto status = _pslib_ptr->_worker_ptr->push_dense(regions.data(), + regions.size(), table_id); _push_dense_status.push_back(std::move(status)); } void AsyncExecutorThreadWorker::PullSparse(int table_id) { - auto& features = _features[table_id]; auto& feature_value = _feature_value[table_id]; auto fea_dim = _param_config->fea_dim; @@ -451,8 +448,7 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) { features.clear(); features.resize(0); features.reserve(MAX_FEASIGN_NUM); - const std::vector& feed_vec = - thread_reader_->GetUseSlotAlias(); + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); // slot_idx = 0 is label TODO for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); @@ -468,20 +464,20 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) { features.push_back(static_cast(ids[i])); } } - check_pull_push_memory(features, feature_value, fea_dim); - + check_pull_push_memory(features, &feature_value, fea_dim); + std::vector pull_feature_value; for (auto i = 0u; i < features.size(); ++i) { pull_feature_value.push_back(feature_value[i].data()); } - + auto status = _pslib_ptr->_worker_ptr->pull_sparse( pull_feature_value.data(), table_id, features.data(), features.size()); _pull_sparse_status.push_back(std::move(status)); - + auto& push_g = _feature_push_value[table_id]; - check_pull_push_memory(features, push_g, fea_dim); - + check_pull_push_memory(features, &push_g, fea_dim); + collect_feasign_info(table_id); } @@ -490,15 +486,14 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) { auto fea_dim = _param_config->fea_dim; auto& features = _features[table_id]; auto& fea_value = _feature_value[table_id]; - + CHECK(features.size() > 0) << "feature size check failed"; - + auto fea_idx = 0u; - + std::vector init_value(fea_dim); - - const std::vector& feed_vec = - thread_reader_->GetUseSlotAlias(); + + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); // slot_idx = 0 is label TODO for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); @@ -508,22 +503,22 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) { Variable* var_emb = thread_scope_->FindVar( _param_config->slot_input_vec[table_id][slot_idx - 1]); LoDTensor* tensor_emb = var_emb->GetMutable(); - float* ptr = tensor_emb->mutable_data( - {len, slot_dim}, platform::CPUPlace()); + float* ptr = + tensor_emb->mutable_data({len, slot_dim}, platform::CPUPlace()); memset(ptr, 0, sizeof(float) * len * slot_dim); auto& tensor_lod = tensor->lod()[0]; - + LoD data_lod{tensor_lod}; tensor_emb->set_lod(data_lod); - + for (auto index = 0u; index < len; ++index) { if (ids[index] == 0u) { - memcpy(ptr + slot_dim * index, - init_value.data() + 2, sizeof(float) * slot_dim); + memcpy(ptr + slot_dim * index, init_value.data() + 2, + sizeof(float) * slot_dim); continue; } - memcpy(ptr + slot_dim * index, - fea_value[fea_idx].data() + 2, sizeof(float) * slot_dim); + memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2, + sizeof(float) * slot_dim); fea_idx++; } } @@ -534,35 +529,38 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { auto fea_dim = _param_config->fea_dim; auto& features = _features[table_id]; auto& push_g = _feature_push_value[table_id]; - check_pull_push_memory(features, push_g, fea_dim); - CHECK(push_g.size() == features.size() + 1) << - "push_g size:" << push_g.size() << " features size:" << features.size(); + check_pull_push_memory(features, &push_g, fea_dim); + CHECK(push_g.size() == features.size() + 1) + << "push_g size:" << push_g.size() + << " features size:" << features.size(); uint64_t fea_idx = 0u; auto& fea_info = _fea_info[table_id]; int offset = 2; const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); - // slot_idx = 0 is label + // slot_idx = 0 is label for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { - if (_param_config->slot_alias_to_table.find( - feed_vec[slot_idx]) == _param_config->slot_alias_to_table.end()) { - LOG(ERROR) << "ERROR slot_idx:" << slot_idx << - " name:" << feed_vec[slot_idx]; - } else if ( - _param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) { + if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) == + _param_config->slot_alias_to_table.end()) { + LOG(ERROR) << "ERROR slot_idx:" << slot_idx + << " name:" << feed_vec[slot_idx]; + } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != + table_id) { continue; } Variable* g_var = thread_scope_->FindVar( _param_config->gradient_var[table_id][slot_idx - 1]); - CHECK(g_var != nullptr) << "var[" << - _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; + CHECK(g_var != nullptr) + << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] + << "] not found"; LoDTensor* g_tensor = g_var->GetMutable(); if (g_tensor == NULL) { - LOG(ERROR) << "var[" << - _param_config->gradient_var[table_id][slot_idx - 1] << "] not found"; + LOG(ERROR) << "var[" + << _param_config->gradient_var[table_id][slot_idx - 1] + << "] not found"; exit(-1); } float* g = g_tensor->data(); - + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found"; LoDTensor* tensor = var->GetMutable(); @@ -571,42 +569,40 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { exit(-1); } int len = tensor->numel(); - CHECK(slot_dim * len == g_tensor->numel()) << - "len:" << len << " g_numel:" << g_tensor->numel(); - CHECK(len == tensor->numel()) << "len:" << - len << "t_numel:" << tensor->numel(); + CHECK(slot_dim * len == g_tensor->numel()) + << "len:" << len << " g_numel:" << g_tensor->numel(); + CHECK(len == tensor->numel()) << "len:" << len + << "t_numel:" << tensor->numel(); int64_t* ids = tensor->data(); for (auto id_idx = 0u; id_idx < len; ++id_idx) { if (ids[id_idx] == 0) { g += slot_dim; continue; } - memcpy(push_g[fea_idx].data() + offset, - g, sizeof(float) * slot_dim); + memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim); push_g[fea_idx][0] = 1.0f; - CHECK(fea_idx < fea_info.size()) << "fea_idx:" << - fea_idx << " size:" << fea_info.size(); + CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx + << " size:" << fea_info.size(); push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); g += slot_dim; fea_idx++; } } - CHECK(fea_idx == features.size()) << "fea_idx:" << - fea_idx << " features size:" << features.size(); + CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx + << " features size:" << features.size(); CHECK_GT(features.size(), 0); - + std::vector push_g_vec; for (auto i = 0u; i < features.size(); ++i) { push_g_vec.push_back(push_g[i].data()); } auto status = _pslib_ptr->_worker_ptr->push_sparse( - table_id, features.data(), - (const float**)push_g_vec.data(), features.size()); + table_id, features.data(), (const float**)push_g_vec.data(), + features.size()); _push_sparse_status.push_back(std::move(status)); } -void AsyncExecutorThreadWorker::collect_feasign_info( - int table_id) { +void AsyncExecutorThreadWorker::collect_feasign_info(int table_id) { auto& fea_info = _fea_info[table_id]; auto& feature = _features[table_id]; fea_info.resize(feature.size()); @@ -614,13 +610,13 @@ void AsyncExecutorThreadWorker::collect_feasign_info( Variable* var = thread_scope_->FindVar(feed_vec[0]); LoDTensor* tensor = var->GetMutable(); int64_t* label = tensor->data(); - + int global_index = 0; for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); LoDTensor* tensor = var->GetMutable(); int64_t* ids = tensor->data(); - + int fea_idx = 0; for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) { for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) { @@ -628,36 +624,33 @@ void AsyncExecutorThreadWorker::collect_feasign_info( continue; } FeasignInfo info{slot_idx, ins_idx, label[ins_idx - 1]}; - + fea_info[global_index++] = std::move(info); } } } - CHECK(global_index == feature.size()) << - "expect fea info size:" << feature.size() - << " real:" << global_index; + CHECK(global_index == feature.size()) + << "expect fea info size:" << feature.size() << " real:" << global_index; } void AsyncExecutorThreadWorker::check_pull_push_memory( - const std::vector& features, - std::vector>& push_g, - int dim) { - push_g.resize(features.size() + 1); - for (auto& t : push_g) { + const std::vector& features, + std::vector>* push_g, int dim) { + push_g->resize(features.size() + 1); + for (auto& t : *push_g) { t.resize(dim); } } void AsyncExecutorThreadWorker::check_pull_push_memory( - const std::vector& features, - std::vector& push_g, + const std::vector& features, std::vector* push_g, int dim) { - if (features.size() > push_g.size()) { - push_g.reserve(features.size() + 1); - auto size = features.size() - push_g.size() + 1; + if (features.size() > push_g->size()) { + push_g->reserve(features.size() + 1); + auto size = features.size() - push_g->size() + 1; for (auto i = 0u; i < size; ++i) { float* ptr = new float[dim]; - push_g.push_back(ptr); + push_g->push_back(ptr); } } } diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 20410b4c06..30b81ad880 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -26,7 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #ifdef PADDLE_WITH_PSLIB -#include "pslib.h" +#include #endif namespace paddle { @@ -34,75 +34,74 @@ namespace framework { void CreateTensor(Variable* var, proto::VarType::Type var_type); #ifdef PADDLE_WITH_PSLIB -const static uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; +static const uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; struct AsyncWorkerParamConfig { int slot_dim; int fea_dim; int32_t tmp_push_dense_wait_times; int32_t tmp_push_sparse_wait_times; - + std::vector skip_op; - + std::map> dense_variable_name; std::map> dense_gradient_variable_name; - std::vector dense_table_id; + std::vector dense_table_id; // fea_dim for each dense table - std::vector dense_table_size; - std::vector sparse_table_id; + std::vector dense_table_size; + std::vector sparse_table_id; std::map> slot_input_vec; std::map> gradient_var; std::map slot_alias_to_table; }; struct DensePullThreadParam { - std::shared_ptr ps_client; - int threshold; - int training_thread_num; - Scope* root_scope; - std::map>* dense_params; - int sleep_time_ms = 2; + std::shared_ptr ps_client; + int threshold; + int training_thread_num; + Scope* root_scope; + std::map>* dense_params; + int sleep_time_ms = 2; }; class DensePullThread { public: - explicit DensePullThread(const DensePullThreadParam& param) : - _running(false) { + explicit DensePullThread(const DensePullThreadParam& param) + : _running(false) { _ps_client = param.ps_client; _threshold = param.threshold; _thread_num = param.training_thread_num; _root_scope = param.root_scope; _sleep_time_ms = param.sleep_time_ms; - + for (auto& t : *param.dense_params) { - _dense_variable_name[t.first].insert( - _dense_variable_name[t.first].end(), - t.second.begin(), t.second.end()); + _dense_variable_name[t.first].insert(_dense_variable_name[t.first].end(), + t.second.begin(), t.second.end()); _training_versions[t.first].resize(_thread_num, 0); _last_versions[t.first] = 0; _current_version[t.first] = 0; } } - + int start(); - + void stop() { if (_running) { _running = false; _t.join(); } } - + void increase_thread_version(int thread_id, uint64_t table_id); void reset_thread_version(uint64_t table_id); std::future pull_dense(uint64_t table_id); void pull_dense2(uint64_t table_id); void wait_all(); - + private: void run(); bool check_update_param(uint64_t table_id); - + private: std::shared_ptr _ps_client; int _thread_num; @@ -113,33 +112,33 @@ class DensePullThread { std::map _last_versions; std::map _current_version; - std::mutex _mutex_for_version; + std::mutex _mutex_for_version; std::map> _training_versions; std::map> _dense_variable_name; - + std::thread _t; - + std::vector<::std::future> _pull_dense_status; - + std::map> _regions; - uint32_t _pull_dense_fail_times = 0; - - std::vector _base_norm_param; - std::vector _mean; - std::vector _scale; + uint32_t _pull_dense_fail_times = 0; + + std::vector _base_norm_param; + std::vector _mean; + std::vector _scale; float _squared_sum_epsilon = 1e-4; std::mutex _mutex_for_mean_scale; - + float _total_batch_num = 0; }; #endif class ExecutorThreadWorker { public: -ExecutorThreadWorker() - : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} + ExecutorThreadWorker() + : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} virtual ~ExecutorThreadWorker() {} - + void CreateThreadResource(const framework::ProgramDesc& program, const paddle::platform::Place& place); void SetThreadId(int tid); @@ -161,10 +160,8 @@ ExecutorThreadWorker() #ifdef PADDLE_WITH_PSLIB virtual void SetPSlibPtr( std::shared_ptr pslib_ptr) {} - virtual void SetPullDenseThread( - std::shared_ptr dpt) {} - virtual void SetParamConfig( - AsyncWorkerParamConfig * param_config) {} + virtual void SetPullDenseThread(std::shared_ptr dpt) {} + virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {} #endif private: @@ -195,7 +192,7 @@ ExecutorThreadWorker() }; #ifdef PADDLE_WITH_PSLIB -class AsyncExecutorThreadWorker: public ExecutorThreadWorker { +class AsyncExecutorThreadWorker : public ExecutorThreadWorker { public: AsyncExecutorThreadWorker() {} virtual ~AsyncExecutorThreadWorker() {} @@ -210,40 +207,35 @@ class AsyncExecutorThreadWorker: public ExecutorThreadWorker { void FillSparse(int table_id); void PushSparse(int table_id); void PushDense(int table_id); - - void check_pull_push_memory( - const std::vector& features, - std::vector& push_g, - int dim); + void check_pull_push_memory(const std::vector& features, - std::vector>& push_g, - int dim); + std::vector* push_g, int dim); + void check_pull_push_memory(const std::vector& features, + std::vector>* push_g, int dim); void collect_feasign_info(int table_id); - + private: struct FeasignInfo { uint32_t slot; uint32_t ins; int64_t label; }; - - std::map> _features; - std::map> _fea_info; + + std::map> _features; + std::map> _fea_info; std::map>> _feature_value; std::map>> _feature_push_value; - - - std::shared_ptr _pslib_ptr; - - std::shared_ptr _pull_dense_thread; - - std::vector<::std::future> _pull_sparse_status; - std::vector<::std::future> _pull_dense_status; - std::vector<::std::future> _push_sparse_status; - std::vector<::std::future> _push_dense_status; - - AsyncWorkerParamConfig* _param_config; - + + std::shared_ptr _pslib_ptr; + + std::shared_ptr _pull_dense_thread; + + std::vector<::std::future> _pull_sparse_status; + std::vector<::std::future> _pull_dense_status; + std::vector<::std::future> _push_sparse_status; + std::vector<::std::future> _push_dense_status; + + AsyncWorkerParamConfig* _param_config; }; #endif From 23dec787723f6906077e77b2d15820a78bde1344 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 14 Dec 2018 13:54:50 +0800 Subject: [PATCH 324/719] fix script issue test=develop --- cmake/external/snappy.cmake | 16 ++- cmake/external/xxhash.cmake | 7 +- cmake/external/zlib.cmake | 16 ++- cmake/inference_lib.cmake | 92 ++++++++------- paddle/fluid/framework/CMakeLists.txt | 36 +++--- paddle/fluid/inference/CMakeLists.txt | 9 -- .../inference/api/demo_ci/CMakeLists.txt | 106 +++++++++--------- .../detection/density_prior_box_op.cu | 3 +- .../detection/roi_perspective_transform_op.cu | 4 +- paddle/fluid/platform/CMakeLists.txt | 7 ++ paddle/fluid/pybind/CMakeLists.txt | 4 - python/setup.py.in | 3 +- 12 files changed, 157 insertions(+), 146 deletions(-) diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index b30403d2d8..f9d4cd9740 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -24,12 +24,6 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) -if (WIN32) - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib") -else(WIN32) - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") -endif (WIN32) - ExternalProject_Add( extern_snappy GIT_REPOSITORY "https://github.com/google/snappy" @@ -56,6 +50,16 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) +IF(WIN32) + IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") + add_custom_command(TARGET extern_snappy POST_BUILD + COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib + ) + ENDIF() + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") +else(WIN32) + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") +endif (WIN32) add_library(snappy STATIC IMPORTED GLOBAL) set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES}) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index 4c2d64f627..c3e1212d8f 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -56,7 +56,12 @@ else() endif() if (WIN32) - set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib") + IF(NOT EXISTS "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") + add_custom_command(TARGET extern_xxhash POST_BUILD + COMMAND cmake -E copy ${XXHASH_INSTALL_DIR}/lib/xxhash.lib ${XXHASH_INSTALL_DIR}/lib/libxxhash.lib + ) + ENDIF() + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") else() set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") endif () diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index c3d7323545..d350737537 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -19,12 +19,6 @@ SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib) SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE) SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE) -IF(WIN32) - SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE) -ELSE(WIN32) - SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) -ENDIF(WIN32) - INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers. INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h. @@ -49,6 +43,16 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) +IF(WIN32) + IF(NOT EXISTS "${ZLIB_INSTALL_DIR}/lib/libz.lib") + add_custom_command(TARGET extern_zlib POST_BUILD + COMMAND cmake -E copy ${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib ${ZLIB_INSTALL_DIR}/lib/libz.lib + ) + ENDIF() + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.lib" CACHE FILEPATH "zlib library." FORCE) +ELSE(WIN32) + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) +ENDIF(WIN32) ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index c679d8507d..5aa7a8a752 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -32,24 +32,35 @@ function(copy TARGET) list(GET copy_lib_SRCS ${index} src) list(GET copy_lib_DSTS ${index} dst) if (WIN32) - # windows cmd shell will not expand wildcard automatically. - # below expand the files,libs and copy them by rules. - file(GLOB header_files ${src} "*.h") - file(GLOB static_lib_files ${src} "*.lib") - file(GLOB dll_lib_files ${src} "*.dll") - set(src_files ${header_files} ${static_lib_files} ${dll_lib_files}) - - if (NOT "${src_files}" STREQUAL "") - list(REMOVE_DUPLICATES src_files) - endif () - add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" - ) - foreach (src_file ${src_files}) + if(IS_DIRECTORY ${src}) + get_filename_component(last_path ${src} NAME) + string(APPEND dst "/" ${last_path}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + if(EXISTS ${src}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND cmake -E copy_directory "${src}" "${dst}" + COMMENT "copying ${src} -> ${dst}") + else() + message(WARNING "${src} not exist!") + endif() + else() + # windows cmd shell will not expand wildcard automatically. + # below expand the files, and copy them by rules. + file(GLOB src_files ${src}) + if (NOT "${src_files}" STREQUAL "") + list(REMOVE_DUPLICATES src_files) + endif () add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" - COMMENT "copying ${src_file} -> ${dst}") - endforeach () + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + foreach (src_file ${src_files}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" + COMMENT "copying ${src_file} -> ${dst}") + endforeach () + endif() else (WIN32) # not windows add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}" @@ -95,7 +106,7 @@ copy(xxhash_lib DEPS xxhash ) -if (NOT PROTOBUF_FOUND) +if (NOT PROTOBUF_FOUND OR WIN32) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") copy(protobuf_lib SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} @@ -138,27 +149,25 @@ if (WITH_NGRAPH) ) endif () -if (NOT WIN32) - if (NOT MOBILE_INFERENCE AND NOT RPI) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") - copy(snappy_lib - SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappy) +if (NOT MOBILE_INFERENCE AND NOT RPI) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") + copy(snappy_lib + SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappy) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") - copy(snappystream_lib - SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappystream) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") + copy(snappystream_lib + SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappystream) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") - copy(zlib_lib - SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS zlib) - endif () -endif (NOT WIN32) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") + copy(zlib_lib + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS zlib) +endif () # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") @@ -192,8 +201,13 @@ if (WITH_ANAKIN AND WITH_MKL) endif () set(module "inference") +if(WIN32) + set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*) +else(WIN32) + set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*) +endif(WIN32) copy(inference_lib DEPS ${inference_deps} - SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* + SRCS ${src_dir}/${module}/*.h ${paddle_fluid_lib} ${src_dir}/${module}/api/paddle_*.h DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) @@ -233,7 +247,7 @@ copy(third_party DEPS fluid_lib_dist # only need libpaddle_fluid.so/a and paddle_*.h for inference-only library copy(inference_api_lib DEPS fluid_lib_dist - SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.* + SRCS ${paddle_fluid_lib} ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6d7a69c8c9..eb4f1a816f 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -3,30 +3,22 @@ # We create a hidden file and compile it instead of origin source file. function(windows_symbolic TARGET) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) + set(multiValueArgs SRCS PATH) cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) foreach(src ${windows_symbolic_SRCS}) - get_filename_component(src ${src} NAME_WE) - if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) - message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") - endif() - - # only copy the xx.cu to .xx.cu when the content are modified - set(copy_flag 1) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR) - if (SOURCE_STR STREQUAL TARGET_STR) - set(copy_flag 0) - endif() - endif() - if (copy_flag) - add_custom_command(OUTPUT .${src}.cu - COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu" - COMMENT "create hidden file of ${src}.cu") - endif(copy_flag) - add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + get_filename_component(src ${src} NAME_WE) + if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu) + message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") + endif() + + file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) + + add_custom_command(OUTPUT ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMENT "create hidden file of ${src}.cu") + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() endfunction() diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 058a5b5f46..b80e7ef752 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -26,9 +26,6 @@ endif(WIN32) # paddle_fluid_origin exclude inference api interface if(WIN32) sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) - if(WITH_GPU AND NOT WITH_DSO) - target_link_libraries(paddle_fluid_origin ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) endif(WIN32) @@ -44,9 +41,6 @@ set(SHARED_INFERENCE_SRCS if(WIN32) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) - if(WITH_GPU AND NOT WITH_DSO) - target_link_libraries(paddle_fluid ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) @@ -63,9 +57,6 @@ if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) target_link_libraries(paddle_fluid_shared shlwapi) - if(WITH_GPU AND NOT WITH_DSO) - target_link_libraries(paddle_fluid_origin ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index ec93729cd2..8d0d96d391 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -15,12 +15,43 @@ macro(safe_set_static_flag) endforeach(flag_var) endmacro() +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +if(NOT DEFINED DEMO_NAME) + message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") +endif() + +include_directories("${PADDLE_LIB}/") +include_directories("${PADDLE_LIB}/fluid_inference_install_dir/") +include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") +include_directories("${PADDLE_LIB}/third_party/install/glog/include") +include_directories("${PADDLE_LIB}/third_party/install/gflags/include") +include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") +include_directories("${PADDLE_LIB}/third_party/install/snappy/include") +include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") +include_directories("${PADDLE_LIB}/third_party/install/zlib/include") +include_directories("${PADDLE_LIB}/third_party/boost") +include_directories("${PADDLE_LIB}/third_party/eigen3") + +link_directories("${PADDLE_LIB}/third_party/install/snappy/lib") +link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") +link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") +link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") +link_directories("${PADDLE_LIB}/third_party/install/glog/lib") +link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") +link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") +link_directories("${PADDLE_LIB}/paddle/lib") + if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") if (WITH_STATIC_LIB) safe_set_static_flag() add_definitions(-DSTATIC_LIB) - set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "/w") - set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} "/w") endif() set(CMAKE_STATIC_LIBRARY_PREFIX "lib") else() @@ -29,36 +60,15 @@ else() endif() message("flags" ${CMAKE_CXX_FLAGS}) -if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") -endif() -if(NOT DEFINED DEMO_NAME) - message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") -endif() - - if(WITH_GPU) if(NOT WIN32) set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") else() if(CUDA_LIB STREQUAL "") - set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") + set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") endif() endif(NOT WIN32) endif() -include_directories("${PADDLE_LIB}") -include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") -include_directories("${PADDLE_LIB}/third_party/install/glog/include") -include_directories("${PADDLE_LIB}/third_party/install/gflags/include") -include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") -if (NOT WIN32) -include_directories("${PADDLE_LIB}/third_party/install/snappy/include") -include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") -include_directories("${PADDLE_LIB}/third_party/install/zlib/include") -endif(NOT WIN32) - -include_directories("${PADDLE_LIB}/third_party/boost") -include_directories("${PADDLE_LIB}/third_party/eigen3") if (NOT WIN32) if (USE_TENSORRT AND WITH_GPU) @@ -67,18 +77,6 @@ if (NOT WIN32) endif() endif(NOT WIN32) -if (NOT WIN32) -link_directories("${PADDLE_LIB}/third_party/install/snappy/lib") -link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") -link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") -endif(NOT WIN32) - -link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") -link_directories("${PADDLE_LIB}/third_party/install/glog/lib") -link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") -link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") -link_directories("${PADDLE_LIB}/paddle/lib") - if (NOT WIN32) set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph") if(EXISTS ${NGRAPH_PATH}) @@ -89,8 +87,6 @@ if (NOT WIN32) endif() endif() -add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) - if(WITH_MKL) include_directories("${PADDLE_LIB}/third_party/install/mklml/include") set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} @@ -106,26 +102,25 @@ endif() # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a if(WITH_STATIC_LIB) - set(DEPS - ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) else() - set(DEPS - ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() if (NOT WIN32) -set(EXTERNAL_LIB "-lrt -ldl -lpthread") -set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} - glog gflags protobuf snappystream snappy z xxhash - ${EXTERNAL_LIB}) + set(EXTERNAL_LIB "-lrt -ldl -lpthread") + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} + glog gflags protobuf snappystream snappy z xxhash + ${EXTERNAL_LIB}) else() -set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf - ${EXTERNAL_LIB}) -# NOTE(dzhwinter) shlwapi is deprecated. -set(DEPS ${DEPS} libcmt shlwapi) + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf + ${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash + snappystream ${EXTERNAL_LIB}) + # NOTE(dzhwinter) shlwapi is deprecated. + set(DEPS ${DEPS} libcmt shlwapi) endif(NOT WIN32) if(WITH_GPU) @@ -137,9 +132,10 @@ if(WITH_GPU) set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) else() set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) endif() endif() +add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) target_link_libraries(${DEMO_NAME} ${DEPS}) diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index 3b7c781795..6a92762896 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -146,7 +146,8 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { // At least use 32 threads, at most 512 threads. // blockx is multiple of 32. - int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L); + int blockx = std::min( + static_cast(((feature_width * num_priors + 31) >> 5) << 5), 512L); int gridx = (feature_width * num_priors + blockx - 1) / blockx; dim3 threads(blockx, 1); dim3 grids(gridx, feature_height); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 2d262f932a..862d664d42 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -35,12 +35,12 @@ namespace operators { template __device__ bool GT_E(T a, T b) { - return (a > b) || fabs(a - b) < 1e-4; + return (a > b) || Eigen::numext::abs(a - b) < 1e-4; } template __device__ bool LT_E(T a, T b) { - return (a < b) || fabs(a - b) < 1e-4; + return (a < b) || Eigen::numext::abs(a - b) < 1e-4; } template diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 23c7ebe842..2f205e1d5c 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -67,6 +67,13 @@ ENDIF() # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) +if(WIN32) + if(WITH_GPU AND NOT WITH_DSO) + get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) + target_link_libraries(device_context ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) +endif(WIN32) + nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) cc_test(init_test SRCS init_test.cc DEPS device_context) diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b8954cb126..c79d5d9403 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -19,10 +19,6 @@ if(WITH_PYTHON) endif(WITH_AMD_GPU) if(WIN32) - if(WITH_GPU AND NOT WITH_DSO) - get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) - target_link_libraries(paddle_pybind ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) target_link_libraries(paddle_pybind shlwapi) endif(WIN32) diff --git a/python/setup.py.in b/python/setup.py.in index 0eb69cdb5c..6562046641 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -241,5 +241,6 @@ setup(name='${PACKAGE_NAME}', ext_modules=ext_modules, package_data=package_data, package_dir=package_dir, - scripts=paddle_bins + scripts=paddle_bins, + distclass=BinaryDistribution ) From 58110921bd84af43ed08a955c515aadd1558bac3 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Fri, 14 Dec 2018 14:16:43 +0800 Subject: [PATCH 325/719] fix CMakeList bug --- paddle/fluid/framework/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index ab237f768a..3575080c99 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -138,7 +138,7 @@ nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) #Generate an empty \ - __init__.py to make framework_py_proto as a valid python module. + #__init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) if (NOT WIN32) From c550e0ce0680b03a7625134eaa991083deaafc41 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 14 Dec 2018 14:27:57 +0800 Subject: [PATCH 326/719] Add python interface for huber regression loss test=develop --- paddle/fluid/operators/huber_loss_op.cc | 7 +-- python/paddle/fluid/layers/nn.py | 69 +++++++++++++++++++++---- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 4ecd8634ff..253b65a5f3 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -124,8 +124,9 @@ REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp); REGISTER_OP_CPU_KERNEL( - huber_loss, - ops::HuberLossKernel); + huber_loss, ops::HuberLossKernel, + ops::HuberLossKernel); REGISTER_OP_CPU_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel, + ops::HuberLossGradKernel); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4df74edfce..fb1ae7b753 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -169,6 +169,7 @@ __all__ = [ 'log_loss', 'add_position_encoding', 'bilinear_tensor_product', + 'huber_regression_loss', ] @@ -4595,7 +4596,7 @@ def hsigmoid(input, """ The hierarchical sigmoid operator is used to accelerate the training process of language model. This operator organizes the classes into a - complete binary tree, or you can use is_custom to pass your own tree to + complete binary tree, or you can use is_custom to pass your own tree to implement hierarchical. Each leaf node represents a class(a word) and each internal node acts as a binary classifier. For each word there's a unique path from root to it's leaf node, hsigmoid calculate the cost for each @@ -4611,7 +4612,7 @@ def hsigmoid(input, 2. build a dict to store word_id -> word's leaf to root path, we call it path_table. 3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code means label of each binary classification, using 1 indicate true, 0 indicate false. - 4. now, each word should has its path and code along the path, you can pass a batch of path and code + 4. now, each word should has its path and code along the path, you can pass a batch of path and code related to the same batch of inputs. @@ -4621,8 +4622,8 @@ def hsigmoid(input, and :math:`D` is the feature size. label (Variable): The tensor variable contains labels of training data. It's a tensor with shape is :math:`[N \\times 1]`. - num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set, - it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num + num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set, + it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num which indicates the num of classes using by binary classify. param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid @@ -4635,15 +4636,15 @@ def hsigmoid(input, is not set, the bias is initialized zero. Default: None. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. Default: None. - path_table: (Variable|None) this variable can store each batch of samples' path to root, + path_table: (Variable|None) this variable can store each batch of samples' path to root, it should be in leaf -> root order - path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like - structure and each element in this array is indexes in parent nodes' Weight Matrix. - path_code: (Variable|None) this variable can store each batch of samples' code, + path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like + structure and each element in this array is indexes in parent nodes' Weight Matrix. + path_code: (Variable|None) this variable can store each batch of samples' code, each code consist with every code of parent nodes. it should be in leaf -> root order - is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is + is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is set you need to set path_table/path_code/num_classes, otherwise num_classes should be set - is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient + is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient of W and input will be sparse. Returns: @@ -8770,3 +8771,51 @@ def bilinear_tensor_product(x, # add activation return helper.append_activation(out) + + +def huber_regression_loss(input, label, delta): + """ + Huber regression loss is a loss function used in robust regression. + Huber regression loss can evaluate the fitness of input to label. + Different from MSE loss, Huber regression loss is more robust for outliers. + + When the difference between input and label is large than delta + .. math:: + + huber\_regression\_loss = delta * (label - input) - 0.5 * delta * delta + + When the difference between input and label is less than delta + .. math:: + + huber\_regression\_loss = 0.5 * (label - input) * (label - input) + + + Args: + input (Variable): This input is a probability computed by the previous operator. + The first dimension is batch size, and the last dimension is 1. + label (Variable): The groud truth whose first dimension is batch size + and last dimension is 1. + delta (float): The parameter of huber regression loss, which controls + the range of outliers + + Returns: + huber\_regression\_loss (Variable): The huber regression loss with shape [batch_size, 1]. + + Examples: + .. code-block:: python + + predictions = fluid.layers.softmax(x) + loss = fluid.layers.huber_regression_loss(input=predictions, label=label, 1.0) + """ + helper = LayerHelper('huber_regression_loss', **locals()) + residual = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + helper.append_op( + type='huber_loss', + inputs={'X': input, + 'Y': label}, + outputs={'Out': out, + 'Residual': residual}, + attrs={'delta': delta}) + return out From 5c7ad1ecb1971fad268dce0040d6e9d597dcef31 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 14 Dec 2018 14:33:49 +0800 Subject: [PATCH 327/719] Resolve conflicts test=develop --- python/paddle/fluid/layers/nn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bc8e3e8a3c..28cb700f82 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9179,6 +9179,7 @@ def psroi_pool(input, }) return out + def huber_regression_loss(input, label, delta): """ Huber regression loss is a loss function used in robust regression. From 09d669ba40aa900920dea84eb07aa868c44831b0 Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Fri, 14 Dec 2018 14:16:43 +0800 Subject: [PATCH 328/719] fix static_cast to const_cast --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/async_executor.cc | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index ab237f768a..3575080c99 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -138,7 +138,7 @@ nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) #Generate an empty \ - __init__.py to make framework_py_proto as a valid python module. + #__init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) if (NOT WIN32) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index e2756cafa2..ee3c5e01f8 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -81,9 +81,8 @@ void AsyncExecutor::InitWorker(const std::string& dist_desc, int node_num, int index) { _pslib_ptr = std::shared_ptr( new paddle::distributed::PSlib()); - _pslib_ptr->init_worker(dist_desc, - static_cast(host_sign_list.data()), - node_num, index); + _pslib_ptr->init_worker( + dist_desc, const_cast(host_sign_list.data()), node_num, index); InitParamConfig(); } @@ -94,7 +93,7 @@ void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); } void AsyncExecutor::GatherServers(const std::vector& host_sign_list, int node_num) { - _pslib_ptr->gather_servers(static_cast(host_sign_list.data()), + _pslib_ptr->gather_servers(const_cast(host_sign_list.data()), node_num); } From 04a570b4634f3cab2815cd1688df192f0f5b1d81 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 14 Dec 2018 14:59:35 +0800 Subject: [PATCH 329/719] Fix ut test=develop --- paddle/fluid/framework/data_type_test.cc | 2 +- paddle/fluid/framework/op_kernel_type_test.cc | 3 ++- paddle/fluid/inference/api/api_impl_tester.cc | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 92639dfc61..2a380201f2 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -35,6 +35,6 @@ TEST(DataType, float16) { EXPECT_EQ(f::SizeOfType(dtype), 2u); // test debug info - std::string type = "float16"; + std::string type = "::paddle::platform::float16"; EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); } diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index 3e17a512ce..40db85400d 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -34,7 +34,8 @@ TEST(OpKernelType, ToString) { OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW, LibraryType::kCUDNN); ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), - "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_" + "data_type[::paddle::platform::float16]:data_layout[NCHW]:place[" + "CUDAPlace(0)]:library_" "type[CUDNN]"); } diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 191225493c..7839639739 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -39,7 +39,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { if (t->type() == framework::proto::VarType::INT64) { pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); pt.dtype = PaddleDType::INT64; - } else if (t->type() == framework::proto::VarType::INT32) { + } else if (t->type() == framework::proto::VarType::FP32) { pt.data.Reset(t->data(), t->numel() * sizeof(float)); pt.dtype = PaddleDType::FLOAT32; } else { From ab98101c2e3a71dd88aa9f8c9439f65293de2543 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 14 Dec 2018 15:09:39 +0800 Subject: [PATCH 330/719] Update API.spec test=develop --- paddle/fluid/API.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 8e6482ca98..c152a506d9 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -199,6 +199,7 @@ paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.huber_regression_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) From f702ab74b9edfe6310470ad1ad98ae054f3120fc Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 14 Dec 2018 07:36:45 +0000 Subject: [PATCH 331/719] add dist transpiler test --- .../tests/unittests/test_dist_transpiler.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 650a745cdc..27575897b5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -875,5 +875,53 @@ class TestRemoteNce(TestDistLookupTableBase): pass +# test for remote prefetch +class TestRemoteHsigmoid(TestDistLookupTableBase): + def network_with_table(self, is_sparse, is_distributed): + + num_total_classes = 10 + + input = fluid.layers.data(name="input", shape=[10], dtype="float32") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + path_table = fluid.layers.data( + name='path_table', shape=[10], dtype='int64') + path_code = fluid.layers.data( + name='path_code', shape=[10], dtype='int64') + w_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 10], + dtype='float32', + name='hs_w', + initializer=fluid.initializer.ConstantInitializer()) + b_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 1], + dtype='float32', + name='hs_b', + initializer=fluid.initializer.ConstantInitializer()) + + cost = fluid.layers.hsigmoid( + input=input, + label=label, + num_classes=non_leaf_num, + path_table=path_table, + path_code=path_code, + is_custom=True, + is_sparse=is_sparse) + avg_cost = fluid.layers.mean(cost) + # optimizer + optimizer = fluid.optimizer.SGD(learning_rate=0.003) + optimizer.minimize(avg_cost) + + def net_conf(self): + import os + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + for op in trainer.blocks[0].ops: + if op.type == "recv": + pass + + if __name__ == "__main__": unittest.main() From 8e785fec8d3d877db141f49cf95f557837b875d0 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 13 Dec 2018 16:01:51 +0000 Subject: [PATCH 332/719] clean code and refine tests template --- paddle/fluid/operators/jit/test.cc | 620 +++++++++++++---------------- 1 file changed, 276 insertions(+), 344 deletions(-) diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index d994a11f97..62d4cdc19a 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -57,31 +57,188 @@ std::vector TestSizes() { return s; } -template -void TestXYZNFunc(const typename KernelTuples::func_type tgt, +namespace jit = paddle::operators::jit; + +template +struct TestFuncWithRefer { + void operator()(const typename KernelTuples::func_type tgt, Args... args) {} +}; + +template +struct TestFuncWithRefer, std::vector, std::vector, + std::vector> { + void operator()(const typename jit::XYZNTuples::func_type tgt, const std::vector& x, const std::vector& y, const std::vector& zref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(zref.size(), x.size()); - EXPECT_EQ(zref.size(), y.size()); - const T* x_data = x.data(); - const T* y_data = y.data(); - const T* zref_data = zref.data(); - const int d = zref.size(); - - std::vector ztgt(d); - T* ztgt_data = ztgt.data(); - // test normal - tgt(x_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ztgt.begin()); - tgt(ztgt_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace y - std::copy(y.begin(), y.end(), ztgt.begin()); - tgt(x_data, ztgt_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(zref.size(), x.size()); + EXPECT_EQ(zref.size(), y.size()); + const T* x_data = x.data(); + const T* y_data = y.data(); + const T* zref_data = zref.data(); + const int d = zref.size(); + + std::vector ztgt(d); + T* ztgt_data = ztgt.data(); + // test normal + tgt(x_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ztgt.begin()); + tgt(ztgt_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace y + std::copy(y.begin(), y.end(), ztgt.begin()); + tgt(x_data, ztgt_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + } +}; + +template +struct TestFuncWithRefer, T, std::vector, + std::vector> { + void operator()(const typename jit::AXYNTuples::func_type tgt, const T a, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(&a, x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(&a, ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + } +}; + +template +struct TestFuncWithRefer, std::vector, std::vector> { + void operator()(const typename jit::XYNTuples::func_type tgt, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + } +}; + +template +struct TestFuncWithRefer, std::vector, std::vector, + std::vector, std::vector, std::vector> { + void operator()(const typename jit::LSTMTuples::func_type tgt, + const std::vector& xsrc, const std::vector& wp, + const std::vector& ct_1, const std::vector& ct_ref, + const std::vector& ht_ref, + const typename jit::LSTMTuples::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ct_ref.size(), ht_ref.size()); + EXPECT_EQ(ct_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); + EXPECT_EQ(wp.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size()); + std::vector checked(2 * d); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + const T* ct_ref_data = ct_ref.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ct_data = ct.data(); + T* ht_data = ht.data(); + T* checked_data = checked.data(); + + paddle::operators::jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_data; + step.ht = ht_data; + if (attr.use_peephole) { + step.wp = wp_data; + step.checked = checked_data; + } + + tgt(&step, &attr); + ExpectEQ(ct_data, ct_ref_data, d); + ExpectEQ(ht_data, ht_ref_data, d); + } +}; + +template +struct TestFuncWithRefer, std::vector, std::vector, + std::vector> { + void operator()(const typename jit::GRUTuples::func_type tgt, + const std::vector& xsrc, const std::vector& ht_1, + const std::vector& ht_ref, + const typename jit::GRUTuples::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ht_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ht(ht_ref.size()); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + const T* ht_1_data = ht_1.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ht_data = ht.data(); + paddle::operators::jit::gru_t step; + step.gates = x_data; + step.ht_1 = ht_1_data; + step.ht = ht_data; + tgt(&step, &attr); + ExpectEQ(ht_data, ht_ref_data, d); + } +}; + +template +void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { + TestFuncWithRefer test; + // test jitcode + auto jitcode = jit::GetJitCode(attr); + if (jitcode) { + VLOG(10) << "Test Jitcode Kernel "; + test(jitcode, args...); + } + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast*>(impl.get()); + if (i && i->UseMe(attr)) { + auto more = i->GetFunc(); + VLOG(10) << "Test More Kernel "; + test(more, args...); + } + } + } + // test result from Get function + VLOG(10) << "Test Get function "; + auto tgt = jit::Get(attr); + test(tgt, args...); } template @@ -113,79 +270,11 @@ void TestXYZNKernel() { ExpectEQ(xinp_data, zref_data, d); ExpectEQ(yinp_data, zref_data, d); - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel, size: " << d; - TestXYZNFunc>(jitcode, x, y, zref); - } - - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(d)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel, size: " << d; - TestXYZNFunc>(more, x, y, zref); - } - } - } - // Test result from Get function - VLOG(10) << "Test Get function, size: " << d; - auto tgt = jit::Get, PlaceType>(d); - TestXYZNFunc>(tgt, x, y, zref); + TestAllImpls, PlaceType, std::vector, + std::vector, std::vector>(d, x, y, zref); } } -TEST(JITKernel, vmul) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); -} - -TEST(JITKernel, vadd) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); -} - -TEST(JITKernel, vaddrelu) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); -} - -TEST(JITKernel, vsub) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); -} - -template -void TestAXYNFunc(const typename KernelTuples::func_type tgt, const T a, - const std::vector& x, const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(&a, x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(&a, ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); -} - template void TestAXYNKernel() { namespace jit = paddle::operators::jit; @@ -208,67 +297,11 @@ void TestAXYNKernel() { ref(&a, xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel, size: " << d; - TestAXYNFunc>(jitcode, a, x, yref); - } - - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(d)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel, size: " << d; - TestAXYNFunc>(more, a, x, yref); - } - } - } - // Test result from Get function - VLOG(10) << "Test Get function, size: " << d; - auto tgt = jit::Get, PlaceType>(d); - TestAXYNFunc>(tgt, a, x, yref); + TestAllImpls, PlaceType, T, std::vector, + std::vector>(d, a, x, yref); } } -TEST(JITKernel, vscal) { - namespace jit = paddle::operators::jit; - TestAXYNKernel(); - TestAXYNKernel(); -} - -TEST(JITKernel, vaddbias) { - namespace jit = paddle::operators::jit; - TestAXYNKernel(); - TestAXYNKernel(); -} - -template -void TestXYNFunc(const typename KernelTuples::func_type tgt, - const std::vector& x, const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); -} - template void TestXYNKernel() { namespace jit = paddle::operators::jit; @@ -290,108 +323,11 @@ void TestXYNKernel() { ref(xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel, size: " << d; - TestXYNFunc>(jitcode, x, yref); - } - - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = - dynamic_cast>*>(impl.get()); - if (i && i->UseMe(d)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel, size: " << d; - TestXYNFunc>(more, x, yref); - } - } - } - // Test result from Get function - VLOG(10) << "Test Get function, size: " << d; - auto tgt = jit::Get, PlaceType>(d); - TestXYNFunc>(tgt, x, yref); + TestAllImpls, PlaceType, std::vector, + std::vector>(d, x, yref); } } -TEST(JITKernel, vrelu) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); -} - -TEST(JITKernel, videntity) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); -} - -TEST(JITKernel, vexp) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); -} - -TEST(JITKernel, vsigmoid) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); -} - -TEST(JITKernel, vtanh) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); -} - -template -void TestLSTMFunc(const typename KernelTuples::func_type tgt, - const std::vector& xsrc, const std::vector& wp, - const std::vector& ct_1, const std::vector& ct_ref, - const std::vector& ht_ref, - const paddle::operators::jit::lstm_attr_t& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ct_ref.size(), ht_ref.size()); - EXPECT_EQ(ct_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); - EXPECT_EQ(wp.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size()); - std::vector checked(2 * d); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - const T* ct_ref_data = ct_ref.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); - T* checked_data = checked.data(); - - paddle::operators::jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_data; - step.ht = ht_data; - if (attr.use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - - tgt(&step, &attr); - ExpectEQ(ct_data, ct_ref_data, d); - ExpectEQ(ht_data, ht_ref_data, d); -} - template void TestLSTMKernel() { namespace jit = paddle::operators::jit; @@ -435,37 +371,10 @@ void TestLSTMKernel() { } ref(&step, &attr); - // test jitcode - auto jitcode = - jit::GetJitCode, PlaceType>(attr); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel " << info; - TestLSTMFunc>(jitcode, xsrc, wp, ct_1, - ct_ref, ht_ref, attr); - } - - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = - dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel " << info; - TestLSTMFunc>(more, xsrc, wp, ct_1, - ct_ref, ht_ref, attr); - } - } - } - // Test result from Get function - auto tgt = jit::Get, PlaceType>(attr); - TestLSTMFunc>(tgt, xsrc, wp, ct_1, ct_ref, - ht_ref, attr); + TestAllImpls, PlaceType, std::vector, + std::vector, std::vector, std::vector, + std::vector>(attr, xsrc, wp, ct_1, ct_ref, ht_ref, + attr); } } } @@ -473,43 +382,6 @@ void TestLSTMKernel() { } } -TEST(JITKernel, lstmctht) { - namespace jit = paddle::operators::jit; - TestLSTMKernel(); - TestLSTMKernel(); -} - -TEST(JITKernel, lstmc1h1) { - namespace jit = paddle::operators::jit; - TestLSTMKernel(); - TestLSTMKernel(); -} - -template -void TestGRUFunc(const typename KernelTuples::func_type tgt, - const std::vector& xsrc, const std::vector& ht_1, - const std::vector& ht_ref, - const paddle::operators::jit::gru_attr_t& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ht_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ht(ht_ref.size()); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - const T* ht_1_data = ht_1.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ht_data = ht.data(); - paddle::operators::jit::gru_t step; - step.gates = x_data; - step.ht_1 = ht_1_data; - step.ht = ht_data; - tgt(&step, &attr); - ExpectEQ(ht_data, ht_ref_data, d); -} - template void TestGRUKernel() { namespace jit = paddle::operators::jit; @@ -538,37 +410,97 @@ void TestGRUKernel() { step.ht = ht_ref_data; ref(&step, &attr); - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(attr); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel " << info; - TestGRUFunc>(jitcode, xsrc, ht_1, ht_ref, attr); - } - - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel " << info; - TestGRUFunc>(more, xsrc, ht_1, ht_ref, attr); - } - } - } - // Test result from Get function - auto tgt = jit::Get, PlaceType>(attr); - TestGRUFunc>(tgt, xsrc, ht_1, ht_ref, attr); + TestAllImpls, PlaceType, std::vector, + std::vector, std::vector>(attr, xsrc, ht_1, ht_ref, + attr); } } } } +// XYZNTuple +TEST(JITKernel, vmul) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +TEST(JITKernel, vadd) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +TEST(JITKernel, vaddrelu) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +TEST(JITKernel, vsub) { + namespace jit = paddle::operators::jit; + TestXYZNKernel(); + TestXYZNKernel(); +} + +// AXYNTuples +TEST(JITKernel, vscal) { + namespace jit = paddle::operators::jit; + TestAXYNKernel(); + TestAXYNKernel(); +} + +TEST(JITKernel, vaddbias) { + namespace jit = paddle::operators::jit; + TestAXYNKernel(); + TestAXYNKernel(); +} + +// XYNTuples +TEST(JITKernel, vrelu) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, videntity) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, vexp) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, vsigmoid) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +TEST(JITKernel, vtanh) { + namespace jit = paddle::operators::jit; + TestXYNKernel(); + TestXYNKernel(); +} + +// LSTM +TEST(JITKernel, lstmctht) { + namespace jit = paddle::operators::jit; + TestLSTMKernel(); + TestLSTMKernel(); +} + +TEST(JITKernel, lstmc1h1) { + namespace jit = paddle::operators::jit; + TestLSTMKernel(); + TestLSTMKernel(); +} + +// GRU TEST(JITKernel, gruh1) { namespace jit = paddle::operators::jit; TestGRUKernel(); From 4a4ccac1d060ccf5758b7ff0d32dfb90ab3c5b7f Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 14 Dec 2018 15:53:13 +0800 Subject: [PATCH 333/719] update by comment test=develop --- .../framework/details/all_reduce_op_handle.cc | 14 ++++++-------- .../framework/details/multi_devices_graph_pass.cc | 4 ++-- paddle/fluid/framework/details/op_handle_base.cc | 1 + .../details/threaded_ssa_graph_executor.cc | 1 + paddle/fluid/framework/parallel_executor.cc | 14 ++++++++++---- paddle/fluid/framework/threadpool.h | 1 + .../reader/create_double_buffer_reader_op.cc | 1 + paddle/fluid/platform/nccl_helper.h | 5 +---- .../unittests/test_parallel_executor_mnist.py | 2 +- 9 files changed, 24 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 6b7bbf9003..5a4f218077 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -107,22 +107,20 @@ void AllReduceOpHandle::RunImpl() { PADDLE_ENFORCE(platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, comm, stream)); - if (!nccl_ctxs_->need_group_call_) cudaStreamSynchronize(stream); + // TODO(Yancey1989): synchronize here can get better performance + // if don't use NCCL group call, but need more profileing. + if (local_scopes_.size() == 1UL) cudaStreamSynchronize(stream); }); } this->RunAndRecordEvent([&] { - // TODO(Yancey1989): need allreduce operator to avoid this flag - if (nccl_ctxs_->need_group_call_) { + if (all_reduce_calls.size() == 1UL) { + all_reduce_calls[0](); + } else { platform::NCCLGroupGuard guard; for (auto &call : all_reduce_calls) { call(); } - } else { - // only used in executor_type == ParallalGraph, one thread one GPU - // TODO(Yancey1989): use allreduce operator to avoid this tricky. - PADDLE_ENFORCE(all_reduce_calls.size() == 1UL); - all_reduce_calls[0](); } }); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 6e8cf86fcc..5b82805ad9 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -386,8 +386,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } -// insert synchronous ops at the backpropagation; and -// insert synchronous ops if the graph contains mutilple places. +// insert collective ops at the backpropagation; and +// insert collective ops if the graph contains mutilple places. #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (!is_forwarding && diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4914e0a5ad..4822627ac3 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -52,6 +52,7 @@ void OpHandleBase::Run(bool use_cuda) { #else PADDLE_ENFORCE(!use_cuda); #endif + RunImpl(); } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index cebf63364d..677a293794 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -216,6 +216,7 @@ void ThreadedSSAGraphExecutor::RunOp( if (LIKELY(!strategy_.dry_run_)) { op->Run(strategy_.use_cuda_); } + VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2604e41045..63f3ef0eac 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -231,7 +231,6 @@ ParallelExecutor::ParallelExecutor( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); ncclUniqueId *nccl_id = nullptr; - bool need_group_call = true; if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { // parallel graph mode should initialize nccl by ncclCommInitRank since // it call nccl operator per device per thread. @@ -243,17 +242,16 @@ ParallelExecutor::ParallelExecutor( } else { nccl_id = nccl_id_var->GetMutable(); } - need_group_call = false; } else if (nccl_id_var != nullptr) { // the other executor type. // the distributed training with nccl mode would initialize the nccl id in // startup_program. nccl_id = nccl_id_var->GetMutable(); } else { - // initlize NCCL by ncclCommInitAll, do not need nccl_id. + // initlize NCCL by ncclCommInitAll, do not need to intialize the nccl_id. } member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id, num_trainers, trainer_id, need_group_call)); + member_->places_, nccl_id, num_trainers, trainer_id)); #else PADDLE_THROW("Not compiled with CUDA"); #endif @@ -288,6 +286,14 @@ ParallelExecutor::ParallelExecutor( graphs.push_back(std::move(graph)); #endif + auto max_memory_size = GetEagerDeletionThreshold(); + // TODO(Yancey1989): fix gc failed on ParallelGraph executor. + if (max_memory_size >= 0 && + exec_strategy.type_ != ExecutionStrategy::kParallelGraph) { + graphs[0] = member_->PrepareGCAndRefCnts( + std::move(graphs[0]), static_cast(max_memory_size)); + } + // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 8fd834be9a..7a51d18fbb 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -27,6 +27,7 @@ limitations under the License. */ namespace paddle { namespace framework { + struct ExceptionHandler { mutable std::future> future_; explicit ExceptionHandler( diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 440b16cf91..ed719f91d0 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -46,6 +46,7 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { sin >> num; place = platform::CUDAPlace(static_cast(num)); } + out->Reset(framework::MakeDecoratedReader(underlying_reader, place, 2)); } diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 23a0222239..8d062dcdb4 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -82,15 +82,12 @@ struct NCCLContext { struct NCCLContextMap { std::unordered_map contexts_; std::vector order_; - bool need_group_call_; explicit NCCLContextMap(const std::vector &places, ncclUniqueId *nccl_id = nullptr, - size_t num_trainers = 1, size_t trainer_id = 0, - bool need_group_call = true) { + size_t num_trainers = 1, size_t trainer_id = 0) { PADDLE_ENFORCE(!places.empty()); order_.reserve(places.size()); - need_group_call_ = need_group_call; for (auto &p : places) { int dev_id = boost::get(p).device; order_.emplace_back(dev_id); diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 0ff079b4e2..fffe8bee58 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -123,7 +123,7 @@ class TestMNIST(TestParallelExecutorBase): self.check_simple_fc_convergence(False) def test_simple_fc_with_new_strategy(self): - # use_cuda, use_reducea + # use_cuda, use_reduce self._compare_reduce_and_allreduce(simple_fc_net, True) self._compare_reduce_and_allreduce(simple_fc_net, False) From e90b2f104cbf4277e3cc55171e715e91f2512251 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 14 Dec 2018 16:07:00 +0800 Subject: [PATCH 334/719] In most times, const_cast is bad and break interface contract and make the code unreadable and make the program unstable. test=develop --- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 2 ++ paddle/scripts/paddle_build.sh | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index dd64cc327f..f2ba75485c 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -300,9 +300,11 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { } CudnnRNNCache *cudnn_rnn_cache = nullptr; if (cache_var->IsInitialized()) { + // const_cast is usually bad. cudnn_rnn_cache = const_cast(cache_var) ->GetMutable(); } else { + // const_cast is usually bad. cudnn_rnn_cache = const_cast(cache_var) ->GetMutable(); std::random_device rnd; diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 6299b166af..a1c1886c7f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -517,6 +517,18 @@ function assert_api_spec_approvals() { fi fi done + + HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep const_cast` + if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` + echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" + if [ "${APPROVALS}" == "FALSE" ]; then + echo "You must have at least 2 approvals for the const_cast" + exit 1 + fi + fi + } From cf5264629f914724f91e0a364adca4728b8dcc96 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 14 Dec 2018 15:20:07 +0800 Subject: [PATCH 335/719] update API.spec test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 8e6482ca98..cfa28948e9 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -365,7 +365,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None)) paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) +paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)) paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) From 77907a35028053f68f95467650ac7878e58cc5aa Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 14 Dec 2018 07:43:39 +0000 Subject: [PATCH 336/719] refine benchmark template --- paddle/fluid/operators/jit/benchmark.cc | 389 +++++------------------- paddle/fluid/operators/jit/helper.h | 14 + 2 files changed, 85 insertions(+), 318 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index ca636b020c..4e5d530251 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -51,251 +51,108 @@ std::vector TestSizes() { return s; } -// return this function avg time -template -double BenchXYZNFunc(const typename KernelTuples::func_type tgt, - const std::vector& x, const std::vector& y, - std::vector& z) { // NOLINT - const T* x_data = x.data(); - const T* y_data = y.data(); - const int d = z.size(); - T* z_data = z.data(); - - for (int i = 0; i < FLAGS_burning; ++i) { - tgt(x_data, y_data, z_data, d); +template +struct BenchFunc { + // return this function avg time + double operator()(const typename KernelTuples::func_type tgt, Args... args) { + for (int i = 0; i < FLAGS_burning; ++i) { + tgt(args...); + } + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeat; ++i) { + tgt(args...); + } + auto end = GetCurrentUS(); + return (end - start) / FLAGS_repeat; + } +}; + +namespace jit = paddle::operators::jit; + +template +void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { + BenchFunc benchmark; + std::vector> infos; + // test refer + auto refer = jit::GetRefer(); + if (!refer) { + LOG(FATAL) << "Refer can not be empty!"; + } + infos.push_back(std::make_pair("Refer", benchmark(refer, args...))); + + // test jitcode + auto jitcode = jit::GetJitCode(attr); + if (jitcode) { + infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...))); + } + // test all impls in more + jit::KernelKey kkey(KT, PlaceType()); + auto& pool = jit::KernelPool().Instance().AllKernels(); + auto iter = pool.find(kkey); + if (iter != pool.end()) { + auto& impls = iter->second; + for (auto& impl : impls) { + auto i = dynamic_cast*>(impl.get()); + if (i && i->UseMe(attr)) { + auto more = i->GetFunc(); + infos.push_back(std::make_pair("More", benchmark(more, args...))); + } + } } - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeat; ++i) { - tgt(x_data, y_data, z_data, d); + // Test result from Get function + auto tgt = jit::Get(attr); + if (!tgt) { + LOG(FATAL) << "Target can not be empty!"; } - auto end = GetCurrentUS(); - return (end - start) / FLAGS_repeat; + infos.push_back(std::make_pair("Target", benchmark(tgt, args...))); + + // print + std::ostringstream loginfos; + loginfos << "Kernel Type " << jit::to_string(KT) << ": " << attr << ": "; + for (auto pair : infos) { + loginfos << pair.first << " takes " << pair.second << " us; "; + } + LOG(INFO) << loginfos.str(); } template void BenchXYZNKernel() { - namespace jit = paddle::operators::jit; for (int d : TestSizes()) { - std::vector> infos; std::vector x(d), y(d), z(d); RandomVec(d, x.data()); RandomVec(d, y.data()); - // refer - auto refer = jit::GetRefer>(); - if (refer) { - auto res = BenchXYZNFunc>(refer, x, y, z); - infos.push_back(std::make_pair("Refer", res)); - } - - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); - if (jitcode) { - auto res = BenchXYZNFunc>(jitcode, x, y, z); - infos.push_back(std::make_pair("JitCode", res)); - } - - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(d)) { - auto more = i->GetFunc(); - auto res = BenchXYZNFunc>(more, x, y, z); - infos.push_back(std::make_pair("More", res)); - } - } - } - - // Test result from Get function - auto tgt = jit::Get, PlaceType>(d); - if (!tgt) { - LOG(ERROR) << "Target can not be empty!"; - } - auto res = BenchXYZNFunc>(tgt, x, y, z); - infos.push_back(std::make_pair("Target", res)); - - // print - std::ostringstream loginfos; - loginfos << "Kernel Type: " << jit::to_string(KT) << ", size " << d << ": "; - for (auto pair : infos) { - loginfos << pair.first << " takes " << pair.second << " us; "; - } - LOG(INFO) << loginfos.str(); + BenchAllImpls, PlaceType>(d, x.data(), y.data(), + z.data(), d); } } -// return this function avg time -template -double BenchAXYNFunc(const typename KernelTuples::func_type tgt, const T a, - const std::vector& x, - std::vector& y) { // NOLINT - const T* x_data = x.data(); - T* y_data = y.data(); - const int d = y.size(); - for (int i = 0; i < FLAGS_burning; ++i) { - tgt(&a, x_data, y_data, d); - } - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeat; ++i) { - tgt(&a, x_data, y_data, d); - } - auto end = GetCurrentUS(); - return (end - start) / FLAGS_repeat; -} - template void BenchAXYNKernel() { - namespace jit = paddle::operators::jit; for (int d : TestSizes()) { - std::vector> infos; const T a = static_cast(3); std::vector x(d), y(d); RandomVec(d, x.data()); - // test refer - auto refer = jit::GetRefer>(); - if (refer) { - auto res = BenchAXYNFunc>(refer, a, x, y); - infos.push_back(std::make_pair("Refer", res)); - } - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); - if (jitcode) { - auto res = BenchAXYNFunc>(jitcode, a, x, y); - infos.push_back(std::make_pair("JitCode", res)); - } - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(d)) { - auto more = i->GetFunc(); - auto res = BenchAXYNFunc>(more, a, x, y); - infos.push_back(std::make_pair("More", res)); - } - } - } - // Test result from Get function - auto tgt = jit::Get, PlaceType>(d); - if (!tgt) { - LOG(ERROR) << "Target can not be empty!"; - } - auto res = BenchAXYNFunc>(tgt, a, x, y); - infos.push_back(std::make_pair("Target", res)); - // print - std::ostringstream loginfos; - loginfos << "Kernel Type: " << jit::to_string(KT) << ", size " << d << ": "; - for (auto pair : infos) { - loginfos << pair.first << " takes " << pair.second << " us; "; - } - LOG(INFO) << loginfos.str(); - } -} - -// return this function avg time -template -double BenchXYNFunc(const typename KernelTuples::func_type tgt, - const std::vector& x, - std::vector& y) { // NOLINT - const T* x_data = x.data(); - T* y_data = y.data(); - const int d = y.size(); - for (int i = 0; i < FLAGS_burning; ++i) { - tgt(x_data, y_data, d); + BenchAllImpls, PlaceType>(d, &a, x.data(), y.data(), + d); } - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeat; ++i) { - tgt(x_data, y_data, d); - } - auto end = GetCurrentUS(); - return (end - start) / FLAGS_repeat; } template void BenchXYNKernel() { - namespace jit = paddle::operators::jit; for (int d : TestSizes()) { - std::vector> infos; std::vector x(d), y(d); RandomVec(d, x.data()); - // test refer - auto refer = jit::GetRefer>(); - if (refer) { - auto res = BenchXYNFunc>(refer, x, y); - infos.push_back(std::make_pair("Refer", res)); - } - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(d); - if (jitcode) { - auto res = BenchXYNFunc>(jitcode, x, y); - infos.push_back(std::make_pair("JitCode", res)); - } - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = - dynamic_cast>*>(impl.get()); - if (i && i->UseMe(d)) { - auto more = i->GetFunc(); - auto res = BenchXYNFunc>(more, x, y); - infos.push_back(std::make_pair("More", res)); - } - } - } - // Test result from Get function - auto tgt = jit::Get, PlaceType>(d); - if (!tgt) { - LOG(ERROR) << "Target can not be empty!"; - } - auto res = BenchXYNFunc>(tgt, x, y); - infos.push_back(std::make_pair("Target", res)); - // print - std::ostringstream loginfos; - loginfos << "Kernel Type: " << jit::to_string(KT) << ", size " << d << ": "; - for (auto pair : infos) { - loginfos << pair.first << " takes " << pair.second << " us; "; - } - LOG(INFO) << loginfos.str(); + BenchAllImpls, PlaceType>(d, x.data(), y.data(), d); } } -// return this function avg time -template -double BenchLSTMFunc(const typename KernelTuples::func_type tgt, - const paddle::operators::jit::lstm_attr_t* attr, - paddle::operators::jit::lstm_t* step) { - for (int i = 0; i < FLAGS_burning; ++i) { - tgt(step, attr); - } - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeat; ++i) { - tgt(step, attr); - } - auto end = GetCurrentUS(); - return (end - start) / FLAGS_repeat; -} - template void BenchLSTMKernel() { - namespace jit = paddle::operators::jit; for (bool use_peephole : {true, false}) { for (int d : TestSizes()) { const jit::lstm_attr_t attr(d, jit::vsigmoid, jit::vtanh, jit::vtanh, use_peephole); - std::vector> infos; std::vector x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d); RandomVec(4 * d, x.data(), -2.f, 2.f); RandomVec(3 * d, wp.data(), -2.f, 2.f); @@ -315,77 +172,15 @@ void BenchLSTMKernel() { step.wp = wp_data; step.checked = checked_data; } - - // test refer - auto refer = jit::GetRefer>(); - if (refer) { - auto res = BenchLSTMFunc>(refer, &attr, &step); - infos.push_back(std::make_pair("Refer", res)); - } - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(attr); - if (jitcode) { - auto res = BenchLSTMFunc>(jitcode, &attr, &step); - infos.push_back(std::make_pair("JitCode", res)); - } - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast>*>( - impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - auto res = BenchLSTMFunc>(more, &attr, &step); - infos.push_back(std::make_pair("More", res)); - } - } - } - // Test result from Get function - auto tgt = jit::Get, PlaceType>(attr); - if (!tgt) { - LOG(ERROR) << "Target can not be empty!"; - } - auto res = BenchLSTMFunc>(tgt, &attr, &step); - infos.push_back(std::make_pair("Target", res)); - // print - std::ostringstream loginfos; - loginfos << "Kernel Type: " << jit::to_string(KT) - << ", Sigmoid,Tanh,Tanh, " << (use_peephole ? "Peephole_" : "") - << " size " << d << ": "; - for (auto pair : infos) { - loginfos << pair.first << " takes " << pair.second << " us; "; - } - LOG(INFO) << loginfos.str(); + BenchAllImpls, PlaceType>(attr, &step, &attr); } } } -// return this function avg time -template -double BenchGRUFunc(const typename KernelTuples::func_type tgt, - const paddle::operators::jit::gru_attr_t* attr, - paddle::operators::jit::gru_t* step) { - for (int i = 0; i < FLAGS_burning; ++i) { - tgt(step, attr); - } - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeat; ++i) { - tgt(step, attr); - } - auto end = GetCurrentUS(); - return (end - start) / FLAGS_repeat; -} - template void BenchGRUKernel() { - namespace jit = paddle::operators::jit; for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::vsigmoid, jit::vtanh); - std::vector> infos; std::vector x(3 * d), ht_1(d), ht(d); RandomVec(3 * d, x.data(), -2.f, 2.f); RandomVec(d, ht_1.data(), -2.f, 2.f); @@ -396,50 +191,7 @@ void BenchGRUKernel() { step.gates = x_data; step.ht_1 = ht_1_data; step.ht = ht_data; - - // test refer - auto refer = jit::GetRefer>(); - if (refer) { - auto res = BenchGRUFunc>(refer, &attr, &step); - infos.push_back(std::make_pair("Refer", res)); - } - // test jitcode - auto jitcode = jit::GetJitCode, PlaceType>(attr); - if (jitcode) { - auto res = BenchGRUFunc>(jitcode, &attr, &step); - infos.push_back(std::make_pair("JitCode", res)); - } - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = - dynamic_cast>*>(impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - auto res = BenchGRUFunc>(more, &attr, &step); - infos.push_back(std::make_pair("More", res)); - } - } - } - // Test result from Get function - auto tgt = jit::Get, PlaceType>(attr); - if (!tgt) { - LOG(ERROR) << "Target can not be empty!"; - } - auto res = BenchGRUFunc>(tgt, &attr, &step); - infos.push_back(std::make_pair("Target", res)); - // print - std::ostringstream loginfos; - loginfos << "Kernel Type: " << jit::to_string(KT) << ", Sigmoid,Tanh, size " - << d << ": "; - for (auto pair : infos) { - loginfos << pair.first << " takes " << pair.second << " us; "; - } - LOG(INFO) << loginfos.str(); + BenchAllImpls, PlaceType>(attr, &step, &attr); } } @@ -456,16 +208,17 @@ int main(int argc, char* argv[]) { << " times."; using T = float; using PlaceType = paddle::platform::CPUPlace; - namespace jit = paddle::operators::jit; + // xyzn BenchXYZNKernel(); BenchXYZNKernel(); BenchXYZNKernel(); BenchXYZNKernel(); + // axyn BenchAXYNKernel(); BenchAXYNKernel(); - // act + // xyn BenchXYNKernel(); BenchXYNKernel(); BenchXYNKernel(); diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 302e70caa7..3431c22111 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include "paddle/fluid/operators/jit/gen_base.h" @@ -124,6 +125,19 @@ const char* to_string(KernelType kt); KernelType to_kerneltype(const std::string& act); +inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) { + os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate) + << "],act_cand[" << to_string(attr.act_cand) << "],act_cell[" + << to_string(attr.act_cell) << "],use_peephole[" + << (attr.use_peephole ? "True" : "False") << "]"; + return os; +} +inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) { + os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate) + << "],act_cand[" << to_string(attr.act_cand) << "]"; + return os; +} + } // namespace jit } // namespace operators } // namespace paddle From 723f68727db273902674e6046ead5f0ebdb78bf4 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 14 Dec 2018 17:00:48 +0800 Subject: [PATCH 337/719] add ut about nce in transpiler --- .../fluid/tests/unittests/test_dist_transpiler.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 650a745cdc..8abd7d9e0c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -870,9 +870,21 @@ class TestRemoteNce(TestDistLookupTableBase): def transpiler_test_impl(self): trainer, _ = self.get_trainer() + + out_vars = ["nce_w.block0", "nce_w.block1"] + in_vars = ["nce_b.block0", "nce_b.block1"] + + recv_var_names = [] + for op in trainer.blocks[0].ops: if op.type == "recv": - pass + for var in op.output("Out"): + recv_var_names.append(var) + + for out_var in out_vars: + self.assertFalse(out_var in recv_var_names) + for in_var in in_vars: + self.assertTrue(in_var in recv_var_names) if __name__ == "__main__": From ae17926987f1dcb2a8e75925047a542c49b589cf Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 14 Dec 2018 09:22:28 +0000 Subject: [PATCH 338/719] enable jitkernel mkl vmul, vadd and vscal --- paddle/fluid/operators/jit/README.md | 2 + .../operators/jit/more/mkl/CMakeLists.txt | 2 + paddle/fluid/operators/jit/more/mkl/mkl.cc | 68 ++++++++++++++++++- paddle/fluid/operators/jit/more/mkl/mkl.h | 31 ++++++--- .../fluid/operators/math/jit_kernel_blas.cc | 50 -------------- 5 files changed, 90 insertions(+), 63 deletions(-) diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index 6b2f2b2848..28d21f40af 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -45,6 +45,8 @@ PaddlePaddle/Paddle/paddle/fluid/ - 在`KernelType` 中添加 `your_key` . - 实现Reference 的逻辑,每个jitkernel的Reference 实现是必须的。不要依赖任何第三方库。并在`refer/CmakeLists.txt`中`USE_JITKERNEL_REFER(your_key)`. +- (optional) 实现更多的算法在`more`目录下,可以依赖mkl,openblas,或者mkldnn等第三方库。 +- (optional) 实现基于Xbyak的生成code,在`gen`目下。 - 必要时可以添加新的`KernelTuples`,可以参考`XYZNTuples`,新加的Attr类型需要特例化`JitCodeKey`方法。 - 添加unit test,需要测试float和double - 添加benchmark确保get得到的速度是最快。 diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index 0c15c7060d..ffecb73297 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -4,3 +4,5 @@ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE # use mkl kernels by name and type USE_JITKERNEL_MORE(vmul, mkl) +USE_JITKERNEL_MORE(vadd, mkl) +USE_JITKERNEL_MORE(vscal, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 0ffe1d565f..3d963cbf1d 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -13,7 +13,9 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/more/mkl/mkl.h" +#include "paddle/fluid/operators/jit/refer/refer.h" #include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/dynload/mklml.h" namespace paddle { @@ -32,6 +34,61 @@ void VMul(const double* x, const double* y, double* z, int n) { platform::dynload::vdMul(n, x, y, z); } +template <> +void VAdd(const float* x, const float* y, float* z, int n) { + platform::dynload::vsAdd(n, x, y, z); +} + +template <> +void VAdd(const double* x, const double* y, double* z, int n) { + platform::dynload::vdAdd(n, x, y, z); +} + +template <> +void VScal(const float* a, const float* x, float* y, int n) { + if (x == y) { + platform::dynload::cblas_sscal(n, *a, y, 1); + } else { + refer::VScal(a, x, y, n); + } +} + +template <> +void VScal(const double* a, const double* x, double* y, int n) { + if (x == y) { + platform::dynload::cblas_dscal(n, *a, y, 1); + } else { + refer::VScal(a, x, y, n); + } +} + +// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 +template <> +bool VMulKernel::UseMe(int d) const { + return platform::MayIUse(platform::avx512f) && d > 512; +} + +template <> +bool VAddKernel::UseMe(int d) const { + return platform::MayIUse(platform::avx512f) && d > 512; +} + +template <> +bool VScalKernel::UseMe(int d) const { + return platform::MayIUse(platform::avx512f) && d > 512; +} + +#define AWALYS_USE_ME_WITH_DOUBLE(func) \ + template <> \ + bool func##Kernel::UseMe(int d) const { \ + return true; \ + } + +AWALYS_USE_ME_WITH_DOUBLE(VMul); +AWALYS_USE_ME_WITH_DOUBLE(VAdd); +AWALYS_USE_ME_WITH_DOUBLE(VScal); + +#undef AWALYS_USE_ME_WITH_DOUBLE } // namespace mkl } // namespace more } // namespace jit @@ -40,5 +97,12 @@ void VMul(const double* x, const double* y, double* z, int n) { namespace mkl = paddle::operators::jit::more::mkl; -REGISTER_JITKERNEL_MORE(vmul, mkl, mkl::VMulKernel, - mkl::VMulKernel); +#define REGISTER_MKL_KERNEL(key, func) \ + REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel, \ + mkl::func##Kernel) + +REGISTER_MKL_KERNEL(vmul, VMul); +REGISTER_MKL_KERNEL(vadd, VAdd); +REGISTER_MKL_KERNEL(vscal, VScal); + +#undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 4173d1f3de..84a93f408f 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -16,7 +16,6 @@ #include #include "paddle/fluid/operators/jit/kernel_base.h" -#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -28,17 +27,27 @@ template void VMul(const T* x, const T* y, T* z, int n); template -class VMulKernel : public KernelImpl> { - public: - VMulKernel() { this->func = VMul; } - bool UseMe(int d) const override { - if (std::is_same::value) { - return platform::MayIUse(platform::avx512f) && d > 512; - } else { - return true; - } +void VAdd(const T* x, const T* y, T* z, int n); + +template +void VScal(const T* a, const T* x, T* y, int n); + +#define DECLARE_MKL_KERNEL(name, tuples) \ + template \ + class name##Kernel : public KernelImpl> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool UseMe(typename tuples::attr_type) const override; \ } -}; + +// XYZN +DECLARE_MKL_KERNEL(VMul, XYZNTuples); +DECLARE_MKL_KERNEL(VAdd, XYZNTuples); + +// AXYN +DECLARE_MKL_KERNEL(VScal, AXYNTuples); + +#undef DECLARE_MKL_KERNEL } // namespace mkl } // namespace more diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index 8cf588efba..682e51e89d 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -31,56 +31,6 @@ namespace operators { namespace math { namespace jitkernel { -#ifdef PADDLE_WITH_MKLML -template -void VMulMKL(const T* x, const T* y, T* z, int n); - -template <> -void VMulMKL(const float* x, const float* y, float* z, int n) { - platform::dynload::vsMul(n, x, y, z); -} - -template <> -void VMulMKL(const double* x, const double* y, double* z, int n) { - platform::dynload::vdMul(n, x, y, z); -} - -template -void VAddMKL(const T* x, const T* y, T* z, int n); - -template <> -void VAddMKL(const float* x, const float* y, float* z, int n) { - platform::dynload::vsAdd(n, x, y, z); -} - -template <> -void VAddMKL(const double* x, const double* y, double* z, int n) { - platform::dynload::vdAdd(n, x, y, z); -} - -template -void VScalMKL(const T* a, const T* x, T* y, int n); - -template <> -void VScalMKL(const float* a, const float* x, float* y, int n) { - if (x == y) { - platform::dynload::cblas_sscal(n, *a, y, 1); - } else { - refer::VScal(a, x, y, n); - } -} - -template <> -void VScalMKL(const double* a, const double* x, double* y, int n) { - if (x == y) { - platform::dynload::cblas_dscal(n, *a, y, 1); - } else { - refer::VScal(a, x, y, n); - } -} - -#endif - /* VMUL JitKernel */ template class VMulKernelImpl : public VMulKernel { From 5fff20c21a1199f560a4d014b5a81decf3fce9e9 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 14 Dec 2018 17:29:00 +0800 Subject: [PATCH 339/719] Change name to huber loss test=develop --- paddle/fluid/API.spec | 2 +- python/paddle/fluid/layers/nn.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index c152a506d9..3ecc7af238 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -199,7 +199,7 @@ paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.huber_regression_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 28cb700f82..9922b3370d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -174,7 +174,7 @@ __all__ = [ 'get_tensor_from_selected_rows', 'lstm', 'psroi_pool', - 'huber_regression_loss', + 'huber_loss', ] kIgnoreIndex = -100 @@ -9180,7 +9180,7 @@ def psroi_pool(input, return out -def huber_regression_loss(input, label, delta): +def huber_loss(input, label, delta): """ Huber regression loss is a loss function used in robust regression. Huber regression loss can evaluate the fitness of input to label. @@ -9212,9 +9212,9 @@ def huber_regression_loss(input, label, delta): .. code-block:: python predictions = fluid.layers.softmax(x) - loss = fluid.layers.huber_regression_loss(input=predictions, label=label, 1.0) + loss = fluid.layers.huber_loss(input=predictions, label=label, 1.0) """ - helper = LayerHelper('huber_regression_loss', **locals()) + helper = LayerHelper('huber_loss', **locals()) residual = helper.create_variable_for_type_inference( dtype=helper.input_dtype()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) From ea2a34ee957ded9d596c88360b929b273a39ceec Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 14 Dec 2018 17:31:53 +0800 Subject: [PATCH 340/719] Polish doc test=develop --- python/paddle/fluid/layers/nn.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9922b3370d..776fd0104a 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9182,19 +9182,19 @@ def psroi_pool(input, def huber_loss(input, label, delta): """ - Huber regression loss is a loss function used in robust regression. - Huber regression loss can evaluate the fitness of input to label. - Different from MSE loss, Huber regression loss is more robust for outliers. + Huber loss is a loss function used in robust. + Huber loss can evaluate the fitness of input to label. + Different from MSE loss, Huber loss is more robust for outliers. When the difference between input and label is large than delta .. math:: - huber\_regression\_loss = delta * (label - input) - 0.5 * delta * delta + huber\_loss = delta * (label - input) - 0.5 * delta * delta When the difference between input and label is less than delta .. math:: - huber\_regression\_loss = 0.5 * (label - input) * (label - input) + huber\_loss = 0.5 * (label - input) * (label - input) Args: @@ -9202,11 +9202,11 @@ def huber_loss(input, label, delta): The first dimension is batch size, and the last dimension is 1. label (Variable): The groud truth whose first dimension is batch size and last dimension is 1. - delta (float): The parameter of huber regression loss, which controls + delta (float): The parameter of huber loss, which controls the range of outliers Returns: - huber\_regression\_loss (Variable): The huber regression loss with shape [batch_size, 1]. + huber\_loss (Variable): The huber loss with shape [batch_size, 1]. Examples: .. code-block:: python From 5fea8cd47809f56ef232dd187f480c251898c762 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 14 Dec 2018 18:26:33 +0800 Subject: [PATCH 341/719] Add sorted_result parameter to SelectedRows Functor test=develop --- .../operators/math/selected_rows_functor.cc | 17 ++++++++++------- .../operators/math/selected_rows_functor.cu | 3 ++- .../operators/math/selected_rows_functor.h | 16 ++++++---------- paddle/fluid/operators/optimizers/adam_op.h | 8 ++++++-- 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 0c2e6d4024..1a11b584e2 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -253,23 +253,26 @@ elementwise_add_to(const DeviceContext& ctx, BlasT* blas, template struct MergeAdd { framework::SelectedRows operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input) { + const framework::SelectedRows& input, + const bool sorted_result = false) { framework::SelectedRows out; - (*this)(context, input, &out); + (*this)(context, input, &out, sorted_result); return out; } void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input, - framework::SelectedRows* output) { + framework::SelectedRows* output, + const bool sorted_result = false) { std::vector inputs; inputs.push_back(&input); - (*this)(context, inputs, output); + (*this)(context, inputs, output, sorted_result); } void operator()(const platform::CPUDeviceContext& context, const std::vector& inputs, - framework::SelectedRows* output) { + framework::SelectedRows* output, + const bool sorted_result = false) { if (inputs.size() == 0) { VLOG(3) << "no input! return"; return; @@ -302,8 +305,8 @@ struct MergeAdd { } std::vector merge_rows(merged_row_set.begin(), merged_row_set.end()); - if (sorted_result_) { - std::sort(merge_rows); + if (sorted_result) { + std::sort(merge_rows.begin(), merge_rows.end()); } std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index c4fccdbf86..b87c9461e8 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -266,7 +266,8 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows, template struct MergeAdd { framework::SelectedRows operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input) { + const framework::SelectedRows& input, + const bool sorted_result = false) { framework::SelectedRows out; (*this)(context, input, &out); return out; diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index b7b19f130e..222d761ef9 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -78,23 +78,19 @@ namespace scatter { // functors for manuplating SelectedRows data template struct MergeAdd { - MergeAdd() : sorted_result_(false) {} - - explicit MergeAdd(bool sorted_result) : sorted_result_(sorted_result) {} - // unary functor, merge by adding duplicated rows in // the input SelectedRows object. framework::SelectedRows operator()(const DeviceContext& context, - const framework::SelectedRows& input); + const framework::SelectedRows& input, + const bool sorted_result = false); void operator()(const DeviceContext& context, const framework::SelectedRows& input, - framework::SelectedRows* output); + framework::SelectedRows* output, + const bool sorted_result = false); void operator()(const DeviceContext& context, const std::vector& inputs, - framework::SelectedRows* output); - - private: - bool sorted_result_; + framework::SelectedRows* output, + const bool sorted_result = false); }; enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index c2bf7040d7..c9e27b7547 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -157,6 +157,9 @@ struct AdamFunctor { } }; +template +struct SparseAdamFunctor; + template struct SparseAdamFunctor { T beta1_; @@ -283,6 +286,7 @@ struct SparseAdamFunctor { // Calculation if (i == *(rows_ + j)) { + T g = grad_[j * row_numel_]; mom1 = beta1_ * mom1 + (1 - beta1_) * g; mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; ++j; @@ -388,12 +392,12 @@ class AdamOpKernel : public framework::OpKernel { } else { // merge duplicated rows if any. // The rows of grad_merge have been sorted inside MergeAdd functor - scatter::MergeAdd merge_func(true); + scatter::MergeAdd merge_func; auto* grad_merge_var = const_cast(ctx.scope()) .Var() ->GetMutable(); merge_func(ctx.template device_context(), grad, - grad_merge_var); + grad_merge_var, true); grad_merge_ptr = grad_merge_var; } From 67b555d3d3c98d571dffe5b2b8e1c0bae59bd80d Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Fri, 14 Dec 2018 11:31:00 +0100 Subject: [PATCH 342/719] Enable ngraph tests for a ngraph engine (#14800) * Enable ngraph tests for a ngraph engine test=develop * Move the test structure to other place test=develop * Add USE_NGRAPH flag, simple structure test=develop --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 10 +++++++--- .../fluid/tests/unittests/ngraph/CMakeLists.txt | 6 ++++++ .../paddle/fluid/tests/unittests/ngraph/__init__.py | 13 +++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/ngraph/__init__.py diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a4089ba3ca..6d6fe245d8 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -63,9 +63,9 @@ function(py_test_modules TARGET_NAME) set(multiValueArgs MODULES DEPS ENVS) cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS} - ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS} + ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (py_test_modules_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() @@ -111,3 +111,7 @@ py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executo if(NOT APPLE) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) endif() + +if (WITH_NGRAPH) + add_subdirectory(ngraph) +endif() diff --git a/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt new file mode 100644 index 0000000000..5ed2d0aa80 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS FLAGS_use_ngraph=true) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/ngraph/__init__.py b/python/paddle/fluid/tests/unittests/ngraph/__init__.py new file mode 100644 index 0000000000..b94a21a7e4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From 5e97be7ba774c5b109c6674ba41d8ba1bfd18229 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 14 Dec 2018 10:10:41 +0000 Subject: [PATCH 343/719] enable jitkernel mkl vexp, vsigmoid and vtanh --- paddle/fluid/operators/jit/gen/jitcode.h | 4 -- paddle/fluid/operators/jit/helper.h | 4 -- paddle/fluid/operators/jit/kernel_base.h | 1 + paddle/fluid/operators/jit/macro.h | 32 +++++++++++++++ .../operators/jit/more/mkl/CMakeLists.txt | 3 ++ paddle/fluid/operators/jit/more/mkl/mkl.cc | 31 ++++++++++++++ paddle/fluid/operators/jit/more/mkl/mkl.h | 33 +++++++++++++++ paddle/fluid/operators/jit/test.cc | 2 +- paddle/fluid/operators/math/jit_kernel_exp.cc | 41 ------------------- 9 files changed, 101 insertions(+), 50 deletions(-) create mode 100644 paddle/fluid/operators/jit/macro.h diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index 765952fc35..64126e3f61 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -56,10 +56,6 @@ typedef enum { identity } operand_type; -#define XMM_FLOAT_BLOCK 4 -#define YMM_FLOAT_BLOCK 8 -#define ZMM_FLOAT_BLOCK 16 - #define DECLARE_JIT_CODE(codename) \ const char* name() const override { return #codename; } diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 3431c22111..44952fb907 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -27,10 +27,6 @@ namespace paddle { namespace operators { namespace jit { -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 - template inline typename std::enable_if< std::is_same::value && diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 00d583c60b..f10d9f3fdd 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -13,6 +13,7 @@ * limitations under the License. */ #pragma once +#include "paddle/fluid/operators/jit/macro.h" #include "paddle/fluid/platform/macros.h" namespace paddle { diff --git a/paddle/fluid/operators/jit/macro.h b/paddle/fluid/operators/jit/macro.h new file mode 100644 index 0000000000..b2622eba8b --- /dev/null +++ b/paddle/fluid/operators/jit/macro.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once +#include + +namespace paddle { +namespace operators { +namespace jit { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +#define XMM_FLOAT_BLOCK 4 +#define YMM_FLOAT_BLOCK 8 +#define ZMM_FLOAT_BLOCK 16 + +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index ffecb73297..3ecb520392 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -6,3 +6,6 @@ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE USE_JITKERNEL_MORE(vmul, mkl) USE_JITKERNEL_MORE(vadd, mkl) USE_JITKERNEL_MORE(vscal, mkl) +USE_JITKERNEL_MORE(vexp, mkl) +USE_JITKERNEL_MORE(vsigmoid, mkl) +USE_JITKERNEL_MORE(vtanh, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 3d963cbf1d..42f6df576b 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -62,6 +62,16 @@ void VScal(const double* a, const double* x, double* y, int n) { } } +template <> +void VExp(const float* x, float* y, int n) { + platform::dynload::vsExp(n, x, y); +} + +template <> +void VExp(const double* x, double* y, int n) { + platform::dynload::vdExp(n, x, y); +} + // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> bool VMulKernel::UseMe(int d) const { @@ -78,6 +88,21 @@ bool VScalKernel::UseMe(int d) const { return platform::MayIUse(platform::avx512f) && d > 512; } +template <> +bool VExpKernel::UseMe(int d) const { + return d > 7; +} + +template <> +bool VSigmoidKernel::UseMe(int d) const { + return d > 7; +} + +template <> +bool VTanhKernel::UseMe(int d) const { + return d > 7; +} + #define AWALYS_USE_ME_WITH_DOUBLE(func) \ template <> \ bool func##Kernel::UseMe(int d) const { \ @@ -87,6 +112,9 @@ bool VScalKernel::UseMe(int d) const { AWALYS_USE_ME_WITH_DOUBLE(VMul); AWALYS_USE_ME_WITH_DOUBLE(VAdd); AWALYS_USE_ME_WITH_DOUBLE(VScal); +AWALYS_USE_ME_WITH_DOUBLE(VExp); +AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); +AWALYS_USE_ME_WITH_DOUBLE(VTanh); #undef AWALYS_USE_ME_WITH_DOUBLE } // namespace mkl @@ -104,5 +132,8 @@ namespace mkl = paddle::operators::jit::more::mkl; REGISTER_MKL_KERNEL(vmul, VMul); REGISTER_MKL_KERNEL(vadd, VAdd); REGISTER_MKL_KERNEL(vscal, VScal); +REGISTER_MKL_KERNEL(vexp, VExp); +REGISTER_MKL_KERNEL(vsigmoid, VSigmoid); +REGISTER_MKL_KERNEL(vtanh, VTanh); #undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 84a93f408f..bf209d2f9d 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -32,6 +32,34 @@ void VAdd(const T* x, const T* y, T* z, int n); template void VScal(const T* a, const T* x, T* y, int n); +template +void VExp(const T* x, T* y, int n); + +template +void VSigmoid(const T* x, T* y, int n) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(0) - y[i]; + } + VExp(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } +} + +template +void VTanh(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * x[i]; + } + VSigmoid(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * y[i] - static_cast(1); + } +} + #define DECLARE_MKL_KERNEL(name, tuples) \ template \ class name##Kernel : public KernelImpl> { \ @@ -47,6 +75,11 @@ DECLARE_MKL_KERNEL(VAdd, XYZNTuples); // AXYN DECLARE_MKL_KERNEL(VScal, AXYNTuples); +// XYN +DECLARE_MKL_KERNEL(VExp, XYNTuples); +DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); +DECLARE_MKL_KERNEL(VTanh, XYNTuples); + #undef DECLARE_MKL_KERNEL } // namespace mkl diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 62d4cdc19a..e211276d18 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -312,7 +312,7 @@ void TestXYNKernel() { std::vector x(d), yref(d); std::vector xinp(d); // inplace test - RandomVec(d, x.data()); + RandomVec(d, x.data(), -2.f, 2.f); std::copy(x.begin(), x.end(), xinp.begin()); const T* x_data = x.data(); diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index 7945cfb253..1f97ed1e62 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -30,47 +30,6 @@ namespace operators { namespace math { namespace jitkernel { -#ifdef PADDLE_WITH_MKLML -// try to use MKL to speedup -template -void VExpMKL(const T* x, T* y, int n); - -template <> -void VExpMKL(const float* x, float* y, int n) { - platform::dynload::vsExp(n, x, y); -} - -template <> -void VExpMKL(const double* x, double* y, int n) { - platform::dynload::vdExp(n, x, y); -} - -template -void VSigmoidMKL(const T* x, T* y, int n) { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); - y[i] = static_cast(0) - y[i]; - } - VExpMKL(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(1) / (static_cast(1) + y[i]); - } -} - -template -void VTanhMKL(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoidMKL(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} -#endif - /* VExp JitKernel */ template class VExpKernelImpl : public VExpKernel { From a985949be99266a12003071bcadec2d9f7785d58 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Fri, 14 Dec 2018 19:21:40 +0800 Subject: [PATCH 344/719] Fea/fuse conv elementwise add fuse (#14669) --- paddle/fluid/framework/ir/CMakeLists.txt | 2 + .../ir/conv_elementwise_add2_act_fuse.cc | 106 ++++++++++++++++ .../ir/conv_elementwise_add2_act_fuse_pass.cc | 105 ++++++++++++++++ .../ir/conv_elementwise_add2_act_fuse_pass.h | 33 +++++ .../ir/conv_elementwise_add_act_fuse_pass.cc | 104 ++++++++++++++++ .../ir/conv_elementwise_add_act_fuse_pass.h | 33 +++++ .../framework/ir/graph_pattern_detector.cc | 113 +++++++++++++++++- .../framework/ir/graph_pattern_detector.h | 45 +++++++ .../api/analysis_predictor_tester.cc | 7 +- .../fluid/inference/api/paddle_pass_builder.h | 5 +- paddle/fluid/inference/io.cc | 2 +- .../inference/tests/api/trt_models_tester.cc | 25 +++- .../operators/controlflow/CMakeLists.txt | 2 +- paddle/fluid/operators/conv_op.cc | 4 +- paddle/fluid/platform/device_context.cc | 1 + 15 files changed, 580 insertions(+), 7 deletions(-) create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 883575e41d..be4151b54b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -42,6 +42,8 @@ pass_library(multi_batch_merge_pass base) pass_library(conv_bn_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(is_test_pass base) +pass_library(conv_elementwise_add_act_fuse_pass inference) +pass_library(conv_elementwise_add2_act_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc new file mode 100644 index 0000000000..6e9905b7ec --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(elementwise_add_op_1); \ + GET_IR_NODE(elementwise_add_in_y_1); \ + GET_IR_NODE(elementwise_add_out_1); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. +framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& bias1, const std::string& activation, + const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {bias1}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_cudnn", false); + + return *desc.Proto(); +} + +std::unique_ptr ConvElementwiseAddActFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( + "conv2d", "Input"); + + patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string bias1_name = elementwise_add_in_y_1->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name, + act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + auto new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // ResidualData + IR_NODE_LINK_TO(new_conv_op, act_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), + {conv_op, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out}); + }; + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAdd2ActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc new file mode 100644 index 0000000000..23f343f631 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" +#include + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(elementwise_add_op_1); \ + GET_IR_NODE(elementwise_add_in_y_1); \ + GET_IR_NODE(elementwise_add_out_1); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. +framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& bias1, const std::string& activation, + const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {bias1}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + + return *desc.Proto(); +} + +std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( + "conv2d", "Input"); + + patterns::ConvElementwiseadd2Act pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string bias1_name = elementwise_add_in_y_1->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name, + act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, conv_op); // Input + IR_NODE_LINK_TO(conv_filter, conv_op); // Filter + IR_NODE_LINK_TO(conv_op, conv_out); // Output + IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), + {conv_op, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out}); + }; + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAdd2ActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h new file mode 100644 index 0000000000..3b40a5a926 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAdd2ActFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAdd2ActFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc new file mode 100644 index 0000000000..fe3b4fca79 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. +framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& activation, const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetType("conv2d_fusion"); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_cudnn", false); + desc.Flush(); + return *desc.Proto(); +} + +std::unique_ptr ConvElementwiseAddActFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("conv2d", "Input") + ->AsInput(); + + patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = + PrepareOpDesc(base_op_desc, bias_name, act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, act_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op, + elementwise_add_out, act_op}); + }; + + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAddActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h new file mode 100644 index 0000000000..ac69aa6458 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAddActFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAddActFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 0118019df2..bf12d12459 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -17,6 +17,7 @@ #include #include +#include "graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_traits.h" @@ -25,6 +26,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/pretty_log.h" #include "paddle/fluid/string/printf.h" + namespace paddle { namespace framework { namespace ir { @@ -104,7 +106,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { for (auto &node : GraphTraits::DFS(graph)) { for (const auto &pdnode : pattern_.nodes()) { if (pdnode->Tell(&node)) { - VLOG(4) << "pdnode " << pdnode->name() << " marked"; + VLOG(4) << "Node " << node.Name() << " marked as " << pdnode->name(); pdnodes2nodes_[pdnode.get()].insert(&node); } } @@ -1099,6 +1101,115 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } + +std::unordered_set conv_act_set({"identity", "sigmoid", "relu", + "relu6", "relux", "tanh", + "band_pass"}); + +PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { + conv_in->AsInput(); + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + auto act_op = pattern->NewNode(act_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + auto op_type = node->Name(); + return conv_act_set.count(op_type); + }); + + auto act_out = pattern->NewNode(act_out_repr()) + ->assert_is_var() + // is activation op's output. + ->assert_more([&](Node *node) { + for (auto *in_op : node->inputs) { + if (conv_act_set.count(in_op->Name())) { + return true; + } + } + return false; + }) + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}); + conv_out->LinksFrom({conv_op}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + act_op->LinksFrom({elementwise_add_out}).LinksTo({act_out}); + + return act_out; +} + +PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + + auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + auto act_op = pattern->NewNode(act_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + auto op_type = node->Name(); + return conv_act_set.count(op_type); + }); + auto act_out = pattern->NewNode(act_out_repr()) + ->assert_is_var() + // is activation op's output. + ->assert_more([&](Node *node) { + for (auto *in_op : node->inputs) { + if (conv_act_set.count(in_op->Name())) { + return true; + } + } + return false; + }) + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + elementwise_add_op_1->LinksFrom( + {elementwise_add_out, elementwise_add_in_y_1}); + act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out}); + return act_out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d044802f22..0fee2f1c18 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -671,6 +671,51 @@ struct ElementwiseAdd : public PatternBase { PATTERN_DECL_NODE(elementwise_add_y); PATTERN_DECL_NODE(elementwise_add_out); }; + +// Conv + ElementwiseAdd + an activation +// This pattern can futher fuse the conv related ops after the conv+bn fusion. +struct ConvElementwiseaddAct : public PatternBase { + ConvElementwiseaddAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_elementwiseadd_act") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(conv_filter); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); // input + PATTERN_DECL_NODE(elementwise_add_out); + + PATTERN_DECL_NODE(act_op); + PATTERN_DECL_NODE(act_out); +}; + +// Conv + ElementwiseAdd + ElementwiseAdd + Activation +struct ConvElementwiseadd2Act : public PatternBase { + ConvElementwiseadd2Act(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, + "conv_elementwiseadd2_elementwiseadd_act") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_filter); + PATTERN_DECL_NODE(conv_out); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); // input + PATTERN_DECL_NODE(elementwise_add_out); + + PATTERN_DECL_NODE(elementwise_add_op_1); + PATTERN_DECL_NODE(elementwise_add_in_y_1); // input + PATTERN_DECL_NODE(elementwise_add_out_1); + + PATTERN_DECL_NODE(act_op); + PATTERN_DECL_NODE(act_out); +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index d67305670c..a361b34437 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -55,7 +55,12 @@ TEST(AnalysisPredictor, analysis_off) { } TEST(AnalysisPredictor, analysis_on) { - AnalysisConfig config(false); +#ifdef PADDLE_WITH_CUDA + AnalysisConfig config(true); + config.fraction_of_gpu_memory = 0.15; +#else + AnalysisConfig config; +#endif config.model_dir = FLAGS_dirname; config.enable_ir_optim = true; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index bc5139a7e5..e6e7de2478 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -118,7 +118,10 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", "conv_bn_fuse_pass", + "infer_clean_graph_pass", // + "conv_bn_fuse_pass", // + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // }); } diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 24d15f12f9..ae72a74acc 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -79,7 +79,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, for (auto* var : global_block.AllVars()) { if (IsPersistable(var)) { - VLOG(3) << "persistable variable's name: " << var->Name(); + VLOG(4) << "persistable variable's name: " << var->Name(); framework::VarDesc* new_var = load_block->Var(var->Name()); new_var->SetShape(var->GetShape()); diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 9eb3fb5da1..d3bd035c1c 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -78,6 +78,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { std::vector outputs; if (use_analysis || use_tensorrt) { contrib::AnalysisConfig config(true); + config.pass_builder()->TurnOnDebug(); SetConfig(&config, model_dir, true, use_tensorrt, FLAGS_batch_size); TestPrediction(reinterpret_cast(&config), @@ -141,9 +142,31 @@ TEST(TensorRT_resnext50, profile) { profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); } +TEST(resnext50, compare_analysis_native) { + std::string model_dir = FLAGS_infer_model + "/resnext50"; + compare(model_dir, false /*use tensorrt*/); +} + TEST(TensorRT_mobilenet, analysis) { std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; - compare(model_dir, /* use_tensorrt */ false); + compare(model_dir, false /* use_tensorrt */); +} + +TEST(AnalysisPredictor, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; + AnalysisConfig config(true); + config.model_dir = model_dir; + config.fraction_of_gpu_memory = 0.15; + config.pass_builder()->TurnOnDebug(); + + std::vector> inputs_all; + auto predictor = CreatePaddlePredictor(config); + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + + std::vector outputs; + for (auto& input : inputs_all) { + ASSERT_TRUE(predictor->Run(input, &outputs)); + } } } // namespace inference diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index b1c2ee2295..b614e9b035 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -1,4 +1,4 @@ include(operators) -register_operators() +register_operators(DEPS naive_executor) file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index d7b8766288..b09e527b90 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -44,7 +44,9 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { std::vector dilations = ctx->Attrs().Get>("dilations"); PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, - "Conv intput should be 4-D or 5-D tensor."); + "Conv intput should be 4-D or 5-D tensor, get %u", + in_dims.size()); + PADDLE_ENFORCE_EQ( in_dims.size(), filter_dims.size(), "Conv input dimension and filter dimension should be the same."); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index bd81d4dd1f..d2e23d80f4 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -3,6 +3,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. From 787d837f503a43f5bd2d8dfe5e5c2417a55084c7 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 14 Dec 2018 19:35:48 +0800 Subject: [PATCH 345/719] fix test=develop --- paddle/scripts/paddle_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a1c1886c7f..0fc43f33d0 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -518,7 +518,7 @@ function assert_api_spec_approvals() { fi done - HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep const_cast` + HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep const_cast || true` if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` From fd0a954fbf081021b6b71c03d274ee3ea870ec6c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 14 Dec 2018 11:47:43 +0000 Subject: [PATCH 346/719] enable blas jitcode vmul, vadd, vaddrelu, vscal and vaddbias --- paddle/fluid/operators/jit/gen/CMakeLists.txt | 5 +++ paddle/fluid/operators/jit/gen/blas.cc | 38 +++++++++++++------ paddle/fluid/operators/jit/gen/blas.h | 25 +++++++++--- 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 98d9231faa..ef74a7118b 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -10,3 +10,8 @@ endfunction() # use gen jitcode kernel by name USE_JITKERNEL_GEN(vmul) +USE_JITKERNEL_GEN(vadd) +#USE_JITKERNEL_GEN(vsub) # TODO(TJ): enable me +USE_JITKERNEL_GEN(vaddrelu) +USE_JITKERNEL_GEN(vscal) +USE_JITKERNEL_GEN(vaddbias) diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index 3e5ce54064..b24f44c9f3 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -104,18 +104,28 @@ void VXXJitCode::genCode() { ret(); } -class VMulCreator : public JitCodeCreator { - public: - bool UseMe(const int& attr) const override { - return platform::MayIUse(platform::avx); +#define DECLARE_BLAS_CREATOR(name) \ + class name##Creator : public JitCodeCreator { \ + public: \ + bool UseMe(const int& attr) const override { \ + return platform::MayIUse(platform::avx); \ + } \ + size_t CodeSize(const int& d) const override { \ + return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \ + } \ + std::unique_ptr CreateJitCode(const int& attr) const override { \ + return make_unique(attr, CodeSize(attr)); \ + } \ } - size_t CodeSize(const int& d) const override { - return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; - } - std::unique_ptr CreateJitCode(const int& attr) const override { - return make_unique(attr, CodeSize(attr)); - } -}; + +DECLARE_BLAS_CREATOR(VMul); +DECLARE_BLAS_CREATOR(VAdd); +DECLARE_BLAS_CREATOR(VSub); +DECLARE_BLAS_CREATOR(VAddRelu); +DECLARE_BLAS_CREATOR(VScal); +DECLARE_BLAS_CREATOR(VAddBias); + +#undef DECLARE_BLAS_CREATOR } // namespace gen } // namespace jit @@ -125,3 +135,9 @@ class VMulCreator : public JitCodeCreator { namespace gen = paddle::operators::jit::gen; REGISTER_JITKERNEL_GEN(vmul, gen::VMulCreator); +REGISTER_JITKERNEL_GEN(vadd, gen::VAddCreator); +// TODO(TJ): enable sub +// REGISTER_JITKERNEL_GEN(vsub, gen::VSubCreator); +REGISTER_JITKERNEL_GEN(vaddrelu, gen::VAddReluCreator); +REGISTER_JITKERNEL_GEN(vscal, gen::VScalCreator); +REGISTER_JITKERNEL_GEN(vaddbias, gen::VAddBiasCreator); diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h index 60f3280567..5a2192052f 100644 --- a/paddle/fluid/operators/jit/gen/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -15,6 +15,7 @@ #pragma once #include +#include "glog/logging.h" #include "paddle/fluid/operators/jit/gen/jitcode.h" namespace paddle { @@ -33,6 +34,9 @@ class VXXJitCode : public JitCode { type_(type), scalar_index_(scalar_index), with_relu_(with_relu) { + if (!(type_ == operand_type::mul || type_ == operand_type::add)) { + LOG(FATAL) << "Do not support this operand type: " << type_; + } this->genCode(); } @@ -78,11 +82,22 @@ class VXXJitCode : public JitCode { ymm_t ymm_zero = ymm_t(3); }; -class VMulJitCode : public VXXJitCode { - public: - explicit VMulJitCode(int d, size_t code_size, void* code_ptr = nullptr) - : VXXJitCode(d, operand_type::mul, 0, false, code_size, code_ptr) {} -}; +#define DECLARE_BLAS_JITCODE(name, op_type, scalar_idx, with_relu) \ + class name##JitCode : public VXXJitCode { \ + public: \ + explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \ + : VXXJitCode(d, op_type, scalar_idx, with_relu, code_size, code_ptr) { \ + } \ + }; + +DECLARE_BLAS_JITCODE(VMul, operand_type::mul, 0, false); +DECLARE_BLAS_JITCODE(VAdd, operand_type::add, 0, false); +DECLARE_BLAS_JITCODE(VSub, operand_type::sub, 0, false); +DECLARE_BLAS_JITCODE(VAddRelu, operand_type::add, 0, true); +DECLARE_BLAS_JITCODE(VScal, operand_type::mul, 1, false); +DECLARE_BLAS_JITCODE(VAddBias, operand_type::add, 1, false); + +#undef DECLARE_BLAS_JITCODE } // namespace gen } // namespace jit From 37c2e24511a29a2b23e18869b51f8edf805cead3 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 14 Dec 2018 19:48:34 +0800 Subject: [PATCH 347/719] Update README.md --- README.md | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/README.md b/README.md index c535e9514e..32a302cc54 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,15 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. +欢迎来到 PaddlePaddle GitHub + +PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。 + +我们的愿景是让每个人都能通过PaddlePaddle接触深度学习 + +跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) + + ### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) ### Install Latest Stable Release: ``` @@ -34,6 +43,23 @@ pip install paddlepaddle-gpu==1.2.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` + +### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) +### 安装最新稳定版本: +``` +# Linux CPU +pip install paddlepaddle +# Linux GPU cuda9cudnn7 +pip install paddlepaddle-gpu +# Linux GPU cuda8cudnn7 +pip install paddlepaddle-gpu==1.2.0.post87 +# Linux GPU cuda8cudnn5 +pip install paddlepaddle-gpu==1.2.0.post85 + +# 其他平台上的安装指引请参考 http://paddlepaddle.org/ +``` + + ## Features - **Flexibility** @@ -74,10 +100,38 @@ pip install paddlepaddle-gpu==1.2.0.post85 Baidu and it has achieved a significant impact. We hope you can also explore the capability of PaddlePaddle to make an impact on your product. +## 特点 + +- **灵活性** + + PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。 + +- **高效性** + + 为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例: + + - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。 + - 通过MKL-DNN库优化CNN网络 + - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列 + - 针对高维稀疏数据模型,优化了局部和分布式训练。 + + +- **稳定性** + + 有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。 + +- **连接产品** + + 另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。 + ## Installation It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. +## 安装 + +推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) + ## Documentation We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and @@ -99,10 +153,37 @@ We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarte We appreciate your contributions! +## 文档 + +我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和 +[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 + +- [深度学习101](https://github.com/PaddlePaddle/book) + + 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 + +- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) + + 可以在MPI集群上运行分布式训练任务 + +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) + + 新的API支持代码更少更简洁的程序 + +- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) + + 欢迎您的贡献! ## Ask Questions You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues). +## 答疑 + +欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交 + ## Copyright and License PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). + +## 版权和许可证 +PaddlePaddle由[Apache-2.0 license](LICENSE)提供 From 0b1c7d838cfeb2e2000839f173a9be2d641f3d47 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 14 Dec 2018 20:01:00 +0800 Subject: [PATCH 348/719] Add brpc serialization support. (#11430) --- benchmark/fluid/fluid_benchmark.py | 4 +- cmake/external/brpc.cmake | 20 +- cmake/external/gtest.cmake | 10 +- cmake/external/leveldb.cmake | 4 +- paddle/fluid/framework/CMakeLists.txt | 9 +- paddle/fluid/framework/details/CMakeLists.txt | 11 +- paddle/fluid/framework/executor.cc | 6 +- .../operators/distributed/CMakeLists.txt | 31 +- .../operators/distributed/brpc_client.cc | 371 +++++++++++++++--- .../fluid/operators/distributed/brpc_client.h | 99 ++++- .../operators/distributed/brpc_rdma_pool.cc | 84 ++++ .../operators/distributed/brpc_rdma_pool.h | 56 +++ .../distributed/brpc_sendrecvop_utils.cc | 196 +++++++++ .../distributed/brpc_sendrecvop_utils.h | 49 +++ .../operators/distributed/brpc_serde_test.cc | 175 +++++++++ .../operators/distributed/brpc_server.cc | 264 +++++++++++-- .../distributed/brpc_variable_response.cc | 73 ++++ .../distributed/brpc_variable_response.h | 67 ++++ .../operators/distributed/grpc_client.cc | 3 +- .../fluid/operators/distributed/grpc_serde.cc | 7 - .../fluid/operators/distributed/rpc_server.h | 4 + .../operators/distributed/sendrecvop_utils.cc | 2 +- .../operators/distributed/sendrecvop_utils.h | 7 + .../operators/distributed_ops/CMakeLists.txt | 4 +- .../distributed_ops/listen_and_serv_op.cc | 7 +- .../operators/distributed_ops/send_op.cc | 2 + paddle/fluid/pybind/pybind.cc | 9 + python/paddle/fluid/__init__.py | 1 + 28 files changed, 1422 insertions(+), 153 deletions(-) create mode 100644 paddle/fluid/operators/distributed/brpc_rdma_pool.cc create mode 100644 paddle/fluid/operators/distributed/brpc_rdma_pool.h create mode 100644 paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc create mode 100644 paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h create mode 100644 paddle/fluid/operators/distributed/brpc_serde_test.cc create mode 100644 paddle/fluid/operators/distributed/brpc_variable_response.cc create mode 100644 paddle/fluid/operators/distributed/brpc_variable_response.h diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 5f3ce300ac..10b633a4fc 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -81,9 +81,11 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog): # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") - config = distribute_transpiler.DistributeTranspilerConfig() + config = fluid.DistributeTranspilerConfig() config.slice_var_up = not args.no_split_var + config.min_block_size = 1048576 t = distribute_transpiler.DistributeTranspiler(config=config) + t.transpile( trainer_id, # NOTE: *MUST* use train_prog, for we are using with guard to diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 30b227b645..6b50cff7a6 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -14,14 +14,16 @@ INCLUDE(ExternalProject) -find_library(SSL_LIBRARY NAMES ssl) +find_package(OpenSSL REQUIRED) + +message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY}) +message(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY}) + ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY}) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY}) -find_library(CRYPTO_LIBRARY NAMES crypto) ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY}) - +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY}) SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc) SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) @@ -31,14 +33,15 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args -set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib") +set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog") # If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF ExternalProject_Add( extern_brpc ${EXTERNAL_PROJECT_LOG_ARGS} + # TODO(gongwb): change to de newst repo when they changed. GIT_REPOSITORY "https://github.com/gongweibao/brpc" - GIT_TAG "7dc04defad1fd4173aae170c3fcbde131b65155a" + GIT_TAG "e9b67ec1b7458f2af5fae76451afe1e27e01b4b4" PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -50,7 +53,7 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_PREFIX_PATH=${prefix_path} - -DBRPC_WITH_GLOG=ON + -DWITH_GLOG=ON -DIOBUF_WITH_HUGE_BLOCK=ON -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA} ${EXTERNAL_OPTIONAL_ARGS} @@ -65,5 +68,6 @@ ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) ADD_DEPENDENCIES(brpc extern_brpc) +add_definitions(-DBRPC_WITH_GLOG) LIST(APPEND external_project_dependencies brpc) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 4fe9c13fb7..9be625b620 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -IF(WITH_TESTING) - ENABLE_TESTING() +#FIXME:(gongwb) Move brpc's gtest dependency. +IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) + IF(WITH_TESTING) + ENABLE_TESTING() + ENDIF(WITH_TESTING) + INCLUDE(ExternalProject) SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest) @@ -76,4 +80,4 @@ IF(WITH_TESTING) ADD_DEPENDENCIES(gtest_main extern_gtest) LIST(APPEND external_project_dependencies gtest gtest_main) -ENDIF(WITH_TESTING) +ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake index fb5091731d..0df61b01ab 100644 --- a/cmake/external/leveldb.cmake +++ b/cmake/external/leveldb.cmake @@ -24,8 +24,8 @@ ExternalProject_Add( extern_leveldb ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${LEVELDB_SOURCES_DIR} - URL "https://github.com/google/leveldb/archive/v1.18.tar.gz" - URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0" + GIT_REPOSITORY "https://github.com/google/leveldb" + GIT_TAG v1.18 CONFIGURE_COMMAND "" BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6d7a69c8c9..cea4a44857 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -169,9 +169,12 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog + lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper) + + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + else() if(WITH_NGRAPH) if(NOT WIN32) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index a927a3afcd..97f7713d97 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -12,12 +12,19 @@ cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) +if(WITH_DISTRIBUTE) + if(NOT WITH_GRPC) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + endif() +endif() + if(WITH_GPU) nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda variable_visitor) if(WITH_DISTRIBUTE) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim dynload_cuda selected_rows_functor sendrecvop_grpc) + ddim dynload_cuda selected_rows_functor sendrecvop_rpc) else() nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda selected_rows_functor) @@ -30,7 +37,7 @@ else() variable_visitor) if(WITH_DISTRIBUTE) cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim selected_rows_functor sendrecvop_grpc) + ddim selected_rows_functor sendrecvop_rpc) else() cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim selected_rows_functor) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 0c4bd336c5..8c3912120b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -157,9 +157,9 @@ void Executor::Close() { #ifdef PADDLE_WITH_DISTRIBUTE // TODO(typhoonzero): complete message will need to use real trainer_id, // except 0. - ::paddle::operators::distributed::RPCClient::GetInstance< - ::paddle::operators::distributed::GRPCClient>(0) - ->SendComplete(); + auto client = + paddle::operators::distributed::RPCClient::GetInstance(0); + client->SendComplete(); #endif } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 101dbe9c89..eab4297c73 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -12,7 +12,7 @@ configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @O set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") if(WITH_GRPC) - grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc + grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows_functor memory) @@ -20,36 +20,43 @@ if(WITH_GRPC) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(grpc_serde_test SRCS grpc_serde_test.cc - DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL) cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) + DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor selected_rows_functor scope math_function SERIAL) endif() - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory) + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) else() - set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc - brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc + brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc + collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc - brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc + brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc + brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc collective_client.cc collective_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows memory) - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory) + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) - set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy) + set(brpc_test_depends sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor + proto_desc lookup_sparse_table_op snappystream snappy zlib) - cc_test(brpc_server_test SRCS rpc_server_test.cc + cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS ${brpc_test_depends} SERIAL) cc_test(brpc_serde_test SRCS brpc_serde_test.cc DEPS ${brpc_test_depends} SERIAL) + + if(WITH_GPU) + cc_test(collective_server_test SRCS collective_server_test.cc + DEPS ${brpc_test_depends} selected_rows_functor scope math_function SERIAL) + endif() endif() diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc index 350969f74b..62e32977b8 100644 --- a/paddle/fluid/operators/distributed/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc_client.cc @@ -14,135 +14,316 @@ #include "paddle/fluid/operators/distributed/brpc_client.h" #include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { namespace distributed { -DEFINE_int32(brpc_channel_num, 24, - "Number of channels to send requests connected to one server"); DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); BRPCClient::~BRPCClient() { Wait(); } -void HandleSendResponse(brpc::Controller* cntl, - sendrecv::VoidMessage* response) { +void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls) { // std::unique_ptr makes sure cntl/response will be deleted before returning. std::unique_ptr cntl_guard(cntl); std::unique_ptr response_guard(response); + // this channel can be used by other now. + ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { - LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + LOG(FATAL) << "Fail to send SendVar: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + var_h->Finish(false); + cls->DecreaseReqCount(); return; } - LOG(INFO) << "Received response from " << cntl->remote_side() - << " latency=" << cntl->latency_us() << "us"; + var_h->Finish(true); + cls->DecreaseReqCount(); + + VLOG(4) << "HandleSendResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + VLOG(4) << "Finish HandleSendResponse"; } -bool BRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); + const std::string method = "SendRPC"; + VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); - framework::AsyncIO( - [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] { - auto ch_ctx = ch_ptr->Pop(); - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); + auto* var = p_scope->FindVar(var_name_val); + sendrecv::VariableMessage request; + distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request, + &cntl->request_attachment(), "", false, + trainer_id_); - google::protobuf::Closure* done = - brpc::NewCallback(&HandleSendResponse, cntl, response); + google::protobuf::Closure* done = brpc::NewCallback( + &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - sendrecv::VariableMessage request; - ch_ctx->stub->SendVariable(cntl, &request, response, done); - }); + platform::RecordRPCEvent record_event(method, p_ctx); + + ch_ctx->stub->SendVariable(cntl, &request, response, done); + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; } +void HandleFetchBarrierResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + // this channel can be used other now. + ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { + LOG(FATAL) << "Fail to get HandleFetchBarrierResponse: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + var_h->Finish(false); + cls->DecreaseReqCount(); + return; + } + + var_h->Finish(true); + cls->DecreaseReqCount(); + + VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + VLOG(4) << "Finish HandleFetchBarrierResponse"; +} void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response) { + sendrecv::VariableMessage* response, VarHandlePtr var_h, + ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx, + BRPCClient* cls) { // std::unique_ptr makes sure cntl/response will be deleted before returning. std::unique_ptr cntl_guard(cntl); std::unique_ptr response_guard(response); + // this channel can be used other now. + ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { - LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + LOG(FATAL) << "Fail to GetVar: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + cls->DecreaseReqCount(); + var_h->Finish(false); return; } - LOG(INFO) << "Received response from " << cntl->remote_side() - << " latency=" << cntl->latency_us() << "us"; - // framework::Variable* outvar = nullptr; - // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); + VLOG(4) << "HandleGetResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + + framework::Variable* outvar = nullptr; + int trainer_id; + distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(), + *var_h->ctx(), var_h->scope(), &outvar, + &trainer_id); + VLOG(4) << "Finish HandleGetResponse"; + cls->DecreaseReqCount(); + var_h->Finish(true); } -bool BRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& method_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); + const auto ch_ptr = GetChannel(ep_val); + const std::string method = "GetRPC"; + VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); - framework::AsyncIO( - [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {}); + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_trainer_id(trainer_id_); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + platform::RecordRPCEvent record_event(method, p_ctx); + + if (method_name == "GetMonomerVariable") { + ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); + } else { + ch_ctx->stub->GetVariable(cntl, &req, response, done); + } + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; +} + +VarHandlePtr BRPCClient::AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out); +} + +VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, + const std::string& var_name, + int64_t time_out) { + return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out); } -bool BRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out) { +VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out); +} + +VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string in_var_name_val = in_var_name; const std::string out_var_name_val = out_var_name; + const std::string table_name_val = table_name; const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); + const auto ch_ptr = GetChannel(ep_val); + + const std::string method = "PrefetchRPC"; + + VarHandlePtr var_h( + new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); + + auto* var = p_scope->FindVar(in_var_name_val); + sendrecv::VariableMessage req; + distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req, + &cntl->request_attachment(), out_var_name_val, + false, 0, table_name_val); + + platform::RecordRPCEvent record_event(method, p_ctx); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] {}); + ch_ctx->stub->PrefetchVariable(cntl, &req, response, done); + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; } -void BRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - req_count_++; +VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { + return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE, + time_out); } -void BRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { +VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { + auto ch_ptr = GetChannel(ep); + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); + + sendrecv::VariableMessage req; + req.set_varname(FETCH_BARRIER_MESSAGE); + + const std::string method = "FetchBarrierRPC"; + // var handle + VarHandlePtr var_h( + new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); + + platform::RecordRPCEvent record_event(method, nullptr); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + ch_ctx->stub->GetVariable(cntl, &req, response, done); + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + + return var_h; } -void BRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); +bool BRPCClient::Wait() { + VLOG(9) << "begin to brpcclient wait"; + { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); + } + VLOG(9) << "end to brpcclient wait"; + return true; } ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { + VLOG(4) << "begin to GetChannel:" << ep; { std::lock_guard guard(chan_mutex_); auto it = channels_.find(ep); if (it != channels_.end()) { + VLOG(4) << "end to GetChannel:" << ep; return it->second; } } @@ -150,12 +331,20 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { ChannelQueuePtr q(new framework::BlockingQueue()); brpc::ChannelOptions options; +#ifdef PADDLE_WITH_BRPC_RDMA + options.use_rdma = true; +#endif options.protocol = "baidu_std"; - options.connection_type = "pooled"; - options.connect_timeout_ms = 100; + // don't use pooled type. the server can't afford that. + options.connection_type = "single"; + options.connect_timeout_ms = 1000; options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; options.max_retry = FLAGS_max_retry; - for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { + + VLOG(1) << "create " << brpc_channel_num_per_server_ + << " brpc channels to pserver:" << ep; + + for (int i = 0; i < brpc_channel_num_per_server_; ++i) { std::shared_ptr c(new ChannelContext()); if (c->channel.Init(ep.c_str(), &options) != 0) { LOG(FATAL) << "Fail to initialize channel"; @@ -172,9 +361,75 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { channels_[ep] = q; } + VLOG(4) << "end to GetChannel:" << ep; return q; } +VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, + int64_t time_out) { + return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out); +} + +void BRPCClient::SendComplete() { + for (auto& kv : channels_) { + AsyncSendComplete(kv.first); + } +} + +VarHandlePtr BRPCClient::AsyncSendVarMessage( + const std::string& ep, const std::string& method_name, + const sendrecv::VariableMessage& req, int64_t time_out) { + auto ch_ptr = GetChannel(ep); + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); + + platform::RecordRPCEvent record_event(method_name, nullptr); + + VarHandlePtr var_h( + new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + if (method_name == "CheckPointNotifyRPC") { + ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); + } else if (method_name == "GetMonomerBarrier") { + ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); + } else { + ch_ctx->stub->SendVariable(cntl, &req, response, done); + } + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + + return var_h; +} + +VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, + const std::string& method_name, + const std::string& message, + int64_t time_out) { + sendrecv::VariableMessage req; + req.set_varname(message); + + return AsyncSendVarMessage(ep, method_name, req, time_out); +} + +VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { + sendrecv::VariableMessage req; + req.set_varname(CHECKPOINT_SAVE_MESSAGE); + req.set_out_varname(dir); + + return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out); +} + } // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc_client.h index 8ff1f0a607..80cc81bff3 100644 --- a/paddle/fluid/operators/distributed/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc_client.h @@ -31,6 +31,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN @@ -53,33 +55,94 @@ class BRPCClient : public RPCClient { BRPCClient() {} virtual ~BRPCClient(); - bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline) override; - void Wait() override; + VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; + + bool Wait() override; + + void SendComplete() override; private: + VarHandlePtr _AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& method_name, + int64_t time_out = FLAGS_rpc_deadline); + void Proceed(); ChannelQueuePtr GetChannel(const std::string& ep); + VarHandlePtr AsyncSendComplete(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline); + + VarHandlePtr AsyncSendMessage(const std::string& ep, + const std::string& method_name, + const std::string& message, int64_t time_out); + + VarHandlePtr AsyncSendVarMessage(const std::string& ep, + const std::string& method_name, + const sendrecv::VariableMessage& req, + int64_t time_out); + + friend void HandleSendResponse(brpc::Controller* cntl, + sendrecv::VoidMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls); + + friend void HandleGetResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls); + + friend void HandleFetchBarrierResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, + ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, + BRPCClient* cls); + void DecreaseReqCount() { + if (--req_count_ <= 0) { + sync_cond_.notify_all(); + } + } + private: std::unordered_map channels_; @@ -88,6 +151,8 @@ class BRPCClient : public RPCClient { std::condition_variable sync_cond_; std::atomic req_count_{0}; + static constexpr int brpc_channel_num_per_server_ = 4; + // mutex for GetChannel thread safety std::mutex chan_mutex_; DISABLE_COPY_AND_ASSIGN(BRPCClient); diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc_rdma_pool.cc new file mode 100644 index 0000000000..e1be5673df --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_rdma_pool.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_BRPC_RDMA + +#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" +#include "brpc/channel.h" +#include "brpc/rdma/rdma_helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace distributed { + +RdmaMemPool& RdmaMemPool::Instance() { + static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool(); + return *g_rdma_mem_pool; +} + +void* RdmaMemPool::Find(const std::string& varname, int64_t size) { + pthread_rwlock_rdlock(&access_); + auto it = pool_.find(varname); + if (it == pool_.end()) { + pthread_rwlock_unlock(&access_); + return nullptr; + } + + auto info = it->second; + if (info.data_size != size) { + pthread_rwlock_unlock(&access_); + PADDLE_ENFORCE(false, "var:%s size:%ld != %ld", varname, size, + info.data_size); + return nullptr; + } + + pthread_rwlock_unlock(&access_); + return info.data; +} + +void RdmaMemPool::Register(const std::string& varname, void* data, + int64_t data_size) { + void* old = Find(varname, data_size); + if (old != nullptr) { + if (data != old) { + PADDLE_ENFORCE(false, "var:%s data:%ld != %ld", varname, data, old); + } + VLOG(7) << "Find on rdma:" << varname << " data:" << data + << " data_size:" << data_size; + return; + } + + VarInfo info; + info.data = data; + info.data_size = data_size; + + pthread_rwlock_wrlock(&access_); + pool_[varname] = info; + pthread_rwlock_unlock(&access_); + + if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) { + LOG(FATAL) << "register " << varname << " data:" << data + << " data_size:" << data_size << " error"; + } + + VLOG(4) << "register on rdma:" << varname << " data:" << data + << " data_size:" << data_size; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc_rdma_pool.h new file mode 100644 index 0000000000..156a93ec57 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_rdma_pool.h @@ -0,0 +1,56 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_BRPC_RDMA + +#include // NOLINT +#include +#include + +namespace paddle { +namespace operators { +namespace distributed { + +/* + * This class is used to avoid duplicated registion of brpc::rdma. + */ +class RdmaMemPool { + public: + static RdmaMemPool& Instance(); + RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {} + + virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); } + + void Register(const std::string& varname, void* data, int64_t size); + void* Find(const std::string& varname, int64_t size); + + private: + struct VarInfo { + void* data; + int64_t data_size; + + VarInfo() : data(nullptr), data_size(0) {} + }; + + private: + std::unordered_map pool_; + pthread_rwlock_t access_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc new file mode 100644 index 0000000000..6fed9ba92c --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include +#include // NOLINT + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class IOBufWriter { + public: + static void Append(butil::IOBuf* iobuf, int k, const char* v, int64_t vlen) { + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + iobuf->append(v, vlen); + } + + static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v, + int64_t vlen, bool in_cuda_pinned, + void (*destroy)(void*), void* user_data) { + VLOG(7) << "AppendTCPZeroCopy " + << " k:" << k + << " data:" << static_cast(const_cast(v)) + << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + + // FIXME(gongwb): use append_zerocopy + /* + if (in_cuda_pinned) { + iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory); + } else { + iobuf->append_zerocopy(v, vlen, nullptr); + } + */ + iobuf->append(v, vlen); + destroy(user_data); + } + +#ifdef PADDLE_WITH_BRPC_RDMA + static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf, + int k, const char* v, int64_t vlen, + bool in_cuda_pinned, void (*destroy)(void*), + void* user_data) { + VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k + << " data:" << static_cast(const_cast(v)) + << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + + RdmaMemPool::Instance().Register( + varname, static_cast(const_cast(v)), vlen); + + // FIXME(gongwb): use append_zerocopy + // iobuf->append_zerocopy(v, vlen, nullptr); + iobuf->append(v, vlen); + destroy(user_data); + return; + } +#endif + + static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf, + int k, const char* v, int64_t vlen, + bool in_cuda_pinned, void (*destroy)(void*), + void* user_data) { +#ifdef PADDLE_WITH_BRPC_RDMA + IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, + destroy, user_data); +#else + IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy, + user_data); +#endif + } +}; + +void SerializeToIOBuf(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + butil::IOBuf* iobuf, const std::string& out_varname, + bool var_is_not_stable, int trainer_id, + const std::string& table_name) { + std::unique_ptr payload; + + request->set_varname(name); + request->set_trainer_id(trainer_id); + // Note: normally the profiler is enabled in 1 trainer, hence only + // 1 trainer returns true for ShouldSendProfileState(). It tells PS + // servers the trainer's profiling state so that PS can follow the + // trainer. + if (platform::ShouldSendProfileState()) { + if (platform::IsProfileEnabled()) { + request->set_profile(platform::kEnableProfiler); + } else { + request->set_profile(platform::kDisableProfiler); + } + } + if (!out_varname.empty()) { + request->set_out_varname(out_varname); + } + if (!table_name.empty()) { + request->set_table_name(table_name); + } + if (var->IsType()) { + request->set_type(::sendrecv::LOD_TENSOR); + payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request))); + } else if (var->IsType()) { + request->set_type(::sendrecv::SELECTED_ROWS); + payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request))); +#ifdef PADDLE_WITH_CUDA + } else if (var->IsType()) { + request->set_type(::sendrecv::NCCL_ID); + const ncclUniqueId& uid = var->Get(); + // TODO(gongwb): use append_zero to avoid data copy. + IOBufWriter::Append(iobuf, + sendrecv::VariableMessage::kSerializedFieldNumber, + uid.internal, NCCL_UNIQUE_ID_BYTES); + return; +#endif + } else { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + } + + PADDLE_ENFORCE_NOT_NULL(payload); + + // FIXME(gongwb): it seems that can use zero copy. + if (var_is_not_stable) { + IOBufWriter::Append( + iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size()); + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + IOBufWriter::AppendZeroCopy( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size(), + true, SerializeDestroyCallback, static_cast(payload.get())); + payload.release(); +#endif + } else { + IOBufWriter::AppendZeroCopy( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size(), + false, SerializeDestroyCallback, static_cast(payload.get())); + payload.release(); + } + } + + if (var->IsType()) { + auto* slr = var->GetMutable(); + size_t rows_memory_size = + slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + + IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber, + reinterpret_cast(slr->rows().data()), + static_cast(rows_memory_size)); + } +} + +void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta, + const butil::IOBuf& iobuf, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id) { + operators::distributed::BRPCVariableResponse resp(scope, &ctx); + PADDLE_ENFORCE(resp.Parse(iobuf, meta) == 0, "parse iobuf to tensor error!"); + *var = resp.GetVar(); + *trainer_id = resp.GetTrainerId(); +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h new file mode 100644 index 0000000000..ffaf442224 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void SerializeToIOBuf(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + butil::IOBuf* iobuf, const std::string& out_varname, + bool var_is_not_stable, const int trainer_id = 0, + const std::string& table_name = std::string()); + +void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id); + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc_serde_test.cc new file mode 100644 index 0000000000..2a2dc72150 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_serde_test.cc @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT + +#include "brpc/channel.h" +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; + +void RunSerdeTestSelectedRows(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + butil::IOBuf iobuf; + sendrecv::VariableMessage msg; + int tensor_numel = 564 * 128; + + // serialize var to IOBuf + { + framework::Variable var; + auto* slr = var.GetMutable(); + slr->set_height(1000); + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + tensor->Resize(framework::make_ddim({564, 128})); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 32.7); + for (int i = 0; i < 564; ++i) rows->push_back(i); + + operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, + "", false); + } + + // desrialize + { + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::BRPCVariableResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(iobuf, msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto* slr2 = var2->GetMutable(); + auto* tensor2 = slr2->mutable_value(); + auto* rows2 = slr2->mutable_rows(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(*tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2->data()); + } + const int64_t* rows_data2 = rows2->data(); + + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); + } + for (size_t i = 0; i < rows2->size(); ++i) { + EXPECT_EQ(rows_data2[i], static_cast(i)); + } + EXPECT_EQ(slr2->height(), 1000); + } +} + +void RunTestLodTensor(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + // serialize var to ByteBuffer + butil::IOBuf iobuf; + sendrecv::VariableMessage msg; + int tensor_numel = 512 * 8 * 4 * 2; + { + framework::Variable var; + auto* tensor = var.GetMutable(); + tensor->Resize(framework::make_ddim({512, 8, 4, 2})); + framework::LoD lod; + lod.push_back(framework::Vector({1, 3, 8})); + tensor->set_lod(lod); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 31.9); + + operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, + "", false); + } + + // check sendrecv::VariableMessage meta data + { + EXPECT_EQ(msg.varname(), "myvar"); + EXPECT_EQ(msg.type(), 0); + EXPECT_EQ(msg.dims()[0], 512); + EXPECT_EQ(msg.dims()[1], 8); + EXPECT_EQ(msg.dims()[2], 4); + EXPECT_EQ(msg.dims()[3], 2); + EXPECT_EQ(msg.lod_level(), 1); + EXPECT_EQ(msg.lod(0).lod_data(0), 1); + EXPECT_EQ(msg.lod(0).lod_data(1), 3); + EXPECT_EQ(msg.lod(0).lod_data(2), 8); + } + + // deserialize + { + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::BRPCVariableResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(iobuf, msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto tensor2 = var2->Get(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2.data()); + } + + for (int i = 0; i < tensor_numel; ++i) + EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); + } +} + +TEST(LodTensor, Run) { + platform::CPUPlace place; + RunTestLodTensor(place); +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu(0); + RunTestLodTensor(gpu); +#endif +} + +TEST(SelectedRows, Run) { + platform::CPUPlace place; + RunSerdeTestSelectedRows(place); +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu; + RunSerdeTestSelectedRows(gpu); +#endif +} diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc index 862167f020..78d41aeac5 100644 --- a/paddle/fluid/operators/distributed/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc_server.cc @@ -13,84 +13,287 @@ // limitations under the License. #include "paddle/fluid/operators/distributed/brpc_server.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/request_handler.h" namespace sendrecv { -typedef std::unordered_map +namespace distributed = paddle::operators::distributed; + +typedef std::unordered_map HandlerMap; class BRPCServiceImpl : public SendRecvService { public: - explicit BRPCServiceImpl(const HandlerMap& rpc_call_map) - : request_send_h_(nullptr), - request_get_h_(nullptr), - request_prefetch_h_(nullptr) { - auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); + explicit BRPCServiceImpl(const HandlerMap& rpc_call_map, + distributed::RPCServer* rpc_server) + : rpc_server_(rpc_server) { + VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size(); + auto it = rpc_call_map.find(distributed::kRequestSend); if (it != rpc_call_map.end()) { request_send_h_ = it->second; + send_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestSend))); } - it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); + it = rpc_call_map.find(distributed::kRequestGet); if (it != rpc_call_map.end()) { request_get_h_ = it->second; + get_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestGet))); } - it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch); + it = rpc_call_map.find(distributed::kRequestPrefetch); if (it != rpc_call_map.end()) { request_prefetch_h_ = it->second; + prefetch_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); + } + + it = rpc_call_map.find(distributed::kRequestCheckpoint); + if (it != rpc_call_map.end()) { + request_checkpoint_h_ = it->second; + checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); + } + + it = rpc_call_map.find(distributed::kRequestGetMonomerVariable); + if (it != rpc_call_map.end()) { + request_get_monomer_handler_h_ = it->second; + } + + it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier); + if (it != rpc_call_map.end()) { + request_get_monomer_barrier_handler_h_ = it->second; } } virtual ~BRPCServiceImpl() {} - void SendVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VoidMessage* response, google::protobuf::Closure* done) override { + send_threads_->Run( + [=] { _SendVariable(cntl_butil, request, response, done); }); + } + + void _SendVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) { PADDLE_ENFORCE(request_send_h_ != nullptr, "RequestSend handler should be registed first!"); brpc::ClosureGuard done_guard(done); - - paddle::framework::Scope* local_scope = request_send_h_->scope(); - paddle::framework::Variable* outvar = nullptr; - paddle::framework::Variable* invar = nullptr; + brpc::Controller* cntl = static_cast(cntl_butil); std::string varname = request->varname(); + VLOG(3) << "RequestSend var_name:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); - if (!request_send_h_->sync_mode()) { - local_scope = &request_send_h_->scope()->NewScope(); - invar = local_scope->Var(varname); - } else { - invar = local_scope->FindVar(varname); - } + distributed::BRPCVariableResponse resp(request_send_h_->scope(), + request_send_h_->dev_ctx(), + !request_send_h_->sync_mode()); + PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0, + "parse iobuf to tensor error!"); - request_send_h_->Handle(varname, local_scope, invar, &outvar); + auto scope = resp.GetMutableLocalScope(); + auto invar = resp.GetVar(); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; - if (!request_send_h_->sync_mode()) { - request_send_h_->scope()->DeleteScope(local_scope); - } + request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id); } void GetVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, google::protobuf::Closure* done) override { + get_threads_->Run( + [=] { _GetVariable(cntl_butil, request, response, done); }); + } + + void _GetVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VariableMessage* response, + google::protobuf::Closure* done) { PADDLE_ENFORCE(request_get_h_ != nullptr, "RequestGet handler should be registed first!"); - } + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + VLOG(3) << "RequestGet varname:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + auto scope = request_get_h_->scope(); + auto invar = scope->FindVar(varname); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; + + request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id); + + if (outvar) { + distributed::SerializeToIOBuf(varname, outvar, *request_get_h_->dev_ctx(), + response, &cntl->response_attachment(), "", + false); + } + } void PrefetchVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, google::protobuf::Closure* done) override { + prefetch_threads_->Run( + [=] { _PrefetchVariable(cntl_butil, request, response, done); }); + } + + void _PrefetchVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) { PADDLE_ENFORCE(request_prefetch_h_ != nullptr, "kRequestPrefetch handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + // prefetch process... + std::string in_var_name = request->varname(); + std::string out_var_name = request->out_varname(); + VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name + << ", out_var_name: " << out_var_name + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + distributed::BRPCVariableResponse resp( + request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true); + + PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0, + "parse iobuf to tensor error!"); + + auto scope = resp.GetMutableLocalScope(); + auto invar = scope->FindVar(in_var_name); + std::string table_name = request->table_name(); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = scope->Var(out_var_name); + + request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id, + out_var_name, table_name); + + distributed::SerializeToIOBuf(out_var_name, outvar, + *request_prefetch_h_->dev_ctx(), response, + &cntl->response_attachment(), "", true); + } + + void CheckpointNotify(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + checkpoint_notify_threads_->Run( + [=] { _CheckpointNotify(cntl_butil, request, response, done); }); + } + + void _CheckpointNotify(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE( + request_checkpoint_h_ != nullptr, + "kRequestCheckpointNotify handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(), + request_checkpoint_h_->dev_ctx()); + + auto scope = resp.GetMutableLocalScope(); + + std::string checkpoint_notify = request->varname(); + std::string checkpoint_dir = request->out_varname(); + int trainer_id = request->trainer_id(); + + VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify + << ", dir: " << checkpoint_dir + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr, + trainer_id, checkpoint_dir); + } + + void GetMonomerVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE( + request_get_monomer_handler_h_ != nullptr, + "kRequestGetMonomerVariable handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + // proc request. + std::string varname = request->varname(); + VLOG(3) << "GetMonomerVariable " << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + rpc_server_->WaitVarCond(varname); + distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); + + auto scope = h.scope_; + auto invar = scope->FindVar(varname); + paddle::framework::Variable* outvar = nullptr; + + request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar, + request->trainer_id()); + + if (outvar) { + distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response, + &cntl->response_attachment(), "", false); + } + } + + void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE( + request_get_monomer_barrier_handler_h_ != nullptr, + "RequestGetMonomerBarrier handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + rpc_server_->WaitVarCond(varname); + distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); + + paddle::framework::Scope* scope = nullptr; + paddle::framework::Variable* invar = nullptr; + paddle::framework::Variable* outvar = nullptr; + + request_get_monomer_barrier_handler_h_->Handle( + varname, scope, invar, &outvar, request->trainer_id()); } private: - paddle::operators::distributed::RequestHandler* request_send_h_; - paddle::operators::distributed::RequestHandler* request_get_h_; - paddle::operators::distributed::RequestHandler* request_prefetch_h_; + distributed::RequestHandler* request_send_h_{nullptr}; + distributed::RequestHandler* request_get_h_{nullptr}; + distributed::RequestHandler* request_prefetch_h_{nullptr}; + distributed::RequestHandler* request_checkpoint_h_{nullptr}; + distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; + distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr}; + + distributed::RPCServer* rpc_server_{nullptr}; + + // FIXME(gongwb): brpc should support process one rpce use one threadpool. + std::unique_ptr send_threads_; + std::unique_ptr get_threads_; + std::unique_ptr prefetch_threads_; + std::unique_ptr checkpoint_notify_threads_; }; } // namespace sendrecv @@ -100,7 +303,7 @@ namespace distributed { void AsyncBRPCServer::StartServer() { // Instance of your service. - sendrecv::BRPCServiceImpl service_impl(rpc_call_map_); + sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this); // Add the service into server. Notice the second parameter, because the // service is put on stack, we don't want server to delete it, otherwise @@ -111,6 +314,9 @@ void AsyncBRPCServer::StartServer() { } brpc::ServerOptions options; +#ifdef PADDLE_WITH_BRPC_RDMA + options.use_rdma = true; +#endif options.idle_timeout_sec = idle_timeout_s_; options.max_concurrency = max_concurrency_; if (server_.Start(bind_address_.c_str(), &options) != 0) { diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc_variable_response.cc new file mode 100644 index 0000000000..75306d7233 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_variable_response.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace distributed { + +namespace pb = ::google::protobuf; +using vr = ::sendrecv::VariableMessage; + +int BRPCVariableResponse::Parse(Source* source) { + pb::io::ZeroCopyInputStream* input_stream = source->contents(); + pb::io::CodedInputStream input(input_stream); + input.SetTotalBytesLimit(INT_MAX, INT_MAX); + + while (1) { + unsigned int tag = 0; + if (!input.ReadLittleEndian32(&tag)) { + break; + } + + uint64_t num_bytes = 0; + if (!input.ReadLittleEndian64(&num_bytes)) { + break; + } + + int field = static_cast(tag); + int ret = field == 0 ? -1 : field; + switch (field) { + case vr::kSerializedFieldNumber: { + if (!ProcSerializedField(field, &input, num_bytes)) { + return ret; + } + break; + } + case vr::kRowsFieldNumber: { + PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || + meta_.type() == sendrecv::LOD_TENSOR) && + meta_.varname() != "", + "meta info should be got first!"); + + if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { + return ret; + } + break; + } + default: { + PADDLE_ENFORCE(false, "not surpported %u fieldnumber", field); + return ret; + } + } + } + + return 0; +} +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc_variable_response.h new file mode 100644 index 0000000000..b0b91a42a0 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_variable_response.h @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/distributed/variable_response.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class BRPCSourceWrapper : public Source { + public: + explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {} + ::google::protobuf::io::ZeroCopyInputStream* contents() override { + return &source_; + } + + private: + butil::IOBufAsZeroCopyInputStream source_; +}; + +class BRPCVariableResponse : public VariableResponse { + public: + BRPCVariableResponse(const framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + bool create_scope = false) + : VariableResponse(scope, dev_ctx, create_scope) {} + + virtual ~BRPCVariableResponse() {} + + // parse attachment from iobuf + int Parse(Source* source) override; + int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) { + BRPCSourceWrapper wrapper(iobuf); + return VariableResponse::Parse(&wrapper, meta); + } +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index f14dfcdb23..78956c9ea4 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -293,8 +293,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); const std::string method = "SendMonomerFetchBarrierRPC"; - VarHandlePtr h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); + VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); s->Prepare(h, time_out); VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index 31fac2133c..1f797ea91d 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -32,13 +32,6 @@ namespace paddle { namespace operators { namespace distributed { -static void SerializeDestroyCallback(void* payload) { - if (payload != nullptr) { - auto* shared_payload = reinterpret_cast(payload); - delete shared_payload; - } -} - void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, ::grpc::ByteBuffer* msg, const std::string& out_name, diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index 45d1d3479c..8c7b7f1d7e 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -75,6 +75,10 @@ class RPCServer { void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, int thread_num = 5); + int GetThreadNum(const std::string& rpc_name) { + return rpc_thread_num_[rpc_name]; + } + // Wait util all the clients have reached the barrier for one // rpc method. This function should be called in the // RequestHandler if you want to run the server/client in a diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 6ba883ba01..5aadbcf220 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include // NOLINT #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/platform/port.h" @@ -45,7 +46,6 @@ static TensorPayload GetCommunicationAllocationFromTensor( memory::Copy(cuda_pinned, result->ptr(), boost::get(tensor.place()), tensor.data(), copy_size, gpu_dev_ctx.stream()); - ctx.Wait(); return TensorPayload(result); #else diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 523e56fe3e..1a32ffdbec 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -50,6 +50,13 @@ class TensorPayload final { size_t memory_size_; }; +inline void SerializeDestroyCallback(void* payload) { + if (payload != nullptr) { + auto* shared_payload = reinterpret_cast(payload); + delete shared_payload; + } +} + TensorPayload GetTensorPayload(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* request); diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 28bb90af56..3c0b7ff24f 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -2,9 +2,9 @@ include(operators) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) + set(DISTRIBUTE_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) + set(DISTRIBUTE_DEPS sendrecvop_rpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index ab92ad4506..20870ea07e 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -26,10 +26,11 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h" +#include "paddle/fluid/platform/profiler.h" -DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send"); -DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get"); -DEFINE_int32(rpc_prefetch_thread_num, 5, "number of threads for rpc prefetch"); +DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send"); +DEFINE_int32(rpc_get_thread_num, 12, "number of threads for rpc get"); +DEFINE_int32(rpc_prefetch_thread_num, 12, "number of threads for rpc prefetch"); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 58a3ca8272..0bf4bebbc9 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -58,7 +58,9 @@ class SendOp : public framework::OperatorBase { } if (sync_send) { for (size_t i = 0; i < rets.size(); i++) { + VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i]; PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i]; } } } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 74b4f2e937..d590c3a3c6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -81,6 +81,14 @@ bool IsCompiledWithCUDA() { #endif } +bool IsCompiledWithBrpc() { +#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA) + return true; +#else + return false; +#endif +} + bool IsCompiledWithDIST() { #ifdef PADDLE_WITH_DISTRIBUTE return true; @@ -631,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle. [](bool init_p2p) { framework::InitDevices(init_p2p); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); + m.def("is_compiled_with_brpc", IsCompiledWithBrpc); m.def("is_compiled_with_dist", IsCompiledWithDIST); #ifdef PADDLE_WITH_CUDA m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index e0bb0d1152..2dea71d7af 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -152,6 +152,7 @@ def __bootstrap__(): 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'cudnn_exhaustive_search', 'selected_gpus' ] + core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) From 96216052d5ab74c367f439555da982be43f5b3ba Mon Sep 17 00:00:00 2001 From: nhzlx Date: Fri, 14 Dec 2018 12:24:02 +0000 Subject: [PATCH 349/719] 1. fix trt multi thread bug --- .../inference/tensorrt/convert/op_converter.h | 2 + .../fluid/operators/tensorrt/CMakeLists.txt | 2 +- .../operators/tensorrt/tensorrt_engine_op.cc | 4 +- .../tensorrt/tensorrt_engine_op.cu.cc | 24 --- .../operators/tensorrt/tensorrt_engine_op.h | 172 ++++++++---------- .../tensorrt/tensorrt_engine_op_test.cc | 2 - 6 files changed, 81 insertions(+), 125 deletions(-) delete mode 100644 paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index d61d635ed7..91670ba8ac 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -103,6 +103,7 @@ class OpConverter { void ConvertBlock(const framework::proto::BlockDesc& block, const std::unordered_set& parameters, const framework::Scope& scope, TensorRTEngine* engine) { + std::unique_lock lk(mut_); for (int i = 0; i < block.ops_size(); i++) { const auto& op = block.ops(i); ConvertOp(op, parameters, scope, engine); @@ -125,6 +126,7 @@ class OpConverter { std::unordered_map converters_; // fluid inference scope framework::Scope* scope_{nullptr}; + std::mutex mut_; }; } // namespace tensorrt diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt index eee0b90fba..6b551d13f1 100644 --- a/paddle/fluid/operators/tensorrt/CMakeLists.txt +++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt @@ -1,5 +1,5 @@ op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter) -file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n") +file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n") nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc DEPS tensorrt_engine_op analysis) diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index 3cf2ce3c7e..f1ab59e397 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -21,8 +21,6 @@ namespace paddle { -DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); - namespace operators { class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { @@ -50,6 +48,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference { namespace ops = paddle::operators; REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, - ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); + ops::TensorRTEngineOpMaker); #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc deleted file mode 100644 index cbe1b426f6..0000000000 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - tensorrt_engine, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 6eef4c98c4..c19c315f79 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -27,8 +27,6 @@ namespace paddle { -DECLARE_int32(tensorrt_engine_batch_size); - namespace operators { using FluidDT = framework::proto::VarType_Type; @@ -49,7 +47,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) { return TRT_DT::kINT32; } -nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { +nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { PADDLE_ENFORCE_GT(shape.size(), 1UL, "TensorRT' tensor input requires at least 2 dimensions"); PADDLE_ENFORCE_LE(shape.size(), 4UL, @@ -63,131 +61,121 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { } // namespace // NOLINT using inference::Singleton; -using inference::tensorrt::TRT_EngineManager; +using inference::tensorrt::TensorRTEngine; + +class TensorRTEngineOp : public framework::OperatorBase { + private: + std::string engine_name_; + std::vector input_names_; + std::unordered_set param_names_; + mutable std::unique_ptr trt_engine_; + int max_batch_size_; + int workspace_size_; -class TensorRTEngineOp : public framework::OperatorWithKernel { public: - using framework::OperatorWithKernel::OperatorWithKernel; + TensorRTEngineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) { + engine_name_ = Attr("engine_uniq_key"); + input_names_ = Inputs("Xs"); + max_batch_size_ = Attr("max_batch_size"); + workspace_size_ = Attr("workspace_size"); + + auto params = Attr>("parameters"); + for (const auto ¶m : params) { + param_names_.insert(param); + } + } protected: - void InferShape(framework::InferShapeContext* ctx) const override {} - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input0 = ctx.Inputs("Xs").front(); - framework::OpKernelType kt = framework::OpKernelType( - framework::ToDataType(ctx.scope() - .FindVar(input0) - ->GetMutable() - ->type()), - ctx.GetPlace()); - return kt; + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + RunTrt(scope, dev_place); } -}; -template -class TensorRTEngineKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto engine_name = context.Attr("engine_uniq_key"); - int max_batch_size = context.Attr("max_batch_size"); - if (!Singleton::Global().HasEngine(engine_name)) { - Prepare(context); + void RunTrt(const framework::Scope &scope, + const platform::Place &dev_place) const { + int runtime_batch = 1; + if (trt_engine_.get() == nullptr) { + trt_engine_.reset(new TensorRTEngine( + max_batch_size_, workspace_size_, nullptr, + boost::get(dev_place).device)); + Prepare(scope, dev_place, trt_engine_.get()); } - auto* engine = Singleton::Global().Get(engine_name); - auto input_names = context.op().Inputs("Xs"); - PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); - PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size); + + auto *engine = trt_engine_.get(); + PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs"); std::vector output_maps = - context.Attr>("output_name_mapping"); + Attr>("output_name_mapping"); - auto params = context.Attr>("parameters"); - std::unordered_set parameters; - for (const auto& param : params) { - parameters.insert(param); - } // Convert input tensor from fluid to engine. - for (const auto& x : context.Inputs("Xs")) { - if (parameters.count(x)) continue; + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; // convert input and copy to TRT engine's buffer - auto& t = inference::analysis::GetFromScope( - context.scope(), x); + auto &t = + inference::analysis::GetFromScope(scope, x); + auto t_shape = framework::vectorize(t.dims()); + runtime_batch = t_shape[0]; if (platform::is_cpu_place(t.place())) { - engine->SetInputFromCPU(x, static_cast(t.data()), + engine->SetInputFromCPU(x, static_cast(t.data()), t.memory_size()); } else { - engine->SetInputFromGPU(x, static_cast(t.data()), + engine->SetInputFromGPU(x, static_cast(t.data()), t.memory_size()); } } + + PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); // Execute the engine. - PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0); - engine->Execute(FLAGS_tensorrt_engine_batch_size); + engine->Execute(runtime_batch); // Convert output tensor from engine to fluid int output_index = 0; VLOG(4) << "TensorRT Engine Op Outputs:"; - for (const auto& y : context.Outputs("Ys")) { + for (const auto &y : Outputs("Ys")) { VLOG(4) << y; // convert output and copy to fluid. - nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]); + nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]); auto dims = trt_t->getDimensions(); // Use the output ITensor's dims to reshape the Fluid Tensor. // The ITensor doesn't contain the batch size dim. std::vector ddim; - ddim.push_back(FLAGS_tensorrt_engine_batch_size); + ddim.push_back(runtime_batch); for (int i = 0; i < dims.nbDims; i++) { ddim.push_back(dims.d[i]); } - auto* fluid_v = context.scope().FindVar(y); + auto *fluid_v = scope.FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); - auto* fluid_t = fluid_v->GetMutable(); + auto *fluid_t = fluid_v->GetMutable(); fluid_t->Resize(framework::make_ddim(ddim)); - // TODO(Superjomn) find some way to determine which device to output the - // tensor. - // if (platform::is_cpu_place(fluid_t->place())) { // TODO(Superjomn) change this float to dtype size. - auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) * - FLAGS_tensorrt_engine_batch_size; + auto size = + inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch; engine->GetOutputInGPU( output_maps[output_index], fluid_t->mutable_data(platform::CUDAPlace( - boost::get(context.GetPlace()).device)), + boost::get(dev_place).device)), size * sizeof(float)); - output_index += 1; } cudaStreamSynchronize(*engine->stream()); } - protected: - void Prepare(const framework::ExecutionContext& context) const { + void Prepare(const framework::Scope &scope, const platform::Place &dev_place, + TensorRTEngine *engine) const { VLOG(4) << "Prepare engine"; - // Get the ProgramDesc and pass to convert. framework::proto::BlockDesc block_desc; - block_desc.ParseFromString(context.Attr("subgraph")); - int max_batch_size = context.Attr("max_batch_size"); - int workspace_size = context.Attr("workspace_size"); - - auto params = context.Attr>("parameters"); - std::unordered_set parameters; - for (const auto& param : params) { - parameters.insert(param); - } + block_desc.ParseFromString(Attr("subgraph")); std::vector output_maps = - context.Attr>("output_name_mapping"); - - // TODO(Superjomn) replace this with a different stream - auto* engine = Singleton::Global().Create( - max_batch_size, workspace_size, nullptr /*engine hold its own stream*/, - context.Attr("engine_uniq_key"), - boost::get(context.GetPlace()).device); + Attr>("output_name_mapping"); engine->InitNetwork(); @@ -195,39 +183,33 @@ class TensorRTEngineKernel : public framework::OpKernel { VLOG(4) << "parsed var size " << block.AllVars().size(); // Add inputs VLOG(4) << "declare inputs"; - for (auto& input : context.Inputs("Xs")) { - if (parameters.count(input)) continue; + for (auto &input : Inputs("Xs")) { + if (param_names_.count(input)) continue; VLOG(4) << "declare input " << input; - auto* var = block.FindVar(input); + + auto &t = + inference::analysis::GetFromScope(scope, input); + auto t_shape = framework::vectorize(t.dims()); + + auto *var = block.FindVar(input); // TensorRT engine need to create parameters. The parameter's description // should be set in PADDLE_ENFORCE(var, "no variable called %s", input); PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, "TensorRT engine only takes LoDTensor as input"); - auto shape = var->GetShape(); - // For the special batch_size placeholder -1, drop it and pass the real - // shape of data. - // TODO(Superjomn) fix this with batch broadcast, or it can't handle - // variational batch size. - if (shape[0] == -1) { - shape[0] = FLAGS_tensorrt_engine_batch_size; - } + engine->DeclareInput( input, FluidDataType2TRT( var->Proto()->type().lod_tensor().tensor().data_type()), - Vec2TRT_Dims(shape)); + Vec2TRT_Dims(t_shape)); } - inference::Singleton::Global() - .ConvertBlock(block_desc, parameters, context.scope(), engine); + .ConvertBlock(block_desc, param_names_, scope, engine); // Add outputs - for (auto& output : output_maps) { - if (!engine->HasDeclared(output)) { - engine->DeclareOutput(output); - } + for (auto &output : output_maps) { + engine->DeclareOutput(output); } - engine->FreezeNetwork(); } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 56bdd6c2f2..6f8adb00ed 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -24,8 +24,6 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" -USE_CUDA_ONLY_OP(tensorrt_engine); - namespace paddle { namespace operators { From 80766bcb829cab075ea7c1746d2493f6fc81b422 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 14 Dec 2018 11:51:33 +0000 Subject: [PATCH 350/719] enable act jitcode vexp, vrelu, vsigmoid and vtanh --- paddle/fluid/operators/jit/gen/CMakeLists.txt | 5 + paddle/fluid/operators/jit/gen/act.cc | 134 ++++++++ paddle/fluid/operators/jit/gen/act.h | 312 ++++++++++++++++++ 3 files changed, 451 insertions(+) create mode 100644 paddle/fluid/operators/jit/gen/act.cc create mode 100644 paddle/fluid/operators/jit/gen/act.h diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index ef74a7118b..2be750a4d8 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -15,3 +15,8 @@ USE_JITKERNEL_GEN(vadd) USE_JITKERNEL_GEN(vaddrelu) USE_JITKERNEL_GEN(vscal) USE_JITKERNEL_GEN(vaddbias) +USE_JITKERNEL_GEN(vrelu) +USE_JITKERNEL_GEN(videntity) +USE_JITKERNEL_GEN(vexp) +USE_JITKERNEL_GEN(vsigmoid) +USE_JITKERNEL_GEN(vtanh) diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc new file mode 100644 index 0000000000..f3332cbefa --- /dev/null +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -0,0 +1,134 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/act.h" +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f), + REPEAT_8TIMES(2.f), + REPEAT_8TIMES(0.5f), + REPEAT_8TIMES(EXP_HIG), + REPEAT_8TIMES(EXP_LOW), + REPEAT_8TIMES(CEPHES_LOG2EF), + REPEAT_8TIMES(CEPHES_EXP_C1), + REPEAT_8TIMES(CEPHES_EXP_C2), + REPEAT_8TIMES(CEPHES_EXP_P0), + REPEAT_8TIMES(CEPHES_EXP_P1), + REPEAT_8TIMES(CEPHES_EXP_P2), + REPEAT_8TIMES(CEPHES_EXP_P3), + REPEAT_8TIMES(CEPHES_EXP_P4), + REPEAT_8TIMES(CEPHES_EXP_P5), + REPEAT_8TIMES(EXP_MAX_INPUT), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)}; + +const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)}; +int g_tmp_mem[16] ALIGN32 = {0}; + +void VActJitCode::genCode() { + int offset = 0; + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + vmovups(ymm_src, ptr[param1 + offset]); + act(ymm_dst, ymm_src, type_); + vmovups(ptr[param2 + offset], ymm_dst); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + int rest = num_ % YMM_FLOAT_BLOCK; + while (rest > 0) { + int block = XMM_FLOAT_BLOCK; + if (rest >= 4) { + block = 4; + vmovups(xmm_src, ptr[param1 + offset]); + } else if (rest >= 2) { + block = 2; + vmovq(xmm_src, ptr[param1 + offset]); + } else { + block = 1; + vmovss(xmm_src, ptr[param1 + offset]); + } + act(xmm_dst, xmm_src, type_); + if (rest >= 4) { + vmovups(ptr[param2 + offset], xmm_dst); + } else if (rest >= 2) { + vmovq(ptr[param2 + offset], xmm_dst); + } else { + vmovss(ptr[param2 + offset], xmm_dst); + } + offset += sizeof(float) * block; + rest -= block; + } + ret(); +} + +#define DECLARE_ACT_CREATOR(name) \ + class name##Creator : public JitCodeCreator { \ + public: \ + bool UseMe(const int& attr) const override { \ + return platform::MayIUse(platform::avx); \ + } \ + size_t CodeSize(const int& d) const override; \ + std::unique_ptr CreateJitCode(const int& attr) const override { \ + return make_unique(attr, CodeSize(attr)); \ + } \ + } + +DECLARE_ACT_CREATOR(VRelu); +DECLARE_ACT_CREATOR(VIdentity); +DECLARE_ACT_CREATOR(VExp); +DECLARE_ACT_CREATOR(VSigmoid); +DECLARE_ACT_CREATOR(VTanh); + +// TODO(TJ): tuning use me +size_t VReluCreator::CodeSize(const int& d) const { + return 96 /* init size */ + + (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ * + 8 /* average bytes for each instruction */; +} + +size_t VIdentityCreator::CodeSize(const int& d) const { + return 96 + (d / YMM_FLOAT_BLOCK + 3) * 4 * 8; +} + +size_t VExpCreator::CodeSize(const int& d) const { + return 96 + (d / YMM_FLOAT_BLOCK + 3) * 70 * 8; +} + +size_t VSigmoidCreator::CodeSize(const int& d) const { + return 96 + (d / YMM_FLOAT_BLOCK + 3) * 82 * 8; +} + +size_t VTanhCreator::CodeSize(const int& d) const { + return 96 + (d / YMM_FLOAT_BLOCK + 3) * 84 * 8; +} + +#undef DECLARE_ACT_CREATOR + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(vrelu, gen::VReluCreator); +REGISTER_JITKERNEL_GEN(videntity, gen::VIdentityCreator); +REGISTER_JITKERNEL_GEN(vexp, gen::VExpCreator); +REGISTER_JITKERNEL_GEN(vsigmoid, gen::VSigmoidCreator); +REGISTER_JITKERNEL_GEN(vtanh, gen::VTanhCreator); diff --git a/paddle/fluid/operators/jit/gen/act.h b/paddle/fluid/operators/jit/gen/act.h new file mode 100644 index 0000000000..63dee7bc0d --- /dev/null +++ b/paddle/fluid/operators/jit/gen/act.h @@ -0,0 +1,312 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +extern const float exp_float_consts[]; +extern const int exp_int_0x7f[]; +extern int g_tmp_mem[]; + +#define ALIGN32 __attribute__((aligned(32))) +#define EXP_HIG 88.3762626647949f +#define EXP_LOW -88.3762626647949f +#define CEPHES_LOG2EF 1.44269504088896341 +#define CEPHES_EXP_C1 0.693359375 +#define CEPHES_EXP_C2 -2.12194440e-4 +#define CEPHES_EXP_P0 1.9875691500E-4 +#define CEPHES_EXP_P1 1.3981999507E-3 +#define CEPHES_EXP_P2 8.3334519073E-3 +#define CEPHES_EXP_P3 4.1665795894E-2 +#define CEPHES_EXP_P4 1.6666665459E-1 +#define CEPHES_EXP_P5 5.0000001201E-1 + +#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val + +#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float) + +class VActJitCode : public JitCode { + public: + explicit VActJitCode(int d, operand_type type, size_t code_size, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), num_(d), type_(type) { + if (!(type_ == operand_type::relu || type_ == operand_type::exp || + type_ == operand_type::sigmoid || type_ == operand_type::tanh || + type_ == operand_type::identity)) { + LOG(FATAL) << "Do not support this operand type: " << type_; + } + this->genCode(); + } + + const char* name() const override { + std::string base = "VActJitCode"; + switch (type_) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + return base.c_str(); + } + void genCode() override; + + protected: + // compute relu with ymm, xmm + template + void relu_jmm(JMM& dst, JMM& src, int zero_idx = 15) { // NOLINT + JMM zero = JMM(zero_idx); + vxorps(zero, zero, zero); + vmaxps(dst, src, zero); + } + + // compute exp with ymm, xmm + template + void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT + int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) { + using namespace platform; // NOLINT + // check all idx can not equal + JMM jmm_src = JMM(src_idx); + JMM jmm_fx = JMM(fx_idx); + JMM jmm_fy = JMM(fy_idx); + JMM jmm_mask = JMM(mask_idx); + JMM jmm_tmp = JMM(tmp_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + vmovaps(jmm_src, src); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); + vminps(jmm_src, jmm_src, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); + vmaxps(jmm_src, jmm_src, jmm_tmp); + // express exp(x) as exp(g + n*log(2)) + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); + vmulps(jmm_fx, jmm_src, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); + vaddps(jmm_fx, jmm_fx, jmm_tmp); + vroundps(jmm_fy, jmm_fx, 0x01); + // if greater, substract 1 + vcmpgtps(jmm_mask, jmm_fy, jmm_fx); + vmovaps(jmm_tmp, ptr[reg_ptr_global]); + vandps(jmm_mask, jmm_mask, jmm_tmp); + vsubps(jmm_fx, jmm_fy, jmm_mask); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); + vmulps(jmm_fy, jmm_fx, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); + JMM ymm_z = JMM(jmm_mask.getIdx()); + vmulps(ymm_z, jmm_fx, jmm_tmp); + vsubps(jmm_src, jmm_src, jmm_fy); + vsubps(jmm_src, jmm_src, ymm_z); + vmulps(ymm_z, jmm_src, jmm_src); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); + vmulps(dst, jmm_src, jmm_tmp); + for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; + i += (YMM_FLOAT_BLOCK * sizeof(float))) { + vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4 + vaddps(dst, dst, jmm_tmp); + vmulps(dst, dst, jmm_src); + } + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); + vaddps(dst, dst, jmm_tmp); + vmulps(dst, dst, ymm_z); + vaddps(dst, dst, jmm_src); + vmovaps(jmm_tmp, ptr[reg_ptr_global]); + vaddps(dst, dst, jmm_tmp); + // build 2^n + JMM ymm_int = jmm_fx; + vcvttps2dq(ymm_int, jmm_fx); + mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); + vmovdqa(jmm_tmp, ptr[reg_ptr_global]); + if (MayIUse(avx2) || std::is_same::value) { + vpaddd(ymm_int, ymm_int, jmm_tmp); + vpslld(ymm_int, ymm_int, 23); + } else if (MayIUse(avx)) { + xmm_t xtmp1 = xmm_t(ymm_int.getIdx()); + xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx()); + reg64_t reg_ptr_tmp = reg_ptr_global; + mov(reg_ptr_tmp, reinterpret_cast(g_tmp_mem)); + vmovdqa(ptr[reg_ptr_tmp], ymm_int); + vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], jmm_tmp); + vpaddd(xtmp1, xtmp1, xtmp2); + vpslld(xtmp1, xtmp1, 23); + vmovdqa(ptr[reg_ptr_tmp], xtmp1); + // next 128bits + vmovdqa(xtmp1, ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)]); + vmovdqa(xtmp2, ptr[reg_ptr_tmp + + (YMM_FLOAT_BLOCK + XMM_FLOAT_BLOCK) * sizeof(float)]); + vpaddd(xtmp1, xtmp1, xtmp2); + vpslld(xtmp1, xtmp1, 23); + vmovdqa(ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)], xtmp1); + // load out + vmovdqa(ymm_int, ptr[reg_ptr_tmp]); + } + vmulps(dst, dst, ymm_int); + pop(reg_ptr_global); + } + + // compute sigmoid with ymm, xmm + template + void sigmoid_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT + int fx_idx = 12, int fy_idx = 13, int mask_idx = 14, + int tmp_idx = 15) { + // y = 1 / (1 + e^-x) + JMM jmm_tmp = JMM(tmp_idx); + JMM jmm_src = JMM(src_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + vmovaps(jmm_src, src); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); + vminps(jmm_src, jmm_src, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); + vmaxps(jmm_src, jmm_src, jmm_tmp); + vxorps(jmm_tmp, jmm_tmp, jmm_tmp); + vsubps(jmm_src, jmm_tmp, jmm_src); + exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vaddps(dst, dst, jmm_tmp); + vdivps(dst, jmm_tmp, dst); + pop(reg_ptr_global); + } + + // compute tanh with ymm, xmm + template + void tanh_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT + int fx_idx = 12, int fy_idx = 13, int mask_idx = 14, + int tmp_idx = 15) { + // y = 2 / (1 + e^(-2x)) - 1 + JMM jmm_src = JMM(src_idx); + JMM jmm_tmp = JMM(tmp_idx); + JMM jmm_zero = JMM(mask_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + vmovaps(jmm_src, src); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); + vxorps(jmm_zero, jmm_zero, jmm_zero); + vsubps(jmm_tmp, jmm_zero, jmm_tmp); + vmulps(jmm_src, jmm_src, jmm_tmp); + exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vaddps(dst, dst, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); + vdivps(dst, jmm_tmp, dst); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vsubps(dst, dst, jmm_tmp); + pop(reg_ptr_global); + } + + // compute identity with ymm, xmm + template + void identity_jmm(JMM& dst, JMM& src, int zero_idx) { // NOLINT + JMM zero = JMM(zero_idx); + vxorps(zero, zero, zero); + vaddps(dst, src, zero); + // TODO(TJ): use below + // dst.setIdx(src.getIdx()); + } + + template + void act(JMM& dst, JMM& src, operand_type type) { // NOLINT + // use 11~15 + switch (type) { + case operand_type::relu: + relu_jmm(dst, src, 15); + break; + case operand_type::exp: + exp_jmm(dst, src, 11, 12, 13, 14, 15); + break; + case operand_type::sigmoid: + sigmoid_jmm(dst, src, 11, 12, 13, 14, 15); + break; + case operand_type::tanh: + tanh_jmm(dst, src, 11, 12, 13, 14, 15); + break; + case operand_type::identity: + identity_jmm(dst, src, 15); + break; + default: + LOG(FATAL) << "Do not support this operand type: " << type_; + break; + } + } + + protected: + int num_; + operand_type type_; + reg64_t param1{abi_param1}; + reg64_t param2{abi_param2}; + + xmm_t xmm_src = xmm_t(0); + ymm_t ymm_src = ymm_t(0); + + xmm_t xmm_dst = xmm_t(1); + ymm_t ymm_dst = ymm_t(1); +}; + +#define DECLARE_ACT_JITCODE(name, op_type) \ + class name##JitCode : public VActJitCode { \ + public: \ + explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \ + : VActJitCode(d, op_type, code_size, code_ptr) {} \ + }; + +DECLARE_ACT_JITCODE(VRelu, operand_type::relu); +DECLARE_ACT_JITCODE(VIdentity, operand_type::identity); +DECLARE_ACT_JITCODE(VExp, operand_type::exp); +DECLARE_ACT_JITCODE(VSigmoid, operand_type::sigmoid); +DECLARE_ACT_JITCODE(VTanh, operand_type::tanh); + +#undef DECLARE_ACT_JITCODE + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle From 514648665ae7093f4531579246e5fec7c1849f07 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Sat, 15 Dec 2018 08:01:27 +0000 Subject: [PATCH 351/719] fix trt_op test test=develop --- paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 6f8adb00ed..287b0edc96 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" +USE_NO_KERNEL_OP(tensorrt_engine); namespace paddle { namespace operators { From 2ebf12f340a82e1512f5f889d37b41e76b9eb3f7 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 16 Dec 2018 09:52:58 +0800 Subject: [PATCH 352/719] fix test=develop --- paddle/scripts/paddle_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 0fc43f33d0..a0da89d319 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -518,7 +518,7 @@ function assert_api_spec_approvals() { fi done - HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep const_cast || true` + HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true` if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` From 4e4a777243fe023f08959602b702bfec1babf469 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Sun, 16 Dec 2018 06:32:46 +0000 Subject: [PATCH 353/719] add conv+elementwiseadd pass test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../framework/ir/graph_pattern_detector.cc | 28 ++++++++++++++++++- .../framework/ir/graph_pattern_detector.h | 18 ++++++++++++ .../fluid/inference/api/paddle_pass_builder.h | 1 + 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index be4151b54b..b7f7e2ee8e 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -44,6 +44,7 @@ pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) +pass_library(conv_elementwise_add_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index bf12d12459..13d752e516 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -17,7 +17,6 @@ #include #include -#include "graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_traits.h" @@ -1210,6 +1209,33 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { return act_out; } +PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) { + conv_in->AsInput(); + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}); + conv_out->LinksFrom({conv_op}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + + return elementwise_add_out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 0fee2f1c18..eaedd9d08e 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -716,6 +716,24 @@ struct ConvElementwiseadd2Act : public PatternBase { PATTERN_DECL_NODE(act_out); }; +// Conv + ElementwiseAdd +// This pattern should be used after ConvElementwiseadd2Act or +// ConvElementwiseadd pass +struct ConvElementwiseadd : public PatternBase { + ConvElementwiseadd(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_elementwiseadd") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(conv_filter); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); + PATTERN_DECL_NODE(elementwise_add_out); +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index e6e7de2478..40ca0d287c 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -122,6 +122,7 @@ class GpuPassStrategy : public PassStrategy { "conv_bn_fuse_pass", // "conv_elementwise_add_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // }); } From f2b92d77b59f6bcb55f33ee69d640b9b9b77c348 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sun, 16 Dec 2018 20:02:44 +0800 Subject: [PATCH 354/719] remove clock time in WIN32 mode --- paddle/fluid/framework/async_executor.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index a82e941559..95c8472b2f 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -34,9 +34,13 @@ namespace paddle { namespace framework { inline double current_realtime() { +#if !defined(_WIN32) struct timespec tp; clock_gettime(CLOCK_REALTIME, &tp); return tp.tv_sec + tp.tv_nsec * 1e-9; +#else + return 0.0; +#endif } inline std::default_random_engine& local_random_engine() { From 66522046ad9c8659b80dc3be6d0c50c3d56f17fa Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sun, 16 Dec 2018 20:02:44 +0800 Subject: [PATCH 355/719] remove clock time in WIN32 mode test=develop --- paddle/fluid/framework/async_executor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 95c8472b2f..7accc4cb57 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -39,7 +39,7 @@ inline double current_realtime() { clock_gettime(CLOCK_REALTIME, &tp); return tp.tv_sec + tp.tv_nsec * 1e-9; #else - return 0.0; + return 0; #endif } From 4c0a769d1d70f2d4f86f3369d65eb2fb6bd6981f Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sun, 16 Dec 2018 20:16:14 +0800 Subject: [PATCH 356/719] avoid clock time in WIN32 mode test=develop --- paddle/fluid/framework/async_executor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 7accc4cb57..95c8472b2f 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -39,7 +39,7 @@ inline double current_realtime() { clock_gettime(CLOCK_REALTIME, &tp); return tp.tv_sec + tp.tv_nsec * 1e-9; #else - return 0; + return 0.0; #endif } From c0c9fcd9c721fdadfb8412e3e47d207bf5e4766e Mon Sep 17 00:00:00 2001 From: nhzlx Date: Sun, 16 Dec 2018 13:02:33 +0000 Subject: [PATCH 357/719] add source file test=develop --- .../ir/conv_elementwise_add_fuse_pass.cc | 91 +++++++++++++++++++ .../ir/conv_elementwise_add_fuse_pass.h | 33 +++++++ 2 files changed, 124 insertions(+) create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc new file mode 100644 index 0000000000..476c9dbc35 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); + +std::unique_ptr ConvElementwiseAddFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("conv2d", "Input") + ->AsInput(); + + patterns::ConvElementwiseadd pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string output_name = elementwise_add_out->Name(); + + std::string act_type = "identity"; + framework::OpDesc new_op_desc(base_op_desc, nullptr); + new_op_desc.SetType("conv2d_fusion"); + new_op_desc.SetInput("Bias", {bias_name}); + new_op_desc.SetInput("ResidualData", {}); + new_op_desc.SetAttr("activation", act_type); + new_op_desc.SetOutput("Output", {output_name}); + new_op_desc.SetAttr("is_test", true); + new_op_desc.SetAttr("use_cudnn", false); + new_op_desc.Flush(); + + // Create a new node for the fused op. + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, elementwise_add_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op}); + }; + + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add_fuse_pass, + paddle::framework::ir::ConvElementwiseAddFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h new file mode 100644 index 0000000000..f234603f58 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAddFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAddFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle From 6445cf1e91f6e9ac169f6834d4b3471136d9bd38 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 16 Dec 2018 21:22:03 +0800 Subject: [PATCH 358/719] fix test=develop --- python/paddle/fluid/tests/book/test_recognize_digits.py | 2 +- python/paddle/fluid/tests/book/test_word2vec.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 54936519ce..3b2c4af8ae 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -260,7 +260,7 @@ def inject_all_tests(): for use_cuda in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - for parallel in (False, True): + for parallel in (False, ): for nn_type in ('mlp', 'conv'): inject_test_method(use_cuda, parallel, nn_type, True) diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 08f70c9cab..e24a9aa989 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -250,7 +250,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel): for use_cuda in (False, True): for is_sparse in (False, True): - for is_parallel in (False, True): + for is_parallel in (False, ): inject_test_method(use_cuda, is_sparse, is_parallel) if __name__ == '__main__': From 8a6b53a4943f671b456a2ab20d85e62c83f56ed6 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sun, 16 Dec 2018 20:16:14 +0800 Subject: [PATCH 359/719] avoid clock time in WIN32 mode test=develop --- python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/requirements.txt b/python/requirements.txt index 5d64674fe0..36313333b2 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -9,4 +9,4 @@ Pillow nltk>=3.2.2 graphviz six -mpi4py=3.0.0 +mpi4py==3.0.0 From 7c1f3ad6eb18d40310e8933a937ef83b3342a532 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 14 Dec 2018 14:13:44 +0000 Subject: [PATCH 360/719] enable jitcode lstm --- paddle/fluid/operators/jit/README.md | 2 +- paddle/fluid/operators/jit/gen/CMakeLists.txt | 2 + paddle/fluid/operators/jit/gen/jitcode.h | 2 +- paddle/fluid/operators/jit/gen/lstm.cc | 142 ++++++++++++++++++ paddle/fluid/operators/jit/gen/lstm.h | 119 +++++++++++++++ paddle/fluid/operators/jit/test.cc | 10 +- 6 files changed, 268 insertions(+), 9 deletions(-) create mode 100644 paddle/fluid/operators/jit/gen/lstm.cc create mode 100644 paddle/fluid/operators/jit/gen/lstm.h diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index 28d21f40af..ce31f18b63 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -46,7 +46,7 @@ PaddlePaddle/Paddle/paddle/fluid/ - 在`KernelType` 中添加 `your_key` . - 实现Reference 的逻辑,每个jitkernel的Reference 实现是必须的。不要依赖任何第三方库。并在`refer/CmakeLists.txt`中`USE_JITKERNEL_REFER(your_key)`. - (optional) 实现更多的算法在`more`目录下,可以依赖mkl,openblas,或者mkldnn等第三方库。 -- (optional) 实现基于Xbyak的生成code,在`gen`目下。 +- (optional) 实现基于Xbyak的生成code,在`gen`目下。 jitcode需要实现自己的`JitCodeCreator`,并注册在KernelType上。 - 必要时可以添加新的`KernelTuples`,可以参考`XYZNTuples`,新加的Attr类型需要特例化`JitCodeKey`方法。 - 添加unit test,需要测试float和double - 添加benchmark确保get得到的速度是最快。 diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 2be750a4d8..81a6314bd2 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -20,3 +20,5 @@ USE_JITKERNEL_GEN(videntity) USE_JITKERNEL_GEN(vexp) USE_JITKERNEL_GEN(vsigmoid) USE_JITKERNEL_GEN(vtanh) +USE_JITKERNEL_GEN(lstmctht) +USE_JITKERNEL_GEN(lstmc1h1) diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index 64126e3f61..898d7df345 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -62,7 +62,7 @@ typedef enum { class JitCode : public GenBase, public Xbyak::CodeGenerator { public: explicit JitCode(size_t code_size, void* code_ptr = nullptr) - : Xbyak::CodeGenerator(code_size, code_ptr) {} + : Xbyak::CodeGenerator((code_size < 4096 ? 4096 : code_size), code_ptr) {} virtual const char* name() const = 0; virtual void genCode() = 0; diff --git a/paddle/fluid/operators/jit/gen/lstm.cc b/paddle/fluid/operators/jit/gen/lstm.cc new file mode 100644 index 0000000000..7e5a7773f8 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/lstm.cc @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/lstm.h" +#include // offsetof +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void LSTMJitCode::genCode() { + if (use_peephole_) { + preCode(); + } + reg64_t reg_ptr_gates = rax; + reg64_t reg_ptr_ct_1 = r9; + reg64_t reg_ptr_ct = r10; + reg64_t reg_ptr_ht = r11; + reg64_t reg_ptr_wp = r12; + mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); + mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); + mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); + mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); + if (use_peephole_) { + mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]); + } + + int offset = 0; + int d = num_ * sizeof(float); + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + /* gates: W_ch, W_ih, W_fh, W_oh */ + ymm_t ymm_c = ymm_t(0); + ymm_t ymm_i = ymm_t(1); + ymm_t ymm_f = ymm_t(2); + ymm_t ymm_o = ymm_t(3); + ymm_t ymm_ct_1 = ymm_t(4); + ymm_t ymm_wp0 = ymm_t(5); + ymm_t ymm_wp1 = ymm_t(6); + ymm_t ymm_wp2 = ymm_t(7); + vmovups(ymm_c, ptr[reg_ptr_gates + offset]); + vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]); + vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]); + vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]); + if (!compute_c1h1_) { + vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]); + } + if (use_peephole_) { + vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]); + vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]); + vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]); + } + /* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */ + // act_cand(c) + act(ymm_c, ymm_c, act_cand_); + // act_gate(i) or act_gate(ct_1 * wp0 + i) + if (!compute_c1h1_ && use_peephole_) { + vmulps(ymm_wp0, ymm_ct_1, ymm_wp0); + vaddps(ymm_i, ymm_i, ymm_wp0); + } + act(ymm_i, ymm_i, act_gate_); + vmulps(ymm_c, ymm_c, ymm_i); + if (!compute_c1h1_) { + // act_gate(f) or act_gate(ct_1 * wp1 + f) + if (use_peephole_) { + vmulps(ymm_wp1, ymm_ct_1, ymm_wp1); + vaddps(ymm_f, ymm_f, ymm_wp1); + } + act(ymm_f, ymm_f, act_gate_); + // ct + vmulps(ymm_f, ymm_f, ymm_ct_1); + vaddps(ymm_f, ymm_f, ymm_c); + } + /* H_t = act_cell(C_t) * act_gate(o) */ + // act_cell(C_t) + ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f; + ymm_t ymm_tmp = ymm_i; + act(ymm_tmp, ymm_ct, act_cell_); + // act_gate(o) or act_gate(ct * wp2 + o) + if (use_peephole_) { + vmulps(ymm_wp2, ymm_ct, ymm_wp2); + vaddps(ymm_o, ymm_o, ymm_wp2); + } + act(ymm_o, ymm_o, act_gate_); + // ht + vmulps(ymm_o, ymm_o, ymm_tmp); + // save ct and ht + vmovups(ptr[reg_ptr_ct + offset], ymm_ct); + vmovups(ptr[reg_ptr_ht + offset], ymm_o); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + + if (use_peephole_) { + postCode(); + } else { + ret(); + } +} + +#define DECLARE_LSTM_CREATOR(name) \ + class name##Creator : public JitCodeCreator { \ + public: \ + /* TODO(TJ): enable more */ \ + bool UseMe(const lstm_attr_t& attr) const override { \ + return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ + } \ + size_t CodeSize(const lstm_attr_t& attr) const override { \ + return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; \ + } \ + std::unique_ptr CreateJitCode( \ + const lstm_attr_t& attr) const override { \ + return make_unique(attr, CodeSize(attr)); \ + } \ + } + +DECLARE_LSTM_CREATOR(LSTMCtHt); +DECLARE_LSTM_CREATOR(LSTMC1H1); + +#undef DECLARE_LSTM_CREATOR + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(lstmctht, gen::LSTMCtHtCreator); +REGISTER_JITKERNEL_GEN(lstmc1h1, gen::LSTMC1H1Creator); diff --git a/paddle/fluid/operators/jit/gen/lstm.h b/paddle/fluid/operators/jit/gen/lstm.h new file mode 100644 index 0000000000..cb8705c6d9 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/lstm.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/act.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class LSTMJitCode : public VActJitCode { + public: + explicit LSTMJitCode(bool compute_c1h1, const lstm_attr_t& attr, + size_t code_size, void* code_ptr = nullptr) + : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size, + code_ptr), + compute_c1h1_(compute_c1h1) { + auto typeExchange = [](KernelType type) -> gen::operand_type { + if (type == KernelType::vsigmoid) { + return operand_type::sigmoid; + } else if (type == KernelType::vrelu) { + return operand_type::relu; + } else if (type == KernelType::vtanh) { + return operand_type::tanh; + } else if (type == KernelType::videntity) { + return operand_type::identity; + } else { + LOG(FATAL) << "Do not support this jit::KernelType: " << type; + } + return operand_type::identity; + }; + num_ = attr.d; + use_peephole_ = attr.use_peephole; + act_gate_ = typeExchange(attr.act_gate); + act_cand_ = typeExchange(attr.act_cand); + act_cell_ = typeExchange(attr.act_cell); + + this->genCode(); + } + + const char* name() const override { + std::string base = "LSTMJitCode"; + if (use_peephole_) { + base += "_Peephole"; + } + if (compute_c1h1_) { + base += "_C1H1"; + } + auto AddTypeStr = [&](operand_type type) { + switch (type) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + }; + AddTypeStr(act_gate_); + AddTypeStr(act_cand_); + AddTypeStr(act_cell_); + return base.c_str(); + } + void genCode() override; + + protected: + int num_; + bool compute_c1h1_; + bool use_peephole_; + operand_type act_gate_; + operand_type act_cand_; + operand_type act_cell_; + reg64_t param1{abi_param1}; +}; + +#define DECLARE_LSTM_JITCODE(name, compute_c1h1) \ + class name##JitCode : public LSTMJitCode { \ + public: \ + explicit name##JitCode(const lstm_attr_t& attr, size_t code_size, \ + void* code_ptr = nullptr) \ + : LSTMJitCode(compute_c1h1, attr, code_size, code_ptr) {} \ + }; + +DECLARE_LSTM_JITCODE(LSTMCtHt, false); +DECLARE_LSTM_JITCODE(LSTMC1H1, true); + +#undef DECLARE_LSTM_JITCODE + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index e211276d18..36f8eb6e7b 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -236,7 +236,7 @@ void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { } } // test result from Get function - VLOG(10) << "Test Get function "; + // VLOG(10) << "Test Get function "; auto tgt = jit::Get(attr); test(tgt, args...); } @@ -338,9 +338,6 @@ void TestLSTMKernel() { for (auto& act_gate : all_acts) { for (auto& act_cand : all_acts) { for (auto& act_cell : all_acts) { - std::string info = act_gate + act_cand + act_cell + - (use_peephole ? "peephole_" : "") + "size_" + - std::to_string(d); const jit::lstm_attr_t attr( d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand), jit::to_kerneltype(act_cell), use_peephole); @@ -370,7 +367,7 @@ void TestLSTMKernel() { step.checked = checked_data; } ref(&step, &attr); - + VLOG(10) << attr; TestAllImpls, PlaceType, std::vector, std::vector, std::vector, std::vector, std::vector>(attr, xsrc, wp, ct_1, ct_ref, ht_ref, @@ -390,7 +387,6 @@ void TestGRUKernel() { for (int d : TestSizes()) { for (auto& act_gate : all_acts) { for (auto& act_cand : all_acts) { - std::string info = act_gate + act_cand + "size_" + std::to_string(d); const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand)); auto ref = jit::GetRefer>(); @@ -409,7 +405,7 @@ void TestGRUKernel() { step.ht_1 = ht_1_data; step.ht = ht_ref_data; ref(&step, &attr); - + VLOG(10) << attr; TestAllImpls, PlaceType, std::vector, std::vector, std::vector>(attr, xsrc, ht_1, ht_ref, attr); From 3713d08d40840c1c9b3485a2cea3facce012d0d2 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Sun, 16 Dec 2018 15:34:20 +0000 Subject: [PATCH 361/719] enable jitcode gru --- paddle/fluid/operators/jit/gen/CMakeLists.txt | 3 + paddle/fluid/operators/jit/gen/gru.cc | 116 ++++++++++++++++++ paddle/fluid/operators/jit/gen/gru.h | 116 ++++++++++++++++++ 3 files changed, 235 insertions(+) create mode 100644 paddle/fluid/operators/jit/gen/gru.cc create mode 100644 paddle/fluid/operators/jit/gen/gru.h diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 81a6314bd2..8ad9587b5e 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -22,3 +22,6 @@ USE_JITKERNEL_GEN(vsigmoid) USE_JITKERNEL_GEN(vtanh) USE_JITKERNEL_GEN(lstmctht) USE_JITKERNEL_GEN(lstmc1h1) +USE_JITKERNEL_GEN(gruh1) +USE_JITKERNEL_GEN(gruhtpart1) +USE_JITKERNEL_GEN(gruhtpart2) diff --git a/paddle/fluid/operators/jit/gen/gru.cc b/paddle/fluid/operators/jit/gen/gru.cc new file mode 100644 index 0000000000..ec89880a0c --- /dev/null +++ b/paddle/fluid/operators/jit/gen/gru.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/gru.h" +#include // offsetof +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void GRUJitCode::genCode() { + reg64_t reg_ptr_gates = rax; + reg64_t reg_ptr_ht_1 = r9; + reg64_t reg_ptr_ht = r10; + mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]); + mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]); + mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]); + ymm_t ymm_one = ymm_t(0); + + if (id_ == 2) { + reg64_t reg_ptr_tmp = r11; + mov(reg_ptr_tmp, reinterpret_cast(exp_float_consts)); + vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]); + } + int offset = 0; + int d = num_ * sizeof(float); + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + ymm_t ymm_u = ymm_t(1); + ymm_t ymm_r = ymm_t(2); + ymm_t ymm_s = ymm_t(3); + ymm_t ymm_ht_1 = ymm_t(4); + // W: {W_update, W_reset; W_state} + if (id_ == 0 || id_ == 2) { + vmovups(ymm_u, ptr[reg_ptr_gates + offset]); + vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]); + } + if (id_ == 1) { + vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]); + } + if (id_ == 1 || id_ == 2) { + vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]); + } + + if (id_ == 0) { + // ht = act_gate(u) * act_cand(s) + act(ymm_u, ymm_u, act_gate_); + act(ymm_s, ymm_s, act_cand_); + vmulps(ymm_s, ymm_s, ymm_u); + vmovups(ptr[reg_ptr_ht + offset], ymm_s); + } else if (id_ == 1) { + // ht = act_gate(r) * ht_1 + act(ymm_r, ymm_r, act_gate_); + vmulps(ymm_r, ymm_r, ymm_ht_1); + vmovups(ptr[reg_ptr_ht + offset], ymm_r); + } else if (id_ == 2) { + // ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 + ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx()); + act(ymm_u, ymm_u, act_gate_); + act(ymm_s, ymm_s, act_cand_); + vmulps(ymm_s, ymm_s, ymm_u); + vsubps(ymm_u, ymm_one_inner, ymm_u); + vmulps(ymm_u, ymm_ht_1, ymm_u); + vaddps(ymm_u, ymm_s, ymm_u); + vmovups(ptr[reg_ptr_ht + offset], ymm_u); + } + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + ret(); +} + +#define DECLARE_GRU_CREATOR(name) \ + class name##Creator : public JitCodeCreator { \ + public: \ + /* TODO(TJ): enable more */ \ + bool UseMe(const gru_attr_t& attr) const override { \ + return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ + } \ + size_t CodeSize(const gru_attr_t& attr) const override { \ + return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; \ + } \ + std::unique_ptr CreateJitCode( \ + const gru_attr_t& attr) const override { \ + return make_unique(attr, CodeSize(attr)); \ + } \ + } + +DECLARE_GRU_CREATOR(GRUH1); +DECLARE_GRU_CREATOR(GRUHtPart1); +DECLARE_GRU_CREATOR(GRUHtPart2); + +#undef DECLARE_GRU_CREATOR + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(gruh1, gen::GRUH1Creator); +REGISTER_JITKERNEL_GEN(gruhtpart1, gen::GRUHtPart1Creator); +REGISTER_JITKERNEL_GEN(gruhtpart2, gen::GRUHtPart2Creator); diff --git a/paddle/fluid/operators/jit/gen/gru.h b/paddle/fluid/operators/jit/gen/gru.h new file mode 100644 index 0000000000..bab1c6a4ee --- /dev/null +++ b/paddle/fluid/operators/jit/gen/gru.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/act.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class GRUJitCode : public VActJitCode { + public: + explicit GRUJitCode(int id, const gru_attr_t& attr, size_t code_size, + void* code_ptr = nullptr) + : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size, + code_ptr), + id_(id) { + auto typeExchange = [](KernelType type) -> gen::operand_type { + if (type == KernelType::vsigmoid) { + return operand_type::sigmoid; + } else if (type == KernelType::vrelu) { + return operand_type::relu; + } else if (type == KernelType::vtanh) { + return operand_type::tanh; + } else if (type == KernelType::videntity) { + return operand_type::identity; + } else { + LOG(FATAL) << "Do not support this jit::KernelType: " << type; + } + return operand_type::identity; + }; + num_ = attr.d; + act_gate_ = typeExchange(attr.act_gate); + act_cand_ = typeExchange(attr.act_cand); + + this->genCode(); + } + + const char* name() const override { + std::string base = "GRUJitCode"; + if (id_ == 0) { + base += "_H1"; + } else if (id_ == 1) { + base += "_HtPart1"; + } else if (id_ == 2) { + base += "_HtPart2"; + } + auto AddTypeStr = [&](operand_type type) { + switch (type) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + }; + AddTypeStr(act_gate_); + AddTypeStr(act_cand_); + return base.c_str(); + } + void genCode() override; + + protected: + int id_; + int num_; + operand_type act_gate_; + operand_type act_cand_; + reg64_t param1{abi_param1}; +}; + +#define DECLARE_GRU_JITCODE(name, id) \ + class name##JitCode : public GRUJitCode { \ + public: \ + explicit name##JitCode(const gru_attr_t& attr, size_t code_size, \ + void* code_ptr = nullptr) \ + : GRUJitCode(id, attr, code_size, code_ptr) {} \ + }; + +DECLARE_GRU_JITCODE(GRUH1, 0); +DECLARE_GRU_JITCODE(GRUHtPart1, 1); +DECLARE_GRU_JITCODE(GRUHtPart2, 2); + +#undef DECLARE_GRU_JITCODE + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle From 29c772663a7905b64fec66e452d77cd6b7ec9449 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 17 Dec 2018 00:12:03 +0800 Subject: [PATCH 362/719] refine import path for ps_instance.py test=develop --- python/paddle/fluid/distributed/ps_instance.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index 6b44d0cd16..91f53102b6 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -11,8 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -import helper as dist_helper -import sys +from .helper import MPIHelper class PaddlePSInstance(object): @@ -26,7 +25,7 @@ class PaddlePSInstance(object): """ def __init__(self, server_worker_mode, proc_per_node): - self.dh = dist_helper.MPIHelper() + self.dh = MPIHelper() self._rankid = self.dh.get_rank() self._server_worker_mode = server_worker_mode self._proc_per_node = proc_per_node From 178c47c074ba0c1294dd8e0e8f38faa0a5e17ab3 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 17 Dec 2018 00:12:03 +0800 Subject: [PATCH 363/719] refine import path for ps_instance.py test=develop --- python/paddle/fluid/distributed/helper.py | 7 ++++--- python/requirements.txt | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index ca6dd5dabf..999c8d77b8 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from mpi4py import MPI import ps_pb2 as pslib @@ -59,7 +58,7 @@ class FileSystem(object): class MPIHelper(object): """ - MPIHelper is a wrapper of mpi4py, supprot get_rank get_size etc. + MPIHelper is a wrapper of mpi4py, support get_rank get_size etc. Args: No params Examples: @@ -68,7 +67,9 @@ class MPIHelper(object): """ def __init__(self): + from mpi4py import MPI self.comm = MPI.COMM_WORLD + self.MPI = MPI def get_rank(self): return self.comm.Get_rank() @@ -86,4 +87,4 @@ class MPIHelper(object): return socket.gethostname() def finalize(self): - MPI.Finalize() + self.MPI.Finalize() diff --git a/python/requirements.txt b/python/requirements.txt index 36313333b2..2f81d85df0 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -9,4 +9,3 @@ Pillow nltk>=3.2.2 graphviz six -mpi4py==3.0.0 From 43028f655d44eb524fc988a1645b993cefd08e6a Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 17 Dec 2018 00:12:03 +0800 Subject: [PATCH 364/719] refine import path for ps_instance.py test=develop --- python/paddle/fluid/distributed/helper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 999c8d77b8..cdde5403cd 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import ps_pb2 as pslib - class FileSystem(object): """ @@ -37,6 +35,7 @@ class FileSystem(object): assert user != None assert passwd != None assert hadoop_bin != None + import ps_pb2 as pslib self.fs_client = pslib.FsClientParameter() #if fs_type == "afs": # fs_client.fs_type = pslib.FsApiType.AFS From 921b7f452a2a4bf26f3aa288365401c35719903f Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 17 Dec 2018 10:24:46 +0800 Subject: [PATCH 365/719] add API.spec test=develop --- paddle/fluid/API.spec | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 26113ee7e9..e156945147 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -38,7 +38,15 @@ paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], va paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)) +paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) From fcde2b2725566a9cde0c8930d2e80e6a044d6784 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 17 Dec 2018 10:29:59 +0800 Subject: [PATCH 366/719] add ForRangeIn --- paddle/fluid/operators/optimizers/adam_op.h | 7 ++- paddle/fluid/platform/for_range.h | 55 +++++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 5870557bb7..e8b977e2d9 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -359,14 +359,17 @@ class AdamOpKernel : public framework::OpKernel { param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, grad_merge.rows().size(), lazy_mode); if (lazy_mode) { + std::vector id_vector; size_t row_count = grad_merge.rows().size(); for (size_t row_index = 0; row_index < row_count; ++row_index) { for (size_t offset = 0; offset < row_numel; ++offset) { size_t i = rows[row_index] * row_numel + offset; - T g = grad_data[row_index * row_numel + offset]; - functor.adam_update(i, g); + id_vector.push_back(i); } } + platform::ForRangeIn for_range_in( + static_cast(ctx.device_context()), id_vector); + for_range_in(functor); } else { platform::ForRange for_range( static_cast(ctx.device_context()), diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index c153e80fe4..9fbaa36723 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -13,11 +13,38 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include + +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { namespace platform { +template +struct ForRangeIn { + ForRangeIn(const DeviceContext& dev_ctx, std::vector range); + + template + void operator()(Function func) const; +}; + +template <> +struct ForRangeIn { + ForRangeIn(const CPUDeviceContext& dev_ctx, std::vector range) + : range_(range) {} + + template + void operator()(Function func) const { + for (auto i : range_) { + func(i); + } + } + + std::vector range_; +}; + template struct ForRange { ForRange(const DeviceContext& dev_ctx, size_t limit); @@ -79,6 +106,34 @@ struct ForRange { int limit_; }; +template +__global__ static void ForRangeInElemwiseOp(Function func, T* vector, + int vector_size) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < vector_size) { + func(vector[idx]); + } +} + +template <> +struct ForRangeIn { + ForRange(const CUDADeviceContext& dev_ctx, std::vector range) + : dev_ctx_(dev_ctx), range_(range) {} + + template + inline void operator()(Function func) const { + constexpr int num_threads = 1024; + int block_size = range_.size() <= num_threads ? limit_ : num_threads; + int grid_size = (range_.size() + num_threads - 1) / num_threads; + + ForRangeInElemwiseOp<<>>( + func, range_.data(), range_.size()); + } + + const CUDADeviceContext& dev_ctx_; + framework::Vector range_; +}; + #endif } // namespace platform From 01dd9061a007dea207d304d1099cfe012a47fcb7 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 17 Dec 2018 10:42:21 +0800 Subject: [PATCH 367/719] add avx support for windows test=develop --- CMakeLists.txt | 2 - paddle/fluid/operators/math/cpu_vec.h | 3 - .../math/detail/activation_functions.h | 6 +- .../operators/math/detail/avx_functions.cc | 4 +- .../fluid/operators/math/detail/avx_mathfun.h | 731 ++++++++++++++++++ paddle/fluid/operators/math/jit_code.cc | 39 +- paddle/fluid/operators/math/jit_code.h | 1 - .../operators/math/jit_kernel_crf_decode.cc | 7 +- .../operators/math/jit_kernel_layer_norm.cc | 7 +- 9 files changed, 757 insertions(+), 43 deletions(-) create mode 100644 paddle/fluid/operators/math/detail/avx_mathfun.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1594e798a2..653ae4ffe5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,8 +131,6 @@ if (APPLE OR WIN32) endif() if (WIN32) - set(WITH_AVX OFF CACHE STRING - "Disable AVX when compiling for Windows" FORCE) set(WITH_DSO OFF CACHE STRING "Disable DSO when compiling for Windows" FORCE) set(WITH_MKL OFF CACHE STRING diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index e1e4d168db..57726956cf 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -18,9 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" -#ifdef __AVX__ -#include -#endif #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h index 2b3d38d95a..24df1f93ed 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -15,14 +15,10 @@ limitations under the License. */ #pragma once #include #include - +#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" -#ifdef __AVX__ -#include -#endif - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/fluid/operators/math/detail/avx_functions.cc index 5641f91452..022ffc5337 100644 --- a/paddle/fluid/operators/math/detail/avx_functions.cc +++ b/paddle/fluid/operators/math/detail/avx_functions.cc @@ -14,10 +14,8 @@ limitations under the License. */ #ifdef __AVX__ -#include #include "paddle/fluid/operators/math/detail/activation_functions.h" -// TODO(qingqing) refine this dependence -#include "paddle/legacy/cuda/src/avx_mathfun.h" +#include "paddle/fluid/operators/math/detail/avx_mathfun.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/detail/avx_mathfun.h b/paddle/fluid/operators/math/detail/avx_mathfun.h new file mode 100644 index 0000000000..d7cf91134e --- /dev/null +++ b/paddle/fluid/operators/math/detail/avx_mathfun.h @@ -0,0 +1,731 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* + AVX implementation of sin, cos, sincos, exp and log + + Based on "sse_mathfun.h", by Julien Pommier + http://gruntthepeon.free.fr/ssemath/ + + Copyright (C) 2012 Giovanni Garberoglio + Interdisciplinary Laboratory for Computational Science (LISC) + Fondazione Bruno Kessler and University of Trento + via Sommarive, 18 + I-38123 Trento (Italy) + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +#include "paddle/fluid/platform/cpu_info.h" + +/* __m128 is ugly to write */ +typedef __m256 v8sf; // vector of 8 float (avx) +typedef __m256i v8si; // vector of 8 int (avx) +typedef __m128i v4si; // vector of 8 int (avx) + +#define _PI32AVX_CONST(Name, Val) \ + static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = {Val, Val, \ + Val, Val} + +_PI32AVX_CONST(1, 1); +_PI32AVX_CONST(inv1, ~1); +_PI32AVX_CONST(2, 2); +_PI32AVX_CONST(4, 4); + +/* declare some AVX constants -- why can't I figure a better way to do that? */ +#define _PS256_CONST(Name, Val) \ + static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PI32_CONST256(Name, Val) \ + static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PS256_CONST_TYPE(Name, Type, Val) \ + static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} + +_PS256_CONST(1, 1.0f); +_PS256_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST256(0, 0); +_PI32_CONST256(1, 1); +_PI32_CONST256(inv1, ~1); +_PI32_CONST256(2, 2); +_PI32_CONST256(4, 4); +_PI32_CONST256(0x7f, 0x7f); + +_PS256_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS256_CONST(cephes_log_p0, 7.0376836292E-2); +_PS256_CONST(cephes_log_p1, -1.1514610310E-1); +_PS256_CONST(cephes_log_p2, 1.1676998740E-1); +_PS256_CONST(cephes_log_p3, -1.2420140846E-1); +_PS256_CONST(cephes_log_p4, +1.4249322787E-1); +_PS256_CONST(cephes_log_p5, -1.6668057665E-1); +_PS256_CONST(cephes_log_p6, +2.0000714765E-1); +_PS256_CONST(cephes_log_p7, -2.4999993993E-1); +_PS256_CONST(cephes_log_p8, +3.3333331174E-1); +_PS256_CONST(cephes_log_q1, -2.12194440e-4); +_PS256_CONST(cephes_log_q2, 0.693359375); + +#ifndef __AVX2__ + +typedef union imm_xmm_union { + v8si imm; + v4si xmm[2]; +} imm_xmm_union; + +#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ + { \ + imm_xmm_union ALIGN32_BEG u ALIGN32_END; \ + u.imm = imm_; \ + xmm0_ = u.xmm[0]; \ + xmm1_ = u.xmm[1]; \ + } + +#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ + { \ + imm_xmm_union ALIGN32_BEG u ALIGN32_END; \ + u.xmm[0] = xmm0_; \ + u.xmm[1] = xmm1_; \ + imm_ = u.imm; \ + } + +#define AVX2_BITOP_USING_SSE2(fn) \ + static inline v8si avx2_mm256_##fn(v8si x, int a) { \ + /* use SSE2 instruction to perform the bitop AVX2 */ \ + v4si x1, x2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + x1 = _mm_##fn(x1, a); \ + x2 = _mm_##fn(x2, a); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return (ret); \ + } + +//#warning "Using SSE2 to perform AVX2 bitshift ops" +AVX2_BITOP_USING_SSE2(slli_epi32) +AVX2_BITOP_USING_SSE2(srli_epi32) + +#define AVX2_INTOP_USING_SSE2(fn) \ + static inline v8si avx2_mm256_##fn(v8si x, v8si y) { \ + /* use SSE2 instructions to perform the AVX2 integer operation */ \ + v4si x1, x2; \ + v4si y1, y2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + COPY_IMM_TO_XMM(y, y1, y2); \ + x1 = _mm_##fn(x1, y1); \ + x2 = _mm_##fn(x2, y2); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return (ret); \ + } + +//#warning "Using SSE2 to perform AVX2 integer ops" +AVX2_INTOP_USING_SSE2(and_si128) +AVX2_INTOP_USING_SSE2(andnot_si128) +AVX2_INTOP_USING_SSE2(cmpeq_epi32) +AVX2_INTOP_USING_SSE2(sub_epi32) +AVX2_INTOP_USING_SSE2(add_epi32) +#define avx2_mm256_and_si256 avx2_mm256_and_si128 +#define avx2_mm256_andnot_si256 avx2_mm256_andnot_si128 +#else +#define avx2_mm256_slli_epi32 _mm256_slli_epi32 +#define avx2_mm256_srli_epi32 _mm256_srli_epi32 +#define avx2_mm256_and_si256 _mm256_and_si256 +#define avx2_mm256_andnot_si256 _mm256_andnot_si256 +#define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32 +#define avx2_mm256_sub_epi32 _mm256_sub_epi32 +#define avx2_mm256_add_epi32 _mm256_add_epi32 +#endif /* __AVX2__ */ + +/* natural logarithm computed for 8 simultaneous float + return NaN for x <= 0 +*/ +v8sf log256_ps(v8sf x) { + v8si imm0; + v8sf one = *(v8sf *)_ps256_1; + + // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); + v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); + + x = _mm256_max_ps( + x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */ + + // can be done with AVX2 + imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23); + + /* keep only the fractional part */ + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); + x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); + + // this is again another AVX2 instruction + imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); + v8sf e = _mm256_cvtepi32_ps(imm0); + + e = _mm256_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); + v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); + v8sf tmp = _mm256_and_ps(x, mask); + x = _mm256_sub_ps(x, one); + e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); + x = _mm256_add_ps(x, tmp); + + v8sf z = _mm256_mul_ps(x, x); + + v8sf y = *(v8sf *)_ps256_cephes_log_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); + y = _mm256_mul_ps(y, x); + + y = _mm256_mul_ps(y, z); + + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); + y = _mm256_add_ps(y, tmp); + + tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); + y = _mm256_sub_ps(y, tmp); + + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); + x = _mm256_add_ps(x, y); + x = _mm256_add_ps(x, tmp); + x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS256_CONST(exp_hi, 88.3762626647949f); +_PS256_CONST(exp_lo, -88.3762626647949f); + +_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS256_CONST(cephes_exp_C1, 0.693359375); +_PS256_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); + +v8sf exp256_ps(v8sf x) { + v8sf tmp = _mm256_setzero_ps(), fx; + v8si imm0; + v8sf one = *(v8sf *)_ps256_1; + + x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); + x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); + fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); + + /* how to perform a floorf with SSE: just below */ + // imm0 = _mm256_cvttps_epi32(fx); + // tmp = _mm256_cvtepi32_ps(imm0); + + tmp = _mm256_floor_ps(fx); + + /* if greater, substract 1 */ + // v8sf mask = _mm256_cmpgt_ps(tmp, fx); + v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); + mask = _mm256_and_ps(mask, one); + fx = _mm256_sub_ps(tmp, mask); + + tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); + v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); + x = _mm256_sub_ps(x, tmp); + x = _mm256_sub_ps(x, z); + + z = _mm256_mul_ps(x, x); + + v8sf y = *(v8sf *)_ps256_cephes_exp_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, x); + y = _mm256_add_ps(y, one); + + /* build 2^n */ + imm0 = _mm256_cvttps_epi32(fx); + // another two AVX2 instructions + imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); + imm0 = avx2_mm256_slli_epi32(imm0, 23); + v8sf pow2n = _mm256_castsi256_ps(imm0); + y = _mm256_mul_ps(y, pow2n); + return y; +} + +_PS256_CONST(minus_cephes_DP1, -0.78515625); +_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS256_CONST(sincof_p0, -1.9515295891E-4); +_PS256_CONST(sincof_p1, 8.3321608736E-3); +_PS256_CONST(sincof_p2, -1.6666654611E-1); +_PS256_CONST(coscof_p0, 2.443315711809948E-005); +_PS256_CONST(coscof_p1, -1.388731625493765E-003); +_PS256_CONST(coscof_p2, 4.166664568298827E-002); +_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + +/* evaluation of 8 sines at onces using AVX intrisics + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + +*/ +v8sf sin256_ps(v8sf x) { // any x + v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; + v8si imm0, imm2; + +#ifndef __AVX2__ + v4si imm0_1, imm0_2; + v4si imm2_1, imm2_2; +#endif + + sign_bit = x; + /* take the absolute value */ + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); + +/* + Here we start a series of integer operations, which are in the + realm of AVX2. + If we don't have AVX, let's perform them using SSE2 directives +*/ + +#ifdef __AVX2__ + /* store the integer part of y in mm0 */ + imm2 = _mm256_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + // another two AVX2 instruction + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); + y = _mm256_cvtepi32_ps(imm2); + + /* get the swap sign flag */ + imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); + imm0 = avx2_mm256_slli_epi32(imm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4= 256 diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index e2b4761435..6d22bf6757 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -47,7 +47,6 @@ extern const float exp_float_consts[]; extern const int exp_int_0x7f[]; extern int g_tmp_mem[]; -#define ALIGN32 __attribute__((aligned(32))) #define EXP_HIG 88.3762626647949f #define EXP_LOW -88.3762626647949f #define CEPHES_LOG2EF 1.44269504088896341 diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc index eeb305a88b..ac2d29f1c1 100644 --- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc +++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc @@ -16,9 +16,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" -#ifdef __AVX__ -#include -#endif namespace paddle { namespace operators { @@ -133,8 +130,8 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { /* AVX instructions.*/ \ __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); \ __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); \ - __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0); \ - __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1); \ + __m128i lo_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 0); \ + __m128i hi_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 1); \ lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); \ hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); \ lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); \ diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc index cb49e66488..e21092037a 100644 --- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc +++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc @@ -13,9 +13,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" -#ifdef __AVX__ -#include -#endif namespace paddle { namespace operators { @@ -121,7 +118,7 @@ class LayerNormKernelImpl : public LayerNormKernel { if (rest_ != 0) { \ j = offset + this->num_ - block; \ tmp = _mm256_loadu_ps((const float*)x + j); \ - tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec); \ + tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \ sum = _mm256_add_ps(sum, tmp); \ } \ hi = _mm256_extractf128_ps(sum, 1); \ @@ -145,7 +142,7 @@ class LayerNormKernelImpl : public LayerNormKernel { j = offset + this->num_ - block; \ tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \ tmp = _mm256_mul_ps(tmp, tmp); \ - tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec); \ + tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \ sum = _mm256_add_ps(sum, tmp); \ } \ hi = _mm256_extractf128_ps(sum, 1); \ From e196fa367bc6087f08bfce44bdc194ed426c69cf Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 17 Dec 2018 10:52:05 +0800 Subject: [PATCH 368/719] update ut, test=develop --- .../unittests/test_nce_remote_table_op.py | 271 ++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py new file mode 100644 index 0000000000..f08b270d89 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -0,0 +1,271 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import signal +import time +import unittest +from multiprocessing import Process + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import Program, program_guard + + +def run_pserver(pserver_id, use_cuda, sync_mode): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create table parameter in scope + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # create and initialize Param Variable + param = scope.var('table').get_tensor() + + param_array = np.ones((5, 8)).astype("float32") + for i in range(len(param_array)): + param_array[i] *= param_array[i] * i + pserver_id * 10 + 1 + param.set(param_array, place) + + optimize_block = program._create_block(program.global_block().idx) + program.global_block().append_op( + type="listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs={ + "optimize_blocks": [optimize_block], + "endpoint": '127.0.0.1:0', + "Fanin": 1, + "sync_mode": True, + "grad_to_block_id": [] + }) + + exe = fluid.Executor(place) + exe.run(program) + + +class TestListenAndServOp(unittest.TestCase): + def setUp(self): + self.ps_timeout = 5 + + def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func): + p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode)) + p.daemon = True + p.start() + return p + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def _get_pserver_port(self, pid): + with open("/tmp/paddle.%d.port" % pid, 'r') as f: + port = int(f.read().strip()) + return port + + def _run_nce_op_one_pserver(self, place, port): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port)] + table_names = ['table'] + height_sections = [2] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i != 3: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), 0).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def _run_nce_op_two_pserver(self, place, port0, port1): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] + table_names = ['table', 'table'] + height_sections = [2, 3] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i < 2: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), i + 9).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def test_nce_op_remote(self): + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + # run pserver on CPU in sync mode + p0 = self._start_pserver(0, False, True, run_pserver) + self._wait_ps_ready(p0.pid) + port0 = self._get_pserver_port(p0.pid) + + p1 = self._start_pserver(1, False, True, run_pserver) + self._wait_ps_ready(p1.pid) + port1 = self._get_pserver_port(p1.pid) + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + self._run_nce_op_one_pserver(place, port0) + self._run_nce_op_two_pserver(place, port0, port1) + + # raise SIGTERM to pserver + os.kill(p0.pid, signal.SIGINT) + p0.join() + os.kill(p1.pid, signal.SIGINT) + p1.join() + + +if __name__ == '__main__': + unittest.main() From 5553c0b0da5884a062e5b7b136c30eb12a7d4d6b Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 17 Dec 2018 10:24:46 +0800 Subject: [PATCH 369/719] add API.spec test=develop --- paddle/fluid/API.spec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e156945147..fe2ee3f98d 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -37,14 +37,14 @@ paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=Non paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')) paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)) paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)) paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) From 763e8fdf02ebe00b845680b264b7a5c6a56b61ae Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 17 Dec 2018 11:17:10 +0800 Subject: [PATCH 370/719] fix compile error --- paddle/fluid/platform/for_range.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index 9fbaa36723..a767bf9299 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -117,17 +117,18 @@ __global__ static void ForRangeInElemwiseOp(Function func, T* vector, template <> struct ForRangeIn { - ForRange(const CUDADeviceContext& dev_ctx, std::vector range) + ForRangeIn(const CUDADeviceContext& dev_ctx, std::vector range) : dev_ctx_(dev_ctx), range_(range) {} template inline void operator()(Function func) const { constexpr int num_threads = 1024; - int block_size = range_.size() <= num_threads ? limit_ : num_threads; + int range_size = range_.size(); + int block_size = range_size <= num_threads ? range_size : num_threads; int grid_size = (range_.size() + num_threads - 1) / num_threads; ForRangeInElemwiseOp<<>>( - func, range_.data(), range_.size()); + func, range_.data(), range_size); } const CUDADeviceContext& dev_ctx_; From e439257ef7880e9b0b19d7b0c7ef8965fc180279 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 17 Dec 2018 11:24:19 +0800 Subject: [PATCH 371/719] Fix include style test=develop --- paddle/fluid/framework/tensor.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 6ddc07af9a..6a1cbe5cd5 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -14,15 +14,14 @@ limitations under the License. */ #pragma once -#include #include #include #include #include #include - #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" From d519fd69441be6bc1bf1d921d23144094a90bfb8 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 17 Dec 2018 11:38:54 +0800 Subject: [PATCH 372/719] test=develop --- paddle/fluid/platform/cpu_info.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 55dba545ff..70224f9467 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -16,6 +16,32 @@ limitations under the License. */ #include +#ifdef _WIN32 +#if defined(__AVX2__) +#include //avx2 +#elif defined(__AVX__) +#include //avx +#endif // AVX +#else // WIN32 +#ifdef __AVX__ +#include +#endif +#endif // WIN32 + +#if defined(_WIN32) +#define ALIGN32_BEG __declspec(align(32)) +#define ALIGN32_END +#else +#define ALIGN32_BEG +#define ALIGN32_END __attribute__((aligned(32))) +#endif // _WIN32 + +#if defined(_WIN32) +#if defined(__AVX2__) || defined(__AVX__) +inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } +#endif +#endif + namespace paddle { namespace platform { From bd0067b26cd899b63bbf314aa3ba5f3ac22327e6 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 17 Dec 2018 12:39:33 +0800 Subject: [PATCH 373/719] Polish code test=develop --- paddle/fluid/operators/math/selected_rows_functor.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index b87c9461e8..0d63f641c8 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -275,7 +275,8 @@ struct MergeAdd { void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input, - framework::SelectedRows* output) { + framework::SelectedRows* output, + const bool sorted_result = false) { framework::Vector input_rows(input.rows()); if (input_rows.size() == 0) { return; @@ -313,7 +314,8 @@ struct MergeAdd { void operator()(const platform::CUDADeviceContext& context, const std::vector& inputs, - framework::SelectedRows* output) { + framework::SelectedRows* output, + const bool sorted_result = false) { if (inputs.size() == 0) { VLOG(3) << "no input! return"; return; From fd144954ed06128bab0b8b99cdb6722cc52881ba Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 17 Dec 2018 13:13:53 +0800 Subject: [PATCH 374/719] redefine api test=develop --- paddle/fluid/API.spec | 1 - .../fluid/framework/details/build_strategy.cc | 4 +- .../fluid/framework/details/build_strategy.h | 2 + .../framework/details/execution_strategy.h | 2 +- .../details/parallel_ssa_graph_executor.cc | 1 - paddle/fluid/framework/ir/node.h | 1 - paddle/fluid/framework/parallel_executor.cc | 29 +++++----- paddle/fluid/pybind/pybind.cc | 43 +++++++------- .../unittests/parallel_executor_test_base.py | 41 +++++++------- .../unittests/test_parallel_executor_mnist.py | 56 +++++++++++-------- .../test_parallel_executor_seresnext.py | 10 ++-- .../test_parallel_executor_transformer.py | 4 +- 12 files changed, 101 insertions(+), 93 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index da3b2f6347..8e6482ca98 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -26,7 +26,6 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) -paddle.fluid.ExecutionStrategy.ExecutorType.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy.ExecutorType, arg0: int) -> None paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index d8526b3f24..e9688ea276 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -26,7 +26,9 @@ namespace framework { namespace details { static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { - return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1); + return (!strategy.enable_sequential_execution_ && + strategy.num_trainers_ > 1) || + strategy.enable_parallel_graph_; } class ParallelExecutorPassBuilder : public ir::PassBuilder { diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index c97be16957..f66ecd80f1 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -73,6 +73,8 @@ struct BuildStrategy { bool fuse_broadcast_op_{false}; + bool enable_parallel_graph_{false}; + int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index d3d5b6bf54..15c496130c 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -20,7 +20,7 @@ namespace framework { namespace details { struct ExecutionStrategy { - enum ExecutorType { kDefault = 0, kExperimental = 1, kParallelGraph = 2 }; + enum ExecutorType { kDefault = 0, kExperimental = 1 }; size_t num_threads_{0}; bool use_cuda_{true}; diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 214c2f7625..845c4379e6 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -29,7 +29,6 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( graphs_(std::move(graphs)) { PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); // do not use threadpool for each graph execution. - strategy_.num_threads_ = 1UL; for (size_t i = 0; i < places.size(); ++i) { executors_.emplace_back(new details::ThreadedSSAGraphExecutor( strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d2a393b3f1..10ae3a1c74 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -49,7 +49,6 @@ class Node { public: virtual ~Node() { if (!wrapper_.empty()) { - VLOG(4) << "ir::Node deleting a wrapper node " << Name(); wrapper_deleter_(); } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 63f3ef0eac..152b9b2702 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -199,7 +199,7 @@ ParallelExecutor::ParallelExecutor( "the number of places must be greater than 1."); } - if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + if (build_strategy.enable_parallel_graph_) { PADDLE_ENFORCE( member_->use_all_reduce_, "build_strategy.reduce should be `AllReduce` if you want to use" @@ -231,7 +231,7 @@ ParallelExecutor::ParallelExecutor( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); ncclUniqueId *nccl_id = nullptr; - if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + if (build_strategy.enable_parallel_graph_) { // parallel graph mode should initialize nccl by ncclCommInitRank since // it call nccl operator per device per thread. if (nccl_id_var == nullptr) { @@ -265,7 +265,7 @@ ParallelExecutor::ParallelExecutor( // ncclOp std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + if (build_strategy.enable_parallel_graph_) { for (size_t i = 0; i < member_->places_.size(); ++i) { std::unique_ptr graph = build_strategy.Apply( main_program, {member_->places_[i]}, loss_var_name, params, @@ -287,9 +287,8 @@ ParallelExecutor::ParallelExecutor( #endif auto max_memory_size = GetEagerDeletionThreshold(); - // TODO(Yancey1989): fix gc failed on ParallelGraph executor. - if (max_memory_size >= 0 && - exec_strategy.type_ != ExecutionStrategy::kParallelGraph) { + // TODO(Yancey1989): fix gc failed on ParallelGraph strategy. + if (max_memory_size >= 0 && !build_strategy.enable_parallel_graph_) { graphs[0] = member_->PrepareGCAndRefCnts( std::move(graphs[0]), static_cast(max_memory_size)); } @@ -323,18 +322,20 @@ ParallelExecutor::ParallelExecutor( } } - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); - } else if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { + if (build_strategy.enable_parallel_graph_) { member_->executor_.reset(new details::ParallelSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, std::move(graphs))); } else { - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); + } else { + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); + } } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1fa91114a8..866a5137de 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -761,11 +761,6 @@ All parameter, weight, gradient are variables in Paddle. )DOC"); - py::enum_(exec_strategy, "ExecutorType") - .value("Default", ExecutionStrategy::ExecutorType::kDefault) - .value("Experimental", ExecutionStrategy::ExecutorType::kExperimental) - .value("ParallelGraph", ExecutionStrategy::ExecutorType::kParallelGraph); - exec_strategy.def(py::init()) .def_property( "num_threads", @@ -823,25 +818,17 @@ All parameter, weight, gradient are variables in Paddle. [](const ExecutionStrategy &self) { return self.dry_run_; }, [](ExecutionStrategy &self, bool dry_run) { self.dry_run_ = dry_run; - }) - .def_property( - "executor_type", - [](const ExecutionStrategy &self) { return self.type_; }, - [](ExecutionStrategy &self, ExecutionStrategy::ExecutorType type) { - self.type_ = type; - }, - R"DOC(The type is ExecutorType which is the enum ranging from Default, -ParallelGraph and Experiment: - -Default: Compile the main_program into a multi-devices graph, - and execute this graph on multi-devices with multiple threads which - specified by build_strategy.num_threads. -ParallelGraph: Compile the main_program into multiple graphs, and execute each of the graphs on one - device with one thread. Please note, this mode only supports all-reduce mode and use_cuda=True. - This approach can achieve better performance in some scenarios. -Experimental: Compile the main_program into a multi-devices graph, - and executor this graph with a faster execution mode than the Default, - this approach is on the experiments.)DOC"); + }); + + exec_strategy.def_property( + "use_experimental_executor", + [](const ExecutionStrategy &self) { + return self.type_ == ExecutionStrategy::kExperimental; + }, + [](ExecutionStrategy &self, bool experimental) { + self.type_ = experimental ? ExecutionStrategy::kExperimental + : ExecutionStrategy::kDefault; + }); py::class_ build_strategy(pe, "BuildStrategy", R"DOC( BuildStrategy allows the user to more preciously control how to @@ -964,6 +951,14 @@ Experimental: Compile the main_program into a multi-devices graph, R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. Default False)DOC") + .def_property( + "enable_parallel_graph", + [](const BuildStrategy &self) { return self.enable_parallel_graph_; }, + [](BuildStrategy &self, bool b) { self.enable_parallel_graph_ = b; }, + R"DOC(The type is BOOL, if set True, ParallelExecutor would build the main_program into multiple graphs, + each of the graphs would run with one device. This approach can achieve better performance in + some scenarios. Please note, this approach only supports all-reduce mode + on GPU device)DOC") .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(true); diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 73b8fb74fa..4e50614515 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -26,26 +26,24 @@ import sys __all__ = ['TestParallelExecutorBase'] -ExecutorType = fluid.ExecutionStrategy().ExecutorType - class TestParallelExecutorBase(unittest.TestCase): - def check_network_convergence( - self, - method, - use_cuda=True, - memory_opt=True, - iter=50, - batch_size=None, - allow_op_delay=False, - feed_dict=None, - seed=None, - use_parallel_executor=True, - use_reduce=False, - fuse_elewise_add_act_ops=False, - optimizer=fluid.optimizer.Adam, - exec_type=fluid.ExecutionStrategy().ExecutorType.Default, - enable_sequential_execution=False): + def check_network_convergence(self, + method, + use_cuda=True, + memory_opt=True, + iter=50, + batch_size=None, + allow_op_delay=False, + feed_dict=None, + seed=None, + use_parallel_executor=True, + use_reduce=False, + use_parallel_graph=False, + fuse_elewise_add_act_ops=False, + optimizer=fluid.optimizer.Adam, + use_fast_executor=False, + enable_sequential_execution=False): def run_executor(exe, feed, fetch_list, program=None): if isinstance(exe, fluid.ParallelExecutor): res = exe.run(fetch_list=fetch_list, feed=feed) @@ -61,8 +59,8 @@ class TestParallelExecutorBase(unittest.TestCase): startup = fluid.Program() startup.random_seed = 1 # Fix random seed main.random_seed = 1 - scope = fluid.Scope() - with fluid.scope_guard(scope): + self.scope = fluid.Scope() + with fluid.scope_guard(self.scope): with fluid.program_guard(main, startup): if seed is not None: startup.random_seed = seed @@ -80,13 +78,14 @@ class TestParallelExecutorBase(unittest.TestCase): startup_exe.run(startup) exec_strategy = fluid.ExecutionStrategy() exec_strategy.allow_op_delay = allow_op_delay - exec_strategy.executor_type = exec_type + exec_strategy.use_experimental_executor = use_fast_executor build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops build_strategy.enable_sequential_execution = enable_sequential_execution + build_strategy.enable_parallel_graph = use_parallel_graph if use_cuda and core.is_compiled_with_cuda(): build_strategy.remove_unnecessary_lock = True diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index fffe8bee58..c8ac6a90c1 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -20,7 +20,7 @@ import numpy as np import paddle.fluid.core as core import os import paddle.fluid as fluid -from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType +from parallel_executor_test_base import TestParallelExecutorBase def simple_fc_net(use_feed): @@ -79,30 +79,32 @@ class TestMNIST(TestParallelExecutorBase): return img, label = self._init_data() - + """ all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_reduce=False) + """ reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_reduce=True) - + """ for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) for loss in zip(all_reduce_last_loss, reduce_last_loss): self.assertAlmostEqual(loss[0], loss[1], delta=1e-4) + """ # simple_fc def check_simple_fc_convergence(self, use_cuda, use_reduce=False, - exec_type=ExecutorType.Default): + use_parallel_graph=False): if use_cuda and not core.is_compiled_with_cuda(): return @@ -114,20 +116,24 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_reduce=use_reduce, - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) - def test_simple_fc(self): + def notest_simple_fc(self): # use_cuda - self.check_simple_fc_convergence(True, ExecutorType.Default) - self.check_simple_fc_convergence(True, ExecutorType.ParallelGraph) + if core.is_compiled_with_cuda(): + self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence( + True, use_reduce=False, use_parallel_graph=True) self.check_simple_fc_convergence(False) - def test_simple_fc_with_new_strategy(self): + def notest_simple_fc_with_new_strategy(self): # use_cuda, use_reduce self._compare_reduce_and_allreduce(simple_fc_net, True) self._compare_reduce_and_allreduce(simple_fc_net, False) - def check_simple_fc_parallel_accuracy(self, use_cuda, exec_type): + def check_simple_fc_parallel_accuracy(self, + use_cuda, + use_parallel_graph=False): if use_cuda and not core.is_compiled_with_cuda(): return @@ -140,7 +146,7 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_parallel_executor=False, - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) parallel_first_loss, parallel_last_loss = self.check_network_convergence( method=simple_fc_net, seed=1, @@ -148,7 +154,7 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_parallel_executor=True, - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) self.assertAlmostEquals( np.mean(parallel_first_loss), @@ -157,17 +163,20 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals( np.mean(parallel_last_loss), single_last_loss, delta=1e-6) - def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(True, ExecutorType.Default) - self.check_simple_fc_parallel_accuracy(True, ExecutorType.ParallelGraph) + def notest_simple_fc_parallel_accuracy(self): + if core.is_compiled_with_cuda(): + self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy( + True, use_parallel_graph=True) # FIXME(Yancey1989): ParallelGraph executor type support CPU mode - self.check_simple_fc_parallel_accuracy(False, ExecutorType.Default) + self.check_simple_fc_parallel_accuracy(False) - def check_batchnorm_fc_convergence(self, use_cuda, exec_type): + def check_batchnorm_fc_convergence(self, + use_cuda, + use_fast_executor, + use_parallel_graph=False): if use_cuda and not core.is_compiled_with_cuda(): return - if not use_cuda and exec_type == ExecutorType.ParallelGraph: - return img, label = self._init_data() @@ -176,13 +185,14 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - exec_type=exec_type) + use_fast_executor=use_fast_executor, + use_parallel_graph=use_parallel_graph) def test_batchnorm_fc(self): for use_cuda in (False, True): - for exec_type in (ExecutorType.Default, ExecutorType.Experimental, - ExecutorType.ParallelGraph): - self.check_batchnorm_fc_convergence(use_cuda, exec_type) + for use_fast_executor in (False, True): + self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) + self.check_batchnorm_fc_convergence(use_cuda, False, True) def test_batchnorm_fc_with_new_strategy(self): # FIXME(zcd): close this test temporally. diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index bada38894f..531c99a835 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -19,7 +19,7 @@ import paddle.fluid.layers.ops as ops from paddle.fluid.initializer import init_on_cpu from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter import paddle.fluid.core as core -from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType +from parallel_executor_test_base import TestParallelExecutorBase import unittest import math import os @@ -282,7 +282,7 @@ class TestResnet(TestParallelExecutorBase): use_reduce=False, iter=20, delta2=1e-6, - exec_type=ExecutorType.Default, + use_parallel_graph=False, lr_scale=1.0): if use_cuda and not core.is_compiled_with_cuda(): return @@ -303,7 +303,7 @@ class TestResnet(TestParallelExecutorBase): use_reduce=use_reduce, optimizer=optimizer(), use_parallel_executor=False, - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) parallel_first_loss, parallel_last_loss = self.check_network_convergence( model, feed_dict={"image": img, @@ -313,7 +313,7 @@ class TestResnet(TestParallelExecutorBase): use_cuda=use_cuda, use_reduce=use_reduce, optimizer=optimizer(lr_scale=lr_scale), - exec_type=exec_type) + use_parallel_graph=use_parallel_graph) self.assertAlmostEquals( np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6) @@ -327,7 +327,7 @@ class TestResnet(TestParallelExecutorBase): self._check_resnet_convergence( model=SE_ResNeXt50Small, use_cuda=True, - exec_type=ExecutorType.ParallelGraph, + use_parallel_graph=True, lr_scale=core.get_cuda_device_count()) self._check_resnet_convergence( model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 8a1a3ab3ca..c3ac9d92b4 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -17,7 +17,7 @@ from __future__ import print_function import paddle.fluid as fluid import transformer_model import numpy as np -from parallel_executor_test_base import TestParallelExecutorBase, ExecutorType +from parallel_executor_test_base import TestParallelExecutorBase import unittest import paddle import paddle.fluid.core as core @@ -175,6 +175,8 @@ class TestTransformer(TestParallelExecutorBase): self.check_network_convergence(transformer, use_cuda=True) self.check_network_convergence( transformer, use_cuda=True, enable_sequential_execution=True) + self.check_network_convergence( + transformer, use_cuda=True, use_parallel_graph=True) self.check_network_convergence(transformer, use_cuda=False, iter=5) From 728e7e88fb2c3467f6e28ef968b4e720d290b26c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 17 Dec 2018 13:37:57 +0800 Subject: [PATCH 375/719] Use xxHash as scope's hash algorithm test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/scope.cc | 2 +- paddle/fluid/framework/scope.h | 26 ++++++++++++++++++++------ python/paddle/fluid/profiler.py | 2 +- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cea4a44857..5dca5ac598 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -82,7 +82,7 @@ cc_test(variable_test SRCS variable_test.cc) cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(scope SRCS scope.cc DEPS glog threadpool) +cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash) cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index b1abe75d76..4f79d98260 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -201,7 +201,7 @@ void Scope::RenameInternal(const std::string& origin_name, auto new_it = vars_.find(new_name); PADDLE_ENFORCE(new_it == vars_.end(), "The variable with name %s is already in the scope", new_name); - vars_[new_name].reset(origin_it.value().release()); + vars_[new_name].reset(origin_it->second.release()); vars_.erase(origin_it); } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index b232d267db..77ef18414d 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -14,15 +14,18 @@ limitations under the License. */ #pragma once +extern "C" { +#include +} + #include #include #include #include +#include #include #include -#include // NOLINT - #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -35,6 +38,14 @@ bool IsFastEagerDeletionModeEnabled(); class Scope; +namespace inner { +struct KeyHasher { + std::size_t operator()(const std::string& key) const { + return XXH32(key.c_str(), key.size(), 1); + } +}; +} // namespace inner + /** * @brief Scope that manage all variables. * @@ -99,11 +110,14 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable tsl::robin_map< - std::string, std::unique_ptr, std::hash, - std::equal_to, - std::allocator>>, true> + mutable std::unordered_map, + inner::KeyHasher> vars_; + // mutable tsl::robin_map< + // std::string, std::unique_ptr, std::hash, + // std::equal_to, + // std::allocator>>, true> + // vars_; private: // Call Scope::NewScope for a sub-scope. diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 8df2e01b03..78f7a6ac08 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -93,7 +93,7 @@ def cuda_profiler(output_file, output_mode=None, config=None): with open(config_file, 'wb') as fp: fp.writelines([six.b("%s\n" % item) for item in config]) #Comment this for nvprof - #core.nvprof_init(output_file, output_mode, config_file) + core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. core.nvprof_start() yield From 4de1a8bd9d55469f0612cf8f60b749681a5d657c Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 17 Dec 2018 14:15:27 +0800 Subject: [PATCH 376/719] Remove unused cmake log test=develop --- cmake/external/python.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 52ad02a355..623c53f4f7 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -79,6 +79,5 @@ IF(PYTHONINTERP_FOUND) "please use pip to upgrade protobuf. pip install -U protobuf") ENDIF() ENDIF(PYTHONINTERP_FOUND) -message(STATUS ${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) From a7d6b1f92141f398cb442e3f5eee99d3ac156265 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 17 Dec 2018 14:17:26 +0800 Subject: [PATCH 377/719] code cleanup test=develop --- .../framework/details/computation_op_handle.h | 1 - .../details/multi_devices_graph_pass.cc | 1 + .../scope_buffered_ssa_graph_executor.cc | 2 ++ .../details/threaded_ssa_graph_executor.h | 1 - paddle/fluid/framework/details/var_handle.cc | 2 +- paddle/fluid/framework/ir/node.h | 1 + .../unittests/test_parallel_executor_dry_run.py | 10 ++++------ .../unittests/test_parallel_executor_mnist.py | 17 +++++++++-------- 8 files changed, 18 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 5b8b70c564..601ae4f8c6 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -17,7 +17,6 @@ #include #include -#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 5b82805ad9..2ab7da2d57 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -134,6 +134,7 @@ static const char kParams[] = "params"; static const char kLocalScopes[] = "local_scopes"; static const char kStrategy[] = "strategy"; static const char kNumTrainers[] = "num_trainers"; +static const char kNumLossScaled[] = "num_loss_scaled"; void MultiDevSSAGraphBuilder::Init() const { all_vars_.clear(); diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index edb7b5e70a..f432079087 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -41,10 +41,12 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( Scope &local_scope = scope->NewScope(); *scope->Var(details::kLocalExecScopeName)->GetMutable() = &local_scope; + for (auto &info : var_infos_) { if (scope->FindVar(info.name_) != nullptr) { continue; } + if (info.persistable_) { // Persistable InitializeVariable(scope->Var(info.name_), info.type_); } else { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index b45afbc046..24da56c09e 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -24,7 +24,6 @@ #include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc index 7de6025a28..30da029ca2 100644 --- a/paddle/fluid/framework/details/var_handle.cc +++ b/paddle/fluid/framework/details/var_handle.cc @@ -20,7 +20,7 @@ namespace details { VarHandleBase::~VarHandleBase() {} -VarHandle::~VarHandle() { VLOG(5) << "deleting var handle " << DebugString(); } +VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); } std::string VarHandle::DebugString() const { std::stringstream ss; diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 10ae3a1c74..d2a393b3f1 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -49,6 +49,7 @@ class Node { public: virtual ~Node() { if (!wrapper_.empty()) { + VLOG(4) << "ir::Node deleting a wrapper node " << Name(); wrapper_deleter_(); } } diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py index eff76ce0d4..18d95c94ad 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py @@ -17,8 +17,6 @@ import unittest import logging import six -ExecutorType = fluid.ExecutionStrategy().ExecutorType - class TestBase(unittest.TestCase): def main(self, @@ -26,7 +24,7 @@ class TestBase(unittest.TestCase): iter=10, iter_per_pe=10, use_gpu=True, - exec_type=ExecutorType.Default): + use_experimental_executor=False): if use_gpu and not fluid.core.is_compiled_with_cuda(): logging.warning( "Paddle is not compiled with CUDA, skip GPU unittests") @@ -45,7 +43,7 @@ class TestBase(unittest.TestCase): for _ in six.moves.xrange(iter): exe_strategy = fluid.ExecutionStrategy() exe_strategy._dry_run = True - exe_strategy.executor_type = exec_type + exe_strategy.use_experimental_executor = use_experimental_executor pe = fluid.ParallelExecutor( use_cuda=use_gpu, loss_name=loss.name, @@ -58,11 +56,11 @@ class TestBase(unittest.TestCase): class TestMNISTDryRun(TestBase): def test_mnist_dry_run(self): for use_gpu in (False, True): - for exec_type in (ExecutorType.Default, ExecutorType.Experimental): + for use_experimental_executor in (False, True): self.main( network_func=TestMNISTDryRun.network_func, use_gpu=use_gpu, - exec_type=exec_type) + use_experimental_executor=use_experimental_executor) @staticmethod def network_func(): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index c8ac6a90c1..7d2349fad4 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -79,26 +79,25 @@ class TestMNIST(TestParallelExecutorBase): return img, label = self._init_data() - """ + all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_reduce=False) - """ + reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_reduce=True) - """ + for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) for loss in zip(all_reduce_last_loss, reduce_last_loss): self.assertAlmostEqual(loss[0], loss[1], delta=1e-4) - """ # simple_fc def check_simple_fc_convergence(self, @@ -118,7 +117,7 @@ class TestMNIST(TestParallelExecutorBase): use_reduce=use_reduce, use_parallel_graph=use_parallel_graph) - def notest_simple_fc(self): + def test_simple_fc(self): # use_cuda if core.is_compiled_with_cuda(): self.check_simple_fc_convergence(True) @@ -126,7 +125,7 @@ class TestMNIST(TestParallelExecutorBase): True, use_reduce=False, use_parallel_graph=True) self.check_simple_fc_convergence(False) - def notest_simple_fc_with_new_strategy(self): + def test_simple_fc_with_new_strategy(self): # use_cuda, use_reduce self._compare_reduce_and_allreduce(simple_fc_net, True) self._compare_reduce_and_allreduce(simple_fc_net, False) @@ -163,7 +162,7 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals( np.mean(parallel_last_loss), single_last_loss, delta=1e-6) - def notest_simple_fc_parallel_accuracy(self): + def test_simple_fc_parallel_accuracy(self): if core.is_compiled_with_cuda(): self.check_simple_fc_parallel_accuracy(True) self.check_simple_fc_parallel_accuracy( @@ -192,7 +191,9 @@ class TestMNIST(TestParallelExecutorBase): for use_cuda in (False, True): for use_fast_executor in (False, True): self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) - self.check_batchnorm_fc_convergence(use_cuda, False, True) + + self.check_batchnorm_fc_convergence( + use_cuda=True, use_fast_executor=False, use_parallel_graph=True) def test_batchnorm_fc_with_new_strategy(self): # FIXME(zcd): close this test temporally. From 24eb8f038c9e7c5fddac7ebd5ececd0459e2250d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 17 Dec 2018 15:03:17 +0800 Subject: [PATCH 378/719] Fix bug test=develop --- paddle/fluid/operators/optimizers/adam_op.h | 46 ++++++++++++++------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index c9e27b7547..fa51c66eeb 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -279,26 +279,42 @@ struct SparseAdamFunctor { T beta1_pow = *beta1_pow_; T beta2_pow = *beta2_pow_; lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); - for (size_t i = 0U, j = 0U; i != numel; ++i) { - T mom1 = moment1_[i]; - T mom2 = moment2_[i]; - T p = param_[i]; + size_t row_count = numel / row_numel_; - // Calculation + for (size_t i = 0U, j = 0U; i != row_count; ++i) { if (i == *(rows_ + j)) { - T g = grad_[j * row_numel_]; - mom1 = beta1_ * mom1 + (1 - beta1_) * g; - mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + for (size_t k = 0U; k != row_numel_; ++k) { + T mom1 = moment1_[i * row_numel_ + k]; + T mom2 = moment2_[i * row_numel_ + k]; + T p = param_[i * row_numel_ + k]; + + T g = grad_[j * row_numel_ + k]; + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + // Write back to global memory + moment1_out_[i * row_numel_ + k] = mom1; + moment2_out_[i * row_numel_ + k] = mom2; + param_out_[i * row_numel_ + k] = p; + } ++j; } else { - mom1 = beta1_ * mom1; - mom2 = beta2_ * mom2; + for (size_t k = 0U; k != row_numel_; ++k) { + T mom1 = moment1_[i * row_numel_ + k]; + T mom2 = moment2_[i * row_numel_ + k]; + T p = param_[i * row_numel_ + k]; + + mom1 = beta1_ * mom1; + mom2 = beta2_ * mom2; + + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + // Write back to global memory + moment1_out_[i * row_numel_ + k] = mom1; + moment2_out_[i * row_numel_ + k] = mom2; + param_out_[i * row_numel_ + k] = p; + } } - p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); - // Write back to global memory - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - param_out_[i] = p; } } }; From 41456e172384667de0b8b272fa5714c96a73a297 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 17 Dec 2018 15:19:17 +0800 Subject: [PATCH 379/719] Remove the useless definition test=develop --- paddle/fluid/platform/cpu_info.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 70224f9467..c70e3be858 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -36,12 +36,6 @@ limitations under the License. */ #define ALIGN32_END __attribute__((aligned(32))) #endif // _WIN32 -#if defined(_WIN32) -#if defined(__AVX2__) || defined(__AVX__) -inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } -#endif -#endif - namespace paddle { namespace platform { From bc4f16ca6f4bcb938683086c0eb325366729cd25 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 17 Dec 2018 10:24:46 +0800 Subject: [PATCH 380/719] remove some comments --- paddle/fluid/API.spec | 4 ++-- python/paddle/fluid/distributed/helper.py | 6 +----- python/paddle/fluid/distributed/node.py | 4 ---- python/paddle/fluid/distributed/ps_instance.py | 3 --- 4 files changed, 3 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e156945147..fe2ee3f98d 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -37,14 +37,14 @@ paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=Non paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')) paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)) paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)) paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index cdde5403cd..06d3d0315c 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -28,7 +28,7 @@ class FileSystem(object): def __init__(self, fs_type="afs", - uri="afs://tianqi.afs.baidu.com:9902", + uri="afs://xx", user=None, passwd=None, hadoop_bin=""): @@ -37,10 +37,6 @@ class FileSystem(object): assert hadoop_bin != None import ps_pb2 as pslib self.fs_client = pslib.FsClientParameter() - #if fs_type == "afs": - # fs_client.fs_type = pslib.FsApiType.AFS - #else: - # fs_client.fs_type = pslib.FsApiType.HDFS self.fs_client.uri = uri self.fs_client.user = user self.fs_client.passwd = passwd diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index 117da9cff8..41e0d64e0b 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -75,8 +75,6 @@ class DownpourServer(Server): table.accessor.embedx_dim = 8 table.accessor.embedx_threshold = 5 table.accessor.fea_dim = 11 - #table.accessor.fea_dim = abs(reduce(lambda x, y: x * y, - # slot_value_var[0].shape, 1)) table.accessor.downpour_accessor_param.nonclk_coeff = 0.1 table.accessor.downpour_accessor_param.click_coeff = 2 table.accessor.downpour_accessor_param.base_threshold = 0.2 @@ -134,8 +132,6 @@ class DownpourWorker(Worker): def __init__(self, window): self.window = window self.worker_ = pslib.DownpourTrainerParameter() - #self.worker_.pull_dense_per_batch = window - #self.worker_.push_dense_per_batch = window def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_vars): diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index 91f53102b6..d3ce3ce693 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -59,9 +59,6 @@ class PaddlePSInstance(object): else: self._node_type = -1 - #if self._rankid == 0: - #print "node type: ", self._node_type - def _split_comm(self): if self.is_server(): self._comm = self.dh.comm.Split(self._node_type) From 5f0358add9767390b8bc329c97236f8d72ce758e Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Mon, 17 Dec 2018 16:18:10 +0800 Subject: [PATCH 381/719] async_executor stop add barrier_all & finalize --- python/paddle/fluid/async_executor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index bd32138651..3181654feb 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -237,6 +237,8 @@ class AsyncExecutor(object): if self.instance.is_first_worker(): self.executor.stop_server() self.instance.barrier_worker() #sync + self.instance.barrier_all() + self.instance.finalize() def init_server(self, dist_desc): """ From 4e3e68dfae0f72c848aea60771258b8600202368 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Mon, 17 Dec 2018 08:19:54 +0000 Subject: [PATCH 382/719] copy trt lib to inference lib test=develop --- cmake/inference_lib.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index c679d8507d..b2fb042f77 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -191,6 +191,13 @@ if (WITH_ANAKIN AND WITH_MKL) list(APPEND inference_deps anakin_inference_lib) endif () +if (TENSORRT_FOUND) + copy(tensorrt_lib DEPS ${inference_deps} + SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer* + DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib) +endif () + + set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* From 96604fda1016bd91c25ace7e7510f0a746ed3797 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 17 Dec 2018 16:59:20 +0800 Subject: [PATCH 383/719] fix gpu data test=develop --- paddle/fluid/operators/optimizers/adam_op.h | 3 ++- paddle/fluid/platform/for_range.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index e8b977e2d9..01d3d60054 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -361,9 +361,10 @@ class AdamOpKernel : public framework::OpKernel { if (lazy_mode) { std::vector id_vector; size_t row_count = grad_merge.rows().size(); + std::vector cpu_rows(grad_merge.rows()); for (size_t row_index = 0; row_index < row_count; ++row_index) { for (size_t offset = 0; offset < row_numel; ++offset) { - size_t i = rows[row_index] * row_numel + offset; + size_t i = cpu_rows[row_index] * row_numel + offset; id_vector.push_back(i); } } diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index a767bf9299..ab00d8b8f5 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -128,7 +128,7 @@ struct ForRangeIn { int grid_size = (range_.size() + num_threads - 1) / num_threads; ForRangeInElemwiseOp<<>>( - func, range_.data(), range_size); + func, range_.CUDAData(dev_ctx_.GetPlace()), range_size); } const CUDADeviceContext& dev_ctx_; From aa41ee75a16509cb16793d7fdbbbfa3ce2dab69f Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 17 Dec 2018 17:13:26 +0800 Subject: [PATCH 384/719] Accelerate PADDLE_ENFORCE --- paddle/fluid/framework/operator.h | 12 ++++-- paddle/fluid/platform/enforce.h | 68 +++++++++++++++++++------------ 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a6a28a5bc..63a8bc574f 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -49,6 +49,8 @@ constexpr char kTempVarName[] = "@TEMP@"; /// e.g. Variable "x@GRAD" is the gradient of varibale "x". constexpr char kGradVarSuffix[] = "@GRAD"; +constexpr size_t kGradVarSuffixSize = 5U; + /// Variables with this suffix are supposed to be filled up with zeros. constexpr char kZeroVarSuffix[] = "@ZERO"; @@ -60,7 +62,11 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@"; extern std::vector> kKernelPriority; inline std::string GradVarName(const std::string& var_name) { - return var_name + kGradVarSuffix; + std::string result; + result.reserve(var_name.size() + kGradVarSuffixSize); + result += var_name; + result += kGradVarSuffix; + return result; } proto::VarType::Type GetDataTypeOfVar(const Variable* var); @@ -101,8 +107,8 @@ class OperatorBase { bool HasAttr(const std::string& name) const { return attrs_.count(name); } template inline const T& Attr(const std::string& name) const { - PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", - name); + PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(), + "%s should be in AttributeMap", name); return boost::get(attrs_.at(name)); } const AttributeMap& Attrs() const { return attrs_; } diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 01ee67fd07..3c03a90279 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -140,68 +140,72 @@ struct EOFException : public std::exception { #define LIKELY(condition) (condition) #endif +inline bool is_error(bool stat) { return !stat; } + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { - if (UNLIKELY(!(stat))) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(string::Sprintf(args...)); + throw std::runtime_error(string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } } #ifdef PADDLE_WITH_CUDA +inline bool is_error(cudaError_t e) { return UNLIKELY(e); } + template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... args) { - if (UNLIKELY(e)) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(e, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(curandStatus_t stat) { + return stat != CURAND_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { - if (stat != CURAND_STATUS_SUCCESS) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cudnnStatus_t stat) { + return stat != CUDNN_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cudnnStatus_t stat, const Args&... args) { - if (stat == CUDNN_STATUS_SUCCESS) { - return; - } else { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cublasStatus_t stat) { + return stat != CUBLAS_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cublasStatus_t stat, const Args&... args) { std::string err; - if (stat == CUBLAS_STATUS_SUCCESS) { - return; - } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { err = "CUBLAS: not initialized, "; } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { err = "CUBLAS: alloc failed, "; @@ -254,11 +258,21 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) +#define PADDLE_JUDGE + +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(cond))) { \ + ::paddle::platform::throw_on_error(cond, ##__VA_ARGS__); \ + } \ + } while (0) + #ifndef REPLACE_ENFORCE_GLOG -#define PADDLE_ENFORCE(...) \ +#define PADDLE_ENFORCE(COND, ...) \ do { \ try { \ - ::paddle::platform::throw_on_error(__VA_ARGS__); \ + __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); \ } catch (...) { \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ @@ -266,7 +280,7 @@ inline void throw_on_error(T e) { } while (false) #else -#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); +#define PADDLE_ENFORCE(COND, ...) __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG #define PADDLE_THROW_EOF() \ From 27a0d6c2dc7a1fb26ec3bfc0b44840300685b993 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 17 Dec 2018 17:17:13 +0800 Subject: [PATCH 385/719] Polish code test=develop --- CMakeLists.txt | 1 - cmake/external/robin_map.cmake | 31 ------------------------------- paddle/fluid/framework/scope.h | 5 ----- python/paddle/fluid/profiler.py | 1 - 4 files changed, 38 deletions(-) delete mode 100644 cmake/external/robin_map.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b2e0ecf6c..1594e798a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -209,7 +209,6 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream -include(external/robin_map) # download tsl::robin_map if (NOT WIN32) # there is no official support of warpctc, nccl, cupti in windows diff --git a/cmake/external/robin_map.cmake b/cmake/external/robin_map.cmake deleted file mode 100644 index ddaf59536c..0000000000 --- a/cmake/external/robin_map.cmake +++ /dev/null @@ -1,31 +0,0 @@ -include(ExternalProject) - -set(ROBIN_MAP_SOURCE_DIR ${THIRD_PARTY_PATH}/robin_map) -set(ROBIN_MAP_INCLUDE_DIR ${ROBIN_MAP_SOURCE_DIR}/src/extern_robin_map/include) - -include_directories(${ROBIN_MAP_INCLUDE_DIR}) - -ExternalProject_Add( - extern_robin_map - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Tessil/robin-map.git" - GIT_TAG "v0.5.0" - PREFIX ${ROBIN_MAP_SOURCE_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) - -if(${CMAKE_VERSION} VERSION_LESS "3.3.0") - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/robin_map_dummy.c) - file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") - add_library(robin_map STATIC ${dummyfile}) -else() - add_library(robin_map INTERFACE) -endif() - -add_dependencies(robin_map extern_robin_map) - -LIST(APPEND externl_project_dependencies robin_map) diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 77ef18414d..9a715ac9b9 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -113,11 +113,6 @@ class Scope { mutable std::unordered_map, inner::KeyHasher> vars_; - // mutable tsl::robin_map< - // std::string, std::unique_ptr, std::hash, - // std::equal_to, - // std::allocator>>, true> - // vars_; private: // Call Scope::NewScope for a sub-scope. diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 78f7a6ac08..e05885f5f5 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -92,7 +92,6 @@ def cuda_profiler(output_file, output_mode=None, config=None): config_file = 'nvprof_config_file' with open(config_file, 'wb') as fp: fp.writelines([six.b("%s\n" % item) for item in config]) - #Comment this for nvprof core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. core.nvprof_start() From 49870f507de0d68fe23cd60479dab9da65d2d916 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 17 Dec 2018 18:52:21 +0800 Subject: [PATCH 386/719] delete unused code test=develop --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 1 + paddle/fluid/framework/details/multi_devices_graph_pass.cc | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 5a4f218077..59a0aef480 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -51,6 +51,7 @@ void AllReduceOpHandle::RunImpl() { // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, // this is a distributed or inter-process call, find a better way. #ifdef PADDLE_WITH_CUDA + // Find NCCL ID from the global scope. if (NoDummyInputSize() == 1 && local_scopes_[0]->FindVar(NCCL_ID_VARNAME) == nullptr) { #else diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 2ab7da2d57..5b82805ad9 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -134,7 +134,6 @@ static const char kParams[] = "params"; static const char kLocalScopes[] = "local_scopes"; static const char kStrategy[] = "strategy"; static const char kNumTrainers[] = "num_trainers"; -static const char kNumLossScaled[] = "num_loss_scaled"; void MultiDevSSAGraphBuilder::Init() const { all_vars_.clear(); From d3a4da5cf663a37d77af4670bdad85e06b32fae3 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 17 Dec 2018 18:53:44 +0800 Subject: [PATCH 387/719] fix comment test=develop --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 59a0aef480..6bca299813 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -109,7 +109,7 @@ void AllReduceOpHandle::RunImpl() { buffer, buffer, numel, static_cast(dtype), ncclSum, comm, stream)); // TODO(Yancey1989): synchronize here can get better performance - // if don't use NCCL group call, but need more profileing. + // if don't use NCCL group call, but need more profiling. if (local_scopes_.size() == 1UL) cudaStreamSynchronize(stream); }); } From 1a6d2cfe395fdc99b919513d45a042e119b92e11 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 17 Dec 2018 19:04:09 +0800 Subject: [PATCH 388/719] add test_analyzer_mobilenet test=develop --- paddle/fluid/inference/tests/api/CMakeLists.txt | 9 ++++++++- .../inference/tests/api/analyzer_vis_tester.cc | 14 -------------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 8a4bc04b67..5862fedb9a 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -85,12 +85,19 @@ if (NOT EXISTS ${OCR_INSTALL_DIR}) endif() inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) +# mobilenet with transpose +set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") +if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) + inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz") +endif() +inference_analysis_api_test(test_analyzer_mobilenet ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) + # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") # mobilenet with depthwise_conv op -inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet +inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") # anakin diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index adaa338e28..4700afdc86 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -93,20 +93,6 @@ void profile(bool use_mkldnn = false) { SetInput(&input_slots_all); TestPrediction(reinterpret_cast(&cfg), input_slots_all, &outputs, FLAGS_num_threads); - - if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { - const float ocr_result_data[] = { - 5.273636460856323538e-08, 3.296741795111302054e-07, - 1.873261190610264748e-08, 3.403730275408634043e-08, - 3.383312474625199684e-08}; - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); - size_t size = GetSize(outputs[0]); - PADDLE_ENFORCE_GT(size, 0); - float *result = static_cast(outputs[0].data.data()); - for (size_t i = 0; i < std::min(5UL, size); i++) { - EXPECT_NEAR(result[i], ocr_result_data[i], 1e-3); - } - } } TEST(Analyzer_vis, profile) { profile(); } From addded48e1c94e0299774012399bf4f4ab773544 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 17 Dec 2018 19:31:25 +0800 Subject: [PATCH 389/719] test=develop (#14898) --- paddle/fluid/operators/distributed/grpc_server.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index c3974138f4..cda102e78d 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -488,7 +488,7 @@ void AsyncGRPCServer::HandleRequest( while (true) { VLOG(4) << "HandleRequest " << rpc_name << " wait next"; if (!cq->Next(&tag, &ok)) { - VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!"; + LOG(WARNING) << "CompletionQueue " << rpc_name << " shutdown!"; break; } @@ -511,9 +511,8 @@ void AsyncGRPCServer::HandleRequest( // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I if (!ok) { - LOG(WARNING) << "completion queue:" << rpc_name - << " recv no regular event" - << " context:" << base->Status2String(rpc_name); + VLOG(4) << "completion queue:" << rpc_name << " recv no regular event" + << " context:" << base->Status2String(rpc_name); TryToRegisterNewOne(rpc_name, req_id); delete base; continue; From 1141db811455eadc6b44bbb3785b0510f1f51870 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 17 Dec 2018 19:32:32 +0800 Subject: [PATCH 390/719] update test_adam_op test=develop --- paddle/fluid/operators/optimizers/adam_op.h | 1 + .../fluid/tests/unittests/test_adam_op.py | 30 ++++++++++++------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 01d3d60054..8fc6689ff1 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -358,6 +358,7 @@ class AdamOpKernel : public framework::OpKernel { lr.template data(), grad_data, param.template data(), param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, grad_merge.rows().size(), lazy_mode); + VLOG(3) << "lazy_mode :" << lazy_mode; if (lazy_mode) { std::vector id_vector; size_t row_count = grad_merge.rows().size(); diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 461196689c..ff7fc5100e 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -219,14 +219,25 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad, moment2_out = np.zeros(shape=[height, row_numel]) param_out = np.zeros(shape=[height, row_numel]) - for idx, row_id in enumerate(rows): + def update_row(row_id, update_value): moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1 - ) * np_grad[idx] + ) * update_value moment2_out[row_id] = beta2 * moment2[row_id] + ( - 1 - beta2) * np.square(np_grad[idx]) + 1 - beta2) * np.square(update_value) lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / ( np.sqrt(moment2_out[row_id]) + epsilon)) + + if lazy_mode: + for idx, row_id in enumerate(rows): + update_row(row_id, np_grad[idx]) + else: + for row_id in range(param_out.shape[0]): + update_value = np.zeros(np_grad[0].shape).astype("float32") + if row_id in rows: + update_value = np_grad[rows.index(row_id)] + update_row(row_id, update_value) + return param_out, moment1_out, moment2_out @@ -249,6 +260,7 @@ class TestSparseAdamOp(unittest.TestCase): 'Beta2Pow': np.array([beta2**10]).astype("float32"), "LearningRate": np.full((1), 2.0).astype("float32") } + self.init_output = np.full((height, row_numel), 0.0).astype("float32") self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} grad_selected_rows = scope.var('Grad').get_selected_rows() @@ -286,7 +298,7 @@ class TestSparseAdamOp(unittest.TestCase): op_args[s] = s for s in self.outputs: var = scope.var(s).get_tensor() - var.set(self.outputs[s], place) + var.set(self.init_output, place) op_args[s] = s for k in self.attrs: op_args[k] = self.attrs[k] @@ -300,13 +312,9 @@ class TestSparseAdamOp(unittest.TestCase): actual = np.array(out_var) actual = actual.reshape([actual.size]) np_array = np_array.reshape([np_array.size]) - for idx, row_id in enumerate(self.rows): - j = 0 - while j < self.row_numel: - pos = row_id * self.row_numel + j - self.assertLess((actual[pos] - np_array[pos]) / actual[pos], - 0.00001) - j += 1 + + for i in range(np_array.size): + self.assertLess((actual[i] - np_array[i]), 0.00001) def test_sparse_adam(self): places = [core.CPUPlace()] From 64a90b2f1c762dc4a093da413b9c945c99b82e73 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 17 Dec 2018 12:29:22 +0000 Subject: [PATCH 391/719] use vadd, vaddrelu, lstm and gru jitkernel --- paddle/fluid/operators/fused/fusion_gru_op.cc | 58 ++++++++--------- .../fluid/operators/fused/fusion_lstm_op.cc | 62 ++++++++++--------- paddle/fluid/operators/math/CMakeLists.txt | 9 --- paddle/fluid/operators/math/fc_compute.h | 14 ++--- 4 files changed, 68 insertions(+), 75 deletions(-) diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 25b7ae7c28..d44a7ad83e 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_gru_op.h" #include // for memcpy #include +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/fc_compute.h" -#include "paddle/fluid/operators/math/jit_kernel.h" #include "paddle/fluid/operators/math/sequence2batch.h" namespace paddle { @@ -183,27 +183,29 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const math::jitkernel::gru_attr_t attr( \ - D, ctx.Attr("gate_activation"), \ - ctx.Attr("activation")); \ - math::jitkernel::gru_t one_step; \ - const auto& ker = \ - math::jitkernel::KernelPool::Instance() \ - .template Get, \ - const math::jitkernel::gru_attr_t&>(attr); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const jit::gru_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("activation"))); \ + jit::gru_t one_step; \ + auto ComputeH1 = \ + jit::Get(attr); \ + auto ComputeHtPart1 = \ + jit::Get(attr); \ + auto ComputeHtPart2 = \ + jit::Get(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { @@ -242,7 +244,7 @@ class FusionGRUKernel : public framework::OpKernel { } else { one_step.gates = xx_data; one_step.ht = hidden_out_data; - ker->ComputeH1(&one_step, &attr); + ComputeH1(&one_step, &attr); prev_hidden_data = hidden_out_data; tstart = 1; move_step(); @@ -255,12 +257,12 @@ class FusionGRUKernel : public framework::OpKernel { one_step.gates = xx_data; one_step.ht_1 = prev_hidden_data; one_step.ht = hidden_out_data; - ker->ComputeHtPart1(&one_step, &attr); + ComputeHtPart1(&one_step, &attr); // gemm rt * Ws blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast(1), hidden_out_data, D, wh_state_data, D, static_cast(1), xx_data + D2, D3); - ker->ComputeHtPart2(&one_step, &attr); + ComputeHtPart2(&one_step, &attr); // save prev prev_hidden_data = hidden_out_data; move_step(); @@ -324,7 +326,7 @@ class FusionGRUKernel : public framework::OpKernel { for (int i = 0; i < max_bs; ++i) { one_step.gates = cur_in_data; one_step.ht = cur_out_data; - ker->ComputeH1(&one_step, &attr); + ComputeH1(&one_step, &attr); // add offset cur_in_data += D3; cur_out_data += D; @@ -352,7 +354,7 @@ class FusionGRUKernel : public framework::OpKernel { one_step.gates = cur_batched_data; one_step.ht_1 = cur_prev_hidden_data; one_step.ht = cur_out_data; - ker->ComputeHtPart1(&one_step, &attr); + ComputeHtPart1(&one_step, &attr); cur_batched_data += D3; cur_prev_hidden_data += D; @@ -370,7 +372,7 @@ class FusionGRUKernel : public framework::OpKernel { one_step.gates = cur_batched_data; one_step.ht_1 = cur_prev_hidden_data; one_step.ht = cur_out_data; - ker->ComputeHtPart2(&one_step, &attr); + ComputeHtPart2(&one_step, &attr); cur_batched_data += D3; cur_prev_hidden_data += D; cur_out_data += D; diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 8021a896ce..a62f4d18c2 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_lstm_op.h" #include +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/fc_compute.h" -#include "paddle/fluid/operators/math/jit_kernel.h" #include "paddle/fluid/operators/math/sequence2batch.h" namespace paddle { @@ -236,31 +236,33 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const math::jitkernel::lstm_attr_t attr( \ - D, ctx.Attr("gate_activation"), \ - ctx.Attr("candidate_activation"), \ - ctx.Attr("cell_activation"), use_peepholes); \ - math::jitkernel::lstm_t one_step; \ - one_step.wp = wp_data; \ - one_step.checked = checked_cell_data; \ - const auto& ker = \ - math::jitkernel::KernelPool::Instance() \ - .template Get, \ - const math::jitkernel::lstm_attr_t&>(attr) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const jit \ + : lstm_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("candidate_activation")), \ + jit::to_kerneltype(ctx.Attr("cell_activation")), \ + use_peepholes); \ + math::jitkernel::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + auto ComputeC1H1 = \ + jit::Get(attr); \ + auto ComputeCtHt = \ + jit::Get(attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ @@ -306,7 +308,7 @@ class FuisonLSTMKernel : public framework::OpKernel { one_step.gates = xx_data; one_step.ct = c_out_data; one_step.ht = h_out_data; - ker->ComputeC1H1(&one_step, &attr); + ComputeC1H1(&one_step, &attr); tstart = 1; // move one step prev_h_data = h_out_data; @@ -322,7 +324,7 @@ class FuisonLSTMKernel : public framework::OpKernel { one_step.ct_1 = prev_c_data; one_step.ct = c_out_data; one_step.ht = h_out_data; - ker->ComputeCtHt(&one_step, &attr); + ComputeCtHt(&one_step, &attr); // move one step prev_h_data = h_out_data; prev_c_data = c_out_data; @@ -402,7 +404,7 @@ class FuisonLSTMKernel : public framework::OpKernel { one_step.gates = cur_in_data; one_step.ct = cur_c_out_data; one_step.ht = cur_h_out_data; - ker->ComputeC1H1(&one_step, &attr); + ComputeC1H1(&one_step, &attr); cur_in_data += D4; cur_c_out_data += D; @@ -432,7 +434,7 @@ class FuisonLSTMKernel : public framework::OpKernel { one_step.ct_1 = cur_prev_c_data; one_step.ct = cur_c_out_data; one_step.ht = cur_h_out_data; - ker->ComputeCtHt(&one_step, &attr); + ComputeC1H1(&one_step, &attr); // move one batch cur_in_data += D4; diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 8e8f83a635..ea6aebd291 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -73,12 +73,3 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) - -# set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc) -# set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) -# if(WITH_XBYAK) -# list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) -# list(APPEND JIT_KERNEL_DEPS xbyak) -# endif() -# cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) -# cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index 5b9953a5aa..5e3093c69d 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/jit_kernel.h" namespace paddle { namespace operators { @@ -30,22 +30,20 @@ inline void FCCompute(const BlasT& blas, const int M, return; } if (relu) { - const auto& vaddrelu = jitkernel::KernelPool::Instance() - .template Get>(N); + auto compute = + jit::Get(N); for (int i = 0; i < M; i++) { T* dst = Y + i * N; - vaddrelu->Compute(B, dst, dst, N); + compute(B, dst, dst, N); } } else { - const auto& vadd = jitkernel::KernelPool::Instance() - .template Get>(N); - + auto compute = jit::Get(N); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif for (int i = 0; i < M; i++) { T* dst = Y + i * N; - vadd->Compute(B, dst, dst, N); + compute(B, dst, dst, N); } } } From 720b55cbcff16832671873e409b5aa41b1cec1ef Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 17 Dec 2018 12:30:18 +0000 Subject: [PATCH 392/719] enable crf decoding and layer norm refer code --- paddle/fluid/operators/crf_decoding_op.h | 9 +-- paddle/fluid/operators/jit/helper.cc | 4 + paddle/fluid/operators/jit/kernel_base.h | 19 ++++- .../fluid/operators/jit/refer/CMakeLists.txt | 2 + paddle/fluid/operators/jit/refer/refer.cc | 3 + paddle/fluid/operators/jit/refer/refer.h | 80 +++++++++++++++++++ paddle/fluid/operators/jit/test.cc | 2 +- paddle/fluid/operators/layer_norm_op.h | 14 ++-- 8 files changed, 119 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index e9d2e84a43..860d71e1fe 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/jit_kernel.h" +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/math_function.h" namespace paddle { @@ -82,10 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { Tensor track; int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); - const auto& ker = math::jitkernel::KernelPool::Instance() - .template Get>( - static_cast(tag_num)); - ker->Compute(static_cast(seq_len), x, w, alpha_value, track_value); + auto ker = jit::Get( + tag_num); + ker(static_cast(seq_len), x, w, alpha_value, track_value, tag_num); T max_score = -std::numeric_limits::max(); int max_i = 0; for (size_t i = 0; i < tag_num; ++i) { diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 0543b0743c..a0ff82043f 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -42,6 +42,8 @@ const char* to_string(KernelType kt) { ONE_CASE(gruh1); ONE_CASE(gruhtpart1); ONE_CASE(gruhtpart2); + ONE_CASE(crfdecoding); + ONE_CASE(layernorm); default: PADDLE_THROW("Not support type: %d", kt); return "NOT JITKernel"; @@ -64,6 +66,8 @@ KernelType to_kerneltype(const std::string& act) { } else if (lower == "tanh" || lower == "vtanh") { return vtanh; } + PADDLE_THROW("Not support type: %s, or forget to add this case", act); + return non_kernel; } diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index f10d9f3fdd..59531c2f17 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -37,7 +37,9 @@ typedef enum { lstmc1h1, gruh1, gruhtpart1, - gruhtpart2 + gruhtpart2, + crfdecoding, + layernorm } KernelType; template @@ -109,6 +111,21 @@ struct GRUTuples { typedef void (*func_type)(gru_t*, const gru_attr_t*); }; +template +struct CRFDecodingTuples { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const int, const T*, const T*, T*, int*, int); +}; + +template +struct LayerNormTuples { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(T*, T*, T*, T*, const T*, const T*, int, + const float, int); +}; + // Just for adding to kernel pool without template class Kernel { public: diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index 78d1cb8f9a..f3a0e9b11f 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -23,3 +23,5 @@ USE_JITKERNEL_REFER(lstmc1h1) USE_JITKERNEL_REFER(gruh1) USE_JITKERNEL_REFER(gruhtpart1) USE_JITKERNEL_REFER(gruhtpart2) +USE_JITKERNEL_REFER(crfdecoding) +USE_JITKERNEL_REFER(layernorm) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index c99174a66f..00daa0d478 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -42,4 +42,7 @@ REGISTER_REFER_KERNEL(gruh1, GRUH1); REGISTER_REFER_KERNEL(gruhtpart1, GRUHtPart1); REGISTER_REFER_KERNEL(gruhtpart2, GRUHtPart2); +REGISTER_REFER_KERNEL(crfdecoding, CRFDecoding); +REGISTER_REFER_KERNEL(layernorm, LayerNorm); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index a9a6ffbccd..5780ea05bd 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -13,6 +13,9 @@ * limitations under the License. */ #pragma once + +#include +#include #include "paddle/fluid/operators/jit/helper.h" #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/enforce.h" @@ -242,6 +245,80 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { } } +template +void CRFDecoding(const int seq_len, const T* x, const T* w, T* alpha, + int* track, int right) { + constexpr int state_trans_base_idx = 2; + for (int i = 0; i < right; ++i) { + alpha[i] = w[i] + x[i]; + } + for (int k = 1; k < seq_len; ++k) { + for (int i = 0; i < right; ++i) { + T max_score = -std::numeric_limits::max(); + int max_j = 0; + for (int j = 0; j < right; ++j) { + T score = alpha[(k - 1) * right + j] + + w[(j + state_trans_base_idx) * right + i]; + if (score > max_score) { + max_score = score; + max_j = j; + } + } + alpha[k * right + i] = max_score + x[k * right + i]; + track[k * right + i] = max_j; + } + } +} + +template +void LayerNorm(T* x, T* out, T* mean, T* var, const T* scale, const T* bias, + int height, const float epsilon, int right) { + // get mean + for (int i = 0; i < height; i++) { + T sum = 0.0; + int offset = i * right; + for (int j = 0; j < right; j++) { + sum += x[offset + j]; + } + mean[i] = sum / right; + } + + // get variance + for (int i = 0; i < height; i++) { + T sum = 0.0; + int offset = i * right; + for (int j = 0; j < right; j++) { + sum += (x[offset + j] - mean[i]) * (x[offset + j] - mean[i]); + } + var[i] = sum / right; + } + + for (int i = 0; i < height; i++) { + int offset = i * right; + T sqrt_var = std::sqrt(var[i] + (T)epsilon); + for (int j = 0; j < right; j++) { + out[offset + j] = (x[offset + j] - mean[i]) / sqrt_var; + } + } + if (scale) { + for (int i = 0; i < height; i++) { + int offset = i * right; + for (int j = 0; j < right; j++) { + out[offset + j] *= scale[j]; + } + } + } + + if (bias) { + for (int i = 0; i < height; i++) { + int offset = i * right; + for (int j = 0; j < right; j++) { + out[offset + j] += bias[j]; + } + } + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -275,6 +352,9 @@ DECLARE_REFER_KERNEL(GRUH1, GRUTuples); DECLARE_REFER_KERNEL(GRUHtPart1, GRUTuples); DECLARE_REFER_KERNEL(GRUHtPart2, GRUTuples); +DECLARE_REFER_KERNEL(CRFDecoding, CRFDecodingTuples); +DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 36f8eb6e7b..85eadea751 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -515,7 +515,7 @@ TEST(JITKernel, gruhtpart2) { TestGRUKernel(); } -// TODO(TJ): refine the tests template +// TODO(yihua/TJ): add crf decoding and layer norm unit tests TEST(JITKernel, pool) { // TODO(TJ): add some test diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index 78d20ddf5f..bb00ed4729 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/blas.h" #if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) -#include "paddle/fluid/operators/math/jit_kernel.h" +#include "paddle/fluid/operators/jit/kernels.h" #endif #include "paddle/fluid/operators/math/math_function.h" @@ -229,12 +229,12 @@ class LayerNormKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(scale->numel(), right); PADDLE_ENFORCE_EQ(bias->numel(), right); - const auto& ker = math::jitkernel::KernelPool::Instance() - .template Get>( - static_cast(right)); - ker->Compute(x.data(), out.data(), mean->data(), var->data(), - scale->data(), bias->data(), static_cast(left), - static_cast(epsilon)); + auto ker = + jit::Get( + right); + ker(x.data(), out.data(), mean->data(), var->data(), + scale->data(), bias->data(), static_cast(left), + static_cast(epsilon), right); #endif } }; From 050a68dde38611b226207ae5840e8009ff44bc2a Mon Sep 17 00:00:00 2001 From: nhzlx Date: Mon, 17 Dec 2018 13:19:59 +0000 Subject: [PATCH 393/719] fix comments test=develop --- .../inference/analysis/ir_passes/tensorrt_subgraph_pass.cc | 3 --- paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc | 1 - paddle/fluid/operators/tensorrt/tensorrt_engine_op.h | 2 -- 3 files changed, 6 deletions(-) diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 4ffe5f575c..9c42b83e7a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -63,7 +63,6 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, Graph *graph) const { auto *op_desc = node->Op(); - static int counter{0}; auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); @@ -192,8 +191,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, block_desc.Proto()->SerializeAsString()); SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); - SetAttr(op_desc->Proto(), "engine_uniq_key", - "trt-" + std::to_string(counter++)); SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); } diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index f1ab59e397..b993c55fad 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -29,7 +29,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Xs", "A list of inputs.").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); - AddAttr("engine_uniq_key", "unique key for the TRT engine."); AddAttr("max_batch_size", "the maximum batch size."); AddAttr("workspace_size", "the workspace size."); AddComment("TensorRT engine operator."); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index c19c315f79..88c4f50847 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -65,7 +65,6 @@ using inference::tensorrt::TensorRTEngine; class TensorRTEngineOp : public framework::OperatorBase { private: - std::string engine_name_; std::vector input_names_; std::unordered_set param_names_; mutable std::unique_ptr trt_engine_; @@ -78,7 +77,6 @@ class TensorRTEngineOp : public framework::OperatorBase { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : framework::OperatorBase(type, inputs, outputs, attrs) { - engine_name_ = Attr("engine_uniq_key"); input_names_ = Inputs("Xs"); max_batch_size_ = Attr("max_batch_size"); workspace_size_ = Attr("workspace_size"); From 2373aeb5e845823ef30b0bfd303e93aa00798295 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 17 Dec 2018 21:41:08 +0800 Subject: [PATCH 394/719] fix bug test=develop --- paddle/fluid/platform/stream_callback_manager.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 466c77469e..5a9e24374f 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -19,7 +19,7 @@ namespace paddle { namespace platform { #if CUDA_VERSION >= 10000 -static void CUDART_CB StreamCallbackFunc(void *user_data); +static void CUDART_CB StreamCallbackFunc(void *user_data) #else static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void *user_data) From fd152289fa694b99704e4821a71b0c1f160896aa Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 17 Dec 2018 22:14:11 +0800 Subject: [PATCH 395/719] clean for range in test=develop --- paddle/fluid/operators/optimizers/adam_op.h | 14 +++--- paddle/fluid/platform/for_range.h | 52 --------------------- 2 files changed, 6 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 8fc6689ff1..4f212bb69a 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -227,8 +227,10 @@ struct SparseAdamFunctor { inline HOSTDEVICE void operator()(size_t i) const { auto row_idx = math::BinarySearch(rows_, row_count_, i / row_numel_); - T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; - adam_update(i, g); + if (!(lazy_mode_ && row_idx < 0)) { + T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; + adam_update(i, g); + } } }; @@ -359,19 +361,15 @@ class AdamOpKernel : public framework::OpKernel { param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, grad_merge.rows().size(), lazy_mode); VLOG(3) << "lazy_mode :" << lazy_mode; - if (lazy_mode) { - std::vector id_vector; + if (lazy_mode && platform::is_cpu_place(ctx.GetPlace())) { size_t row_count = grad_merge.rows().size(); std::vector cpu_rows(grad_merge.rows()); for (size_t row_index = 0; row_index < row_count; ++row_index) { for (size_t offset = 0; offset < row_numel; ++offset) { size_t i = cpu_rows[row_index] * row_numel + offset; - id_vector.push_back(i); + functor.adam_update(i, grad_data[row_index * row_numel + offset]); } } - platform::ForRangeIn for_range_in( - static_cast(ctx.device_context()), id_vector); - for_range_in(functor); } else { platform::ForRange for_range( static_cast(ctx.device_context()), diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index ab00d8b8f5..910d1669f2 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -22,29 +22,6 @@ limitations under the License. */ namespace paddle { namespace platform { -template -struct ForRangeIn { - ForRangeIn(const DeviceContext& dev_ctx, std::vector range); - - template - void operator()(Function func) const; -}; - -template <> -struct ForRangeIn { - ForRangeIn(const CPUDeviceContext& dev_ctx, std::vector range) - : range_(range) {} - - template - void operator()(Function func) const { - for (auto i : range_) { - func(i); - } - } - - std::vector range_; -}; - template struct ForRange { ForRange(const DeviceContext& dev_ctx, size_t limit); @@ -106,35 +83,6 @@ struct ForRange { int limit_; }; -template -__global__ static void ForRangeInElemwiseOp(Function func, T* vector, - int vector_size) { - size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (idx < vector_size) { - func(vector[idx]); - } -} - -template <> -struct ForRangeIn { - ForRangeIn(const CUDADeviceContext& dev_ctx, std::vector range) - : dev_ctx_(dev_ctx), range_(range) {} - - template - inline void operator()(Function func) const { - constexpr int num_threads = 1024; - int range_size = range_.size(); - int block_size = range_size <= num_threads ? range_size : num_threads; - int grid_size = (range_.size() + num_threads - 1) / num_threads; - - ForRangeInElemwiseOp<<>>( - func, range_.CUDAData(dev_ctx_.GetPlace()), range_size); - } - - const CUDADeviceContext& dev_ctx_; - framework::Vector range_; -}; - #endif } // namespace platform From 56686d0f34db008238095331b6f981d8f4e5d3d4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 17 Dec 2018 22:16:52 +0800 Subject: [PATCH 396/719] clean code test=develop --- paddle/fluid/platform/for_range.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index 910d1669f2..c153e80fe4 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -13,10 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - -#include - -#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { From 74292f414c033bf7cb53b4f87a82f7ff6c18a4b2 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 17 Dec 2018 14:51:52 +0000 Subject: [PATCH 397/719] enable eltwise nchw16c mul nc --- .../elementwise/elementwise_mul_mkldnn_op.cc | 10 ++- paddle/fluid/operators/jit/gen/CMakeLists.txt | 1 + paddle/fluid/operators/jit/gen/blas.cc | 43 +++++++++++++ paddle/fluid/operators/jit/gen/blas.h | 12 ++++ paddle/fluid/operators/jit/helper.cc | 3 +- paddle/fluid/operators/jit/helper.h | 1 + paddle/fluid/operators/jit/kernel_base.h | 11 +++- .../fluid/operators/jit/refer/CMakeLists.txt | 1 + paddle/fluid/operators/jit/refer/refer.cc | 2 + paddle/fluid/operators/jit/refer/refer.h | 15 +++++ paddle/fluid/operators/jit/test.cc | 62 +++++++++++++++++++ 11 files changed, 153 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index c600d1e3d7..71f4b71330 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" -#include "paddle/fluid/operators/math/jit_kernel.h" +#include "paddle/fluid/operators/jit/kernels.h" #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" @@ -108,10 +108,8 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { constexpr int simd_width = 16; int C = c / simd_width; - const auto& multiply = - math::jitkernel::KernelPool::Instance() - .template Get>(n); - + auto multiply = jit::Get(0); #pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { @@ -122,7 +120,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { auto ptr_z = z_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - multiply->Compute(ptr_x, ptr_y, ptr_z, h, w); + multiply(ptr_x, ptr_y, ptr_z, h, w); } } } diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 8ad9587b5e..a7f9e18318 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -25,3 +25,4 @@ USE_JITKERNEL_GEN(lstmc1h1) USE_JITKERNEL_GEN(gruh1) USE_JITKERNEL_GEN(gruhtpart1) USE_JITKERNEL_GEN(gruhtpart2) +USE_JITKERNEL_GEN(nchw16cmulnc) diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index b24f44c9f3..65b9a52ff2 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -104,6 +104,48 @@ void VXXJitCode::genCode() { ret(); } +void NCHW16CMulNCJitCode::genCode() { + // RDI is ptr x_input + // RSI is ptr y_input + // RDX is ptr output + // RCX is height + // r8 is width + + push(rbx); + + xor_(rax, rax); + xor_(r10, r10); + vmovups(zmm3, ptr[rsi]); + + L("h_loop"); + xor_(rbx, rbx); + L("w_loop"); + vmovups(zmm2, ptr[rdi + rax]); + vmulps(zmm1, zmm2, zmm3); + vmovups(ptr[rdx + rax], zmm1); + add(rax, 64); + inc(rbx); + cmp(r8, rbx); + jnz("w_loop"); + inc(r10); + cmp(r10, rcx); + jnz("h_loop"); + + pop(rbx); + ret(); +} + +class NCHW16CMulNCCreator : public JitCodeCreator { + public: + bool UseMe(const int& attr) const override { + return platform::MayIUse(platform::avx512f); + } + size_t CodeSize(const int& d) const override { return 256 * 1024; } + std::unique_ptr CreateJitCode(const int& attr) const override { + return make_unique(attr, CodeSize(attr)); + } +}; + #define DECLARE_BLAS_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ @@ -141,3 +183,4 @@ REGISTER_JITKERNEL_GEN(vadd, gen::VAddCreator); REGISTER_JITKERNEL_GEN(vaddrelu, gen::VAddReluCreator); REGISTER_JITKERNEL_GEN(vscal, gen::VScalCreator); REGISTER_JITKERNEL_GEN(vaddbias, gen::VAddBiasCreator); +REGISTER_JITKERNEL_GEN(nchw16cmulnc, gen::NCHW16CMulNCCreator); diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h index 5a2192052f..29be4e7358 100644 --- a/paddle/fluid/operators/jit/gen/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -99,6 +99,18 @@ DECLARE_BLAS_JITCODE(VAddBias, operand_type::add, 1, false); #undef DECLARE_BLAS_JITCODE +// nChw16c = nChw16c .* NC +class NCHW16CMulNCJitCode : public JitCode { + public: + DECLARE_JIT_CODE(NCHW16CMulNCJitCode); + explicit NCHW16CMulNCJitCode(int d /*unused*/, size_t code_size, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr) { + this->genCode(); + } + void genCode() override; +}; + } // namespace gen } // namespace jit } // namespace operators diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index a0ff82043f..a1bb51fa66 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -44,8 +44,9 @@ const char* to_string(KernelType kt) { ONE_CASE(gruhtpart2); ONE_CASE(crfdecoding); ONE_CASE(layernorm); + ONE_CASE(nchw16cmulnc); default: - PADDLE_THROW("Not support type: %d", kt); + PADDLE_THROW("Not support type: %d, or forget to add it.", kt); return "NOT JITKernel"; } return nullptr; diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 44952fb907..275170ca2b 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -93,6 +93,7 @@ inline typename KernelTuples::func_type GetRefer() { template +// TODO(TJ): const & attr typename KernelTuples::func_type Get(typename KernelTuples::attr_type attr) { auto jitfunc = GetJitCode(attr); if (jitfunc) { diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 59531c2f17..9ba0a95831 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -39,7 +39,8 @@ typedef enum { gruhtpart1, gruhtpart2, crfdecoding, - layernorm + layernorm, + nchw16cmulnc, } KernelType; template @@ -126,6 +127,14 @@ struct LayerNormTuples { const float, int); }; +// nChw16c = nChw16c .* NC +template +struct NCHW16CMulNCTuples { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, const T*, T*, int, int); +}; + // Just for adding to kernel pool without template class Kernel { public: diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index f3a0e9b11f..86432bfffe 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -25,3 +25,4 @@ USE_JITKERNEL_REFER(gruhtpart1) USE_JITKERNEL_REFER(gruhtpart2) USE_JITKERNEL_REFER(crfdecoding) USE_JITKERNEL_REFER(layernorm) +USE_JITKERNEL_REFER(nchw16cmulnc) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 00daa0d478..1aee6ff950 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -45,4 +45,6 @@ REGISTER_REFER_KERNEL(gruhtpart2, GRUHtPart2); REGISTER_REFER_KERNEL(crfdecoding, CRFDecoding); REGISTER_REFER_KERNEL(layernorm, LayerNorm); +REGISTER_REFER_KERNEL(nchw16cmulnc, NCHW16CMulNC); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 5780ea05bd..6f72c2b724 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -319,6 +319,19 @@ void LayerNorm(T* x, T* out, T* mean, T* var, const T* scale, const T* bias, } } +template +void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) { + int offset = 0; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + for (int i = 0; i < 16; ++i) { + z[i + offset] = y[i] * x[i + offset]; + } + offset += ZMM_FLOAT_BLOCK; + } + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -355,6 +368,8 @@ DECLARE_REFER_KERNEL(GRUHtPart2, GRUTuples); DECLARE_REFER_KERNEL(CRFDecoding, CRFDecodingTuples); DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples); +DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 85eadea751..32937d9c00 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -19,6 +19,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/place.h" template @@ -414,6 +415,59 @@ void TestGRUKernel() { } } +template +void TestNCHW16CMulNCKernel() { + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + const int n = 3, c = 16 * 4, h = 10, w = 10; + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + int sz = n * c * h * w; + std::vector x(sz), y(n * c), zref(sz); + std::vector ztgt(sz), zjit(sz); + RandomVec(sz, x.data(), -2.f, 2.f); + RandomVec(n * c, y.data(), -2.f, 2.f); + + const T* x_data = x.data(); + const T* y_data = y.data(); + T* zref_data = zref.data(); + T* ztgt_data = ztgt.data(); + T* zjit_data = zjit.data(); + constexpr int simd_width = ZMM_FLOAT_BLOCK; + int C = c / simd_width; + auto tgt = jit::Get, PlaceType>(0); + auto jitcode = jit::GetJitCode, PlaceType>(0); + EXPECT_TRUE(tgt != nullptr); + + if (std::is_same::value && + paddle::platform::MayIUse(paddle::platform::avx512f)) { + EXPECT_TRUE(jitcode != nullptr); + } + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < C; ci++) { + auto ptr_x = + x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; + auto ptr_zref = + zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + auto ptr_ztgt = + ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + + ref(ptr_x, ptr_y, ptr_zref, h, w); + tgt(ptr_x, ptr_y, ptr_ztgt, h, w); + + if (jitcode) { + auto ptr_zjit = + zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + jitcode(ptr_x, ptr_y, ptr_zjit, h, w); + } + } + } + ExpectEQ(ztgt_data, zref_data, sz); + if (jitcode) { + ExpectEQ(zjit_data, zref_data, sz); + } +} + // XYZNTuple TEST(JITKernel, vmul) { namespace jit = paddle::operators::jit; @@ -515,6 +569,14 @@ TEST(JITKernel, gruhtpart2) { TestGRUKernel(); } +TEST(JITKernel, nchw16cmulnc) { + namespace jit = paddle::operators::jit; + TestNCHW16CMulNCKernel(); + TestNCHW16CMulNCKernel(); +} + // TODO(yihua/TJ): add crf decoding and layer norm unit tests TEST(JITKernel, pool) { From fe3995d33527e8503739b6de3dd555fa3ad35073 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 18 Dec 2018 07:15:42 +0800 Subject: [PATCH 398/719] refine code test=develop --- paddle/fluid/operators/optimizers/adam_op.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 4f212bb69a..f214d8272f 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -227,7 +227,9 @@ struct SparseAdamFunctor { inline HOSTDEVICE void operator()(size_t i) const { auto row_idx = math::BinarySearch(rows_, row_count_, i / row_numel_); - if (!(lazy_mode_ && row_idx < 0)) { + if (lazy_mode_ && row_idx < 0) { + return; + } else { T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; adam_update(i, g); } From 52bc4ee75adf64e449dfdbbdbbe3e41cdc593bdc Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 16 Dec 2018 20:27:17 +0800 Subject: [PATCH 399/719] delay infer scope test=develop --- paddle/fluid/framework/operator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a62afe248b..86e1713b02 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -703,8 +703,6 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - RuntimeInferShapeContext infer_shape_ctx(*this, scope); - this->InferShape(&infer_shape_ctx); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -758,6 +756,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(expected_kernel_key.place_); } + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope); + this->InferShape(&infer_shape_ctx); kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); if (!transfered_inplace_vars.empty()) { From bbff0df320f0f68634a5ae3c4d9507b52a1134f7 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 16 Dec 2018 21:49:25 +0800 Subject: [PATCH 400/719] try cache variables test=develop --- paddle/fluid/framework/ngraph_operator.cc | 15 +++++++- paddle/fluid/framework/operator.cc | 47 ++++++++++++++++------- paddle/fluid/framework/operator.h | 22 ++++++++--- paddle/fluid/framework/type_defs.h | 3 ++ 4 files changed, 66 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index e2cdfc845f..e37f0915c5 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -278,7 +278,20 @@ std::shared_ptr NgraphEngine::backend_ = ngraph::runtime::Backend::create("CPU"); void NgraphEngine::GetNgInputShape(std::shared_ptr op) { - op->RuntimeInferShape(scope_, place_); + RuntimeContext ctx; + for (auto& var_name_item : op->Inputs()) { + std::vector input_vars = ctx.inputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + input_vars.push_back(scope_.FindVar(var_name)); + } + } + for (auto& var_name_item : op->Outputs()) { + std::vector output_vars = ctx.outputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + output_vars.push_back(scope_.FindVar(var_name)); + } + } + op->RuntimeInferShape(scope_, place_, ctx); for (auto& var_name_item : op->Inputs()) { for (auto& var_name : var_name_item.second) { auto* var = scope_.FindVar(var_name); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 86e1713b02..79e3d29a63 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -477,23 +477,22 @@ bool OpSupportGPU(const std::string& op_type) { class RuntimeInferShapeContext : public InferShapeContext { public: - RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) - : op_(op), scope_(scope) {} + RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope, + const RuntimeContext& ctx) + : op_(op), scope_(scope), ctx_(ctx) {} bool HasInput(const std::string& name) const override { // has only one input - const auto& ins = op_.Inputs(); + const auto& ins = ctx_.inputs; auto it = ins.find(name); if (it == ins.end()) { return false; } const auto& in = it->second; - if (in.size() == 0 || in[0] == kEmptyVarName) { - return false; - } + if (in.size() == 0) return false; PADDLE_ENFORCE_EQ(in.size(), 1UL, "Input %s should not have more than one inputs", name); - return scope_.FindVar(in[0]) != nullptr; + return in[0] != nullptr; } bool HasOutput(const std::string& name) const override { @@ -678,6 +677,7 @@ class RuntimeInferShapeContext : public InferShapeContext { private: const OperatorBase& op_; const Scope& scope_; + const RuntimeContext& ctx_; }; static void CheckTensorNANOrInf(const std::string& name, @@ -696,8 +696,9 @@ static void CheckTensorNANOrInf(const std::string& name, } void OperatorWithKernel::RuntimeInferShape(const Scope& scope, - const platform::Place& place) const { - RuntimeInferShapeContext infer_shape_ctx(*this, scope); + const platform::Place& place, + const RuntimeContext& ctx) const { + RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx); this->InferShape(&infer_shape_ctx); } @@ -743,10 +744,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, KernelTypeToString(expected_kernel_key)); } + RuntimeContext ctx; // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; auto* transfer_scope = - TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); + PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx); // exec scope is the scope that kernel actually executed on. const Scope& exec_scope = @@ -756,7 +758,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(expected_kernel_key.place_); } - RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope); + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); this->InferShape(&infer_shape_ctx); kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); @@ -797,13 +799,20 @@ void OperatorWithKernel::TransferInplaceVarsBack( } } -Scope* OperatorWithKernel::TryTransferData( +Scope* OperatorWithKernel::PrepareData( const Scope& scope, const OpKernelType& expected_kernel_key, - std::vector* transfered_inplace_vars) const { + std::vector* transfered_inplace_vars, + RuntimeContext* ctx) const { Scope* new_scope = nullptr; for (auto& var_name_item : Inputs()) { - for (auto& var_name : var_name_item.second) { + std::vector& input_vars = ctx->inputs[var_name_item.first]; + input_vars.resize(var_name_item.second.size()); + + for (size_t i = 0; i < var_name_item.second.size(); ++i) { + auto& var_name = var_name_item.second[i]; auto* var = scope.FindVar(var_name); + input_vars[i] = var; + // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { continue; @@ -851,12 +860,22 @@ Scope* OperatorWithKernel::TryTransferData( } auto* trans_var = new_scope->Var(var_name); + input_vars[i] = var; Tensor out; TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); SetTensorToVariable(*var, out, trans_var); } } + for (auto& var_name_item : Outputs()) { + std::vector& output_vars = ctx->outputs[var_name_item.first]; + output_vars.resize(var_name_item.second.size()); + + for (size_t i = 0; i < var_name_item.second.size(); ++i) { + auto& var_name = var_name_item.second[i]; + output_vars[i] = scope.FindVar(var_name); + } + } return new_scope; } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a6a28a5bc..438ae25398 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -70,6 +70,14 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var); class OperatorBase; class ExecutionContext; +class RuntimeContext { + public: + RuntimeContext() {} + + VariableValueMap inputs; + VariableValueMap outputs; +}; + /** * OperatorBase has the basic elements that Net will call to do computation. * Only CreateOperator from OpRegistry will new Operator directly. User @@ -129,7 +137,8 @@ class OperatorBase { void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } virtual void RuntimeInferShape(const Scope& scope, - const platform::Place& place) const {} + const platform::Place& place, + const RuntimeContext& ctx) const {} protected: std::string type_; @@ -350,8 +359,8 @@ class OperatorWithKernel : public OperatorBase { OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); } - void RuntimeInferShape(const Scope& scope, - const platform::Place& place) const override; + void RuntimeInferShape(const Scope& scope, const platform::Place& place, + const RuntimeContext& ctx) const override; protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; @@ -371,9 +380,10 @@ class OperatorWithKernel : public OperatorBase { * * * transfered_inplace_vars is a output vector. */ - Scope* TryTransferData( - const Scope& scope, const OpKernelType& expected_kernel_key, - std::vector* transfered_inplace_vars) const; + Scope* PrepareData(const Scope& scope, + const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars, + RuntimeContext* ctx) const; void TransferInplaceVarsBack(const Scope& scope, const std::vector& inplace_vars, diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 2de6233a9e..938e2024c3 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -28,8 +28,11 @@ class OperatorBase; class OpDesc; class InferShapeContext; class BlockDesc; +class Variable; using VariableNameMap = std::map>; +// TODO(panyx0718): Replace vector with something like gtl::Vector. +using VariableValueMap = std::map>; // The order should be as same as framework.proto using Attribute = From 840e6729e224d867386bdfc9ff12af4b71ee7188 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 17 Dec 2018 21:27:56 +0800 Subject: [PATCH 401/719] inject context test=develop --- paddle/fluid/framework/ngraph_operator.cc | 14 +------- paddle/fluid/framework/operator.cc | 36 +++++++++++-------- paddle/fluid/framework/operator.h | 9 +++-- .../fluid/operators/beam_search_decode_op.cc | 3 +- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index e37f0915c5..23f681ce88 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -278,19 +278,7 @@ std::shared_ptr NgraphEngine::backend_ = ngraph::runtime::Backend::create("CPU"); void NgraphEngine::GetNgInputShape(std::shared_ptr op) { - RuntimeContext ctx; - for (auto& var_name_item : op->Inputs()) { - std::vector input_vars = ctx.inputs[var_name_item.first]; - for (auto& var_name : var_name_item.second) { - input_vars.push_back(scope_.FindVar(var_name)); - } - } - for (auto& var_name_item : op->Outputs()) { - std::vector output_vars = ctx.outputs[var_name_item.first]; - for (auto& var_name : var_name_item.second) { - output_vars.push_back(scope_.FindVar(var_name)); - } - } + RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); op->RuntimeInferShape(scope_, place_, ctx); for (auto& var_name_item : op->Inputs()) { for (auto& var_name : var_name_item.second) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 79e3d29a63..461d357527 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -137,6 +137,23 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } } +RuntimeContext::RuntimeContext(const VariableNameMap& innames, + const VariableNameMap& outnames, + const Scope& scope) { + for (auto& var_name_item : innames) { + std::vector& input_vars = inputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + input_vars.push_back(scope.FindVar(var_name)); + } + } + for (auto& var_name_item : outnames) { + std::vector& output_vars = outputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + output_vars.push_back(scope.FindVar(var_name)); + } + } +} + void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(4) << place << " " << DebugStringEx(&scope); if (platform::is_gpu_place(place)) { @@ -704,6 +721,7 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { + RuntimeContext ctx(Inputs(), Outputs(), scope); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -717,15 +735,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, OpKernelMap& kernels = kernels_iter->second; - // TODO(dzhwinter) : kernel fallback mechanism will be added when all the - // transform functions are ready. - - // for (auto& candidate : kKernelPriority) { - // Do selection - // } - - auto expected_kernel_key = - this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); + auto expected_kernel_key = this->GetExpectedKernelType( + ExecutionContext(*this, scope, *dev_ctx, ctx)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -744,7 +755,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, KernelTypeToString(expected_kernel_key)); } - RuntimeContext ctx; // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; auto* transfer_scope = @@ -760,7 +770,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); this->InferShape(&infer_shape_ctx); - kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx)); if (!transfered_inplace_vars.empty()) { // there is inplace variable has been transfered. @@ -784,6 +794,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } } + void OperatorWithKernel::TransferInplaceVarsBack( const Scope& scope, const std::vector& inplace_vars, const Scope& transfer_scope) const { @@ -806,7 +817,6 @@ Scope* OperatorWithKernel::PrepareData( Scope* new_scope = nullptr; for (auto& var_name_item : Inputs()) { std::vector& input_vars = ctx->inputs[var_name_item.first]; - input_vars.resize(var_name_item.second.size()); for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; @@ -869,8 +879,6 @@ Scope* OperatorWithKernel::PrepareData( } for (auto& var_name_item : Outputs()) { std::vector& output_vars = ctx->outputs[var_name_item.first]; - output_vars.resize(var_name_item.second.size()); - for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; output_vars[i] = scope.FindVar(var_name); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 438ae25398..e359414d15 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -72,7 +72,8 @@ class ExecutionContext; class RuntimeContext { public: - RuntimeContext() {} + RuntimeContext(const VariableNameMap& innames, + const VariableNameMap& outnames, const Scope& scope); VariableValueMap inputs; VariableValueMap outputs; @@ -165,8 +166,9 @@ class OperatorBase { class ExecutionContext { public: ExecutionContext(const OperatorBase& op, const Scope& scope, - const platform::DeviceContext& device_context) - : op_(op), scope_(scope), device_context_(device_context) {} + const platform::DeviceContext& device_context, + const RuntimeContext& ctx) + : op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {} const OperatorBase& op() const { return op_; } @@ -295,6 +297,7 @@ class ExecutionContext { const OperatorBase& op_; const Scope& scope_; const platform::DeviceContext& device_context_; + const RuntimeContext& ctx_; }; template <> diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index ae9765b761..7f2bde55c9 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -122,7 +122,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& dev_ctx = *pool.Get(dev_place); - framework::ExecutionContext ctx(*this, scope, dev_ctx); + framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); + framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); const LoDTensorArray* ids = ctx.Input("Ids"); const LoDTensorArray* scores = ctx.Input("Scores"); From eaf8ba35b519b780629a7108d08ffd3895ac18fe Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 09:42:57 +0800 Subject: [PATCH 402/719] change input test=develop --- paddle/fluid/framework/operator.cc | 50 ++++++++++++++++++++++++++++++ paddle/fluid/framework/operator.h | 33 +++++++++++++++----- paddle/fluid/operators/prelu_op.cc | 2 +- 3 files changed, 76 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 461d357527..87f61f3afc 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -143,12 +143,14 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, for (auto& var_name_item : innames) { std::vector& input_vars = inputs[var_name_item.first]; for (auto& var_name : var_name_item.second) { + LOG(ERROR) << "first in " << var_name_item.first << ":" << var_name; input_vars.push_back(scope.FindVar(var_name)); } } for (auto& var_name_item : outnames) { std::vector& output_vars = outputs[var_name_item.first]; for (auto& var_name : var_name_item.second) { + LOG(ERROR) << "first out " << var_name_item.first << ":" << var_name; output_vars.push_back(scope.FindVar(var_name)); } } @@ -429,11 +431,52 @@ bool ExecutionContext::HasOutput(const std::string& name) const { return var != nullptr; } +const Variable* ExecutionContext::InputVar(const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's input %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? nullptr : it->second[0]; +} + +Variable* ExecutionContext::OutputVar(const std::string& name) const { + auto opt = op_.Output(name); + return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); +} + +const Variable* ExecutionContext::FastInputVar(const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's input %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? nullptr : it->second[0]; +} + +Variable* ExecutionContext::FastOutputVar(const std::string& name) const { + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's output %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? nullptr : it->second[0]; +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const { return Input(name); } +template <> +const Tensor* ExecutionContext::FastInput( + const std::string& name) const { + return FastInput(name); +} + template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const { @@ -458,6 +501,11 @@ Tensor* ExecutionContext::Output(const std::string& name) const { return Output(name); } +template <> +Tensor* ExecutionContext::FastOutput(const std::string& name) const { + return FastOutput(name); +} + template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const { @@ -822,6 +870,7 @@ Scope* OperatorWithKernel::PrepareData( auto& var_name = var_name_item.second[i]; auto* var = scope.FindVar(var_name); input_vars[i] = var; + LOG(ERROR) << "second in " << var_name_item.first << ":" << var_name; // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { @@ -882,6 +931,7 @@ Scope* OperatorWithKernel::PrepareData( for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; output_vars[i] = scope.FindVar(var_name); + LOG(ERROR) << "second out " << var_name_item.first << ":" << var_name; } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e359414d15..0aad91dbee 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -191,15 +191,9 @@ class ExecutionContext { return op_.Outputs(name).size(); } - const Variable* InputVar(const std::string& name) const { - auto ipt = op_.Input(name); - return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - } + const Variable* InputVar(const std::string& name) const; - Variable* OutputVar(const std::string& name) const { - auto opt = op_.Output(name); - return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); - } + Variable* OutputVar(const std::string& name) const; const std::vector MultiInputVar( const std::string& name) const { @@ -238,6 +232,22 @@ class ExecutionContext { return var == nullptr ? nullptr : var->GetMutable(); } + template + const T* FastInput(const std::string& name) const { + auto* var = FastInputVar(name); + return var == nullptr ? nullptr : &var->Get(); + } + + template + T* FastOutput(const std::string& name) const { + auto var = FastOutputVar(name); + return var == nullptr ? nullptr : var->GetMutable(); + } + + const Variable* FastInputVar(const std::string& name) const; + + Variable* FastOutputVar(const std::string& name) const; + template const std::vector MultiInput(const std::string& name) const { auto names = op_.Inputs(name); @@ -303,6 +313,10 @@ class ExecutionContext { template <> const Tensor* ExecutionContext::Input(const std::string& name) const; +template <> +const Tensor* ExecutionContext::FastInput( + const std::string& name) const; + template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const; @@ -310,6 +324,9 @@ const std::vector ExecutionContext::MultiInput( template <> Tensor* ExecutionContext::Output(const std::string& name) const; +template <> +Tensor* ExecutionContext::FastOutput(const std::string& name) const; + template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const; diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 62c55c4f55..b6155ed3dd 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -56,7 +56,7 @@ class PReluOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), + return framework::OpKernelType(ctx.FastInput("X")->type(), ctx.device_context()); } }; From cbc7208399e980687e7bd51102d3e84907353fba Mon Sep 17 00:00:00 2001 From: heqiaozhi Date: Tue, 18 Dec 2018 10:15:07 +0800 Subject: [PATCH 403/719] fix doc test=develop --- python/paddle/fluid/async_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 3181654feb..4ca6a5170e 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -301,7 +301,7 @@ class AsyncExecutor(object): save_model command that can be invoked from one of the worker model parameters are saved in servers and upload to save_path of file system Args: - save_path(str): path to file system + save_path(str): save path to file system """ if self.instance is None: raise ValueError( From c631412eab5371cd72c9a7d2f9830870eb953e46 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 17 Dec 2018 22:02:15 +0800 Subject: [PATCH 404/719] fix gc bug test=develop --- paddle/fluid/framework/executor.cc | 2 +- .../tests/unittests/test_eager_deletion_dynamic_rnn_base.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 8c3912120b..da9556c6c1 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -100,7 +100,7 @@ static void DeleteUnusedTensors( continue; } auto* var = scope.FindVar(name); - if (var != nullptr) { + if (var == nullptr) { continue; } diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py index e91cfe0b45..89476ee641 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py @@ -39,6 +39,7 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): label = fluid.layers.data(name="label", shape=[1], dtype="int64") cost = network(data, label, len(word_dict)) + cost.persistable = True optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) optimizer.minimize(cost) From 06936a2ff59ba67f6be0526bf97c26a3cf036b18 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 18 Dec 2018 11:16:14 +0800 Subject: [PATCH 405/719] fix 1gpu test=develop --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 3 ++- paddle/fluid/framework/details/parallel_ssa_graph_executor.cc | 2 +- paddle/fluid/framework/parallel_executor.cc | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 6bca299813..4a0347d07a 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -51,7 +51,8 @@ void AllReduceOpHandle::RunImpl() { // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, // this is a distributed or inter-process call, find a better way. #ifdef PADDLE_WITH_CUDA - // Find NCCL ID from the global scope. + // All-reduce op_handle can run on the sub-scope, find the nccl id from + // the global scope. if (NoDummyInputSize() == 1 && local_scopes_[0]->FindVar(NCCL_ID_VARNAME) == nullptr) { #else diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 845c4379e6..2377f2c963 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -59,7 +59,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run( if (pool_) { run_futures.emplace_back(pool_->enqueue(std::move(call))); } else { - call(); + fetch_datas.emplace_back(std::move(call())); } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 152b9b2702..0042ccaa4f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -231,7 +231,7 @@ ParallelExecutor::ParallelExecutor( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); ncclUniqueId *nccl_id = nullptr; - if (build_strategy.enable_parallel_graph_) { + if (build_strategy.enable_parallel_graph_ && places.size() > 1) { // parallel graph mode should initialize nccl by ncclCommInitRank since // it call nccl operator per device per thread. if (nccl_id_var == nullptr) { From 5a6d7fe2ff6c946b2d9fe7816a9ee8a321c1b9fa Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 11:36:42 +0800 Subject: [PATCH 406/719] add mkl,ctc support for windows --- CMakeLists.txt | 12 +-- cmake/cuda.cmake | 3 + cmake/cudnn.cmake | 1 + cmake/external/cub.cmake | 2 +- cmake/external/dlpack.cmake | 2 +- cmake/external/mkldnn.cmake | 43 +++++++--- cmake/external/mklml.cmake | 83 +++++++++++-------- cmake/external/python.cmake | 8 +- cmake/external/warpctc.cmake | 30 +++++-- cmake/external/xbyak.cmake | 4 +- cmake/generic.cmake | 6 +- cmake/inference_lib.cmake | 16 ++-- cmake/operators.cmake | 2 +- cmake/simd.cmake | 73 ++++++++-------- paddle/fluid/framework/CMakeLists.txt | 3 +- .../framework/details/all_reduce_op_handle.cc | 2 +- paddle/fluid/framework/mixed_vector.h | 10 +-- paddle/fluid/framework/op_registry.h | 3 +- .../inference/api/demo_ci/CMakeLists.txt | 15 +++- .../fluid/memory/detail/system_allocator.cc | 1 - paddle/fluid/operators/CMakeLists.txt | 7 +- paddle/fluid/operators/cum_op.h | 2 + .../elementwise/elementwise_mul_mkldnn_op.cc | 3 + .../operators/math/detail/lstm_cpu_kernel.h | 6 ++ paddle/fluid/operators/math/jit_gen.h | 3 + paddle/fluid/platform/cpu_info.cc | 7 +- paddle/fluid/platform/dynload/CMakeLists.txt | 2 - paddle/fluid/platform/dynload/cudnn.cc | 4 + paddle/fluid/platform/dynload/cudnn.h | 2 +- .../fluid/platform/dynload/dynamic_loader.cc | 16 ++++ .../fluid/platform/dynload/dynamic_loader.h | 6 ++ paddle/fluid/platform/dynload/mklml.h | 2 +- paddle/fluid/platform/dynload/tensorrt.h | 2 +- paddle/fluid/platform/dynload/warpctc.h | 2 +- paddle/fluid/platform/port.h | 5 +- paddle/fluid/train/demo/CMakeLists.txt | 18 +++- python/CMakeLists.txt | 16 ++-- python/paddle/fluid/__init__.py | 9 +- python/paddle/fluid/framework.py | 18 ++-- python/setup.py.in | 38 +++++---- 40 files changed, 315 insertions(+), 172 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 653ae4ffe5..efdb451f65 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,16 +125,12 @@ if(ANDROID OR IOS) add_definitions(-DPADDLE_MOBILE_INFERENCE) endif() -if (APPLE OR WIN32) +if (APPLE) set(WITH_MKL OFF CACHE STRING - "Disable MKL for building on mac and windows" FORCE) + "Disable MKL for building on mac" FORCE) endif() if (WIN32) - set(WITH_DSO OFF CACHE STRING - "Disable DSO when compiling for Windows" FORCE) - set(WITH_MKL OFF CACHE STRING - "Disable MKL when compiling for Windows" FORCE) set(WITH_DISTRIBUTE OFF CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) set(WITH_C_API OFF CACHE STRING @@ -207,10 +203,10 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream +include(external/warpctc) # download, build, install warpctc if (NOT WIN32) -# there is no official support of warpctc, nccl, cupti in windows -include(external/warpctc) # download, build, install warpctc +# there is no official support of nccl, cupti in windows include(cupti) include(external/gzstream) endif (NOT WIN32) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 414e92eb27..5be7be6413 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -139,10 +139,12 @@ endfunction() message(STATUS "CUDA detected: " ${CUDA_VERSION}) if (${CUDA_VERSION} LESS 7.0) set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) + add_definitions("-DPADDLE_CUDA_BINVER=\"60\"") elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"70\"") elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") @@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x # CUDA 8 may complain that sm_20 is no longer supported. Suppress the # warning for now. list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") + add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") endif() include_directories(${CUDA_INCLUDE_DIRS}) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 09bec347db..96a9917e76 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -89,6 +89,7 @@ if(CUDNN_FOUND) if(NOT CUDNN_MAJOR_VERSION) set(CUDNN_VERSION "???") else() + add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"") math(EXPR CUDNN_VERSION "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake index c94849cf4b..f06728de91 100644 --- a/cmake/external/cub.cmake +++ b/cmake/external/cub.cmake @@ -32,4 +32,4 @@ endif() add_dependencies(cub extern_cub) -LIST(APPEND externl_project_dependencies cub) +LIST(APPEND external_project_dependencies cub) diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index 94d8fcc668..4587475d79 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -28,4 +28,4 @@ endif() add_dependencies(dlpack extern_dlpack) -LIST(APPEND externl_project_dependencies dlpack) +LIST(APPEND external_project_dependencies dlpack) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index b280db23b9..c29375cd05 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -23,15 +23,14 @@ SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) -IF(WIN32 OR APPLE) +IF(APPLE) MESSAGE(WARNING - "Windows or Mac is not supported with MKLDNN in Paddle yet." + "Mac is not supported with MKLDNN in Paddle yet." "Force WITH_MKLDNN=OFF") - SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE) + SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in MacOS" FORCE) return() ENDIF() -SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") @@ -44,10 +43,14 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML") ELSE() MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") ENDIF() -SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds") -SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") -SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") -SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") + +IF(NOT WIN32) + SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds") + SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") + SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") + SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") +ENDIF(NOT WIN32) + ExternalProject_Add( ${MKLDNN_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} @@ -58,8 +61,15 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + CMAKE_ARGS -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DMKLROOT=${MKLML_ROOT} CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} @@ -67,6 +77,11 @@ ExternalProject_Add( CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} -DMKLROOT:PATH=${MKLML_ROOT} ) +if(WIN32) + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE) +else(WIN32) + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) +endif(WIN32) ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) @@ -85,10 +100,14 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) # copy the real so.0 lib to install dir # it can be directly contained in wheel or capi -SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) -ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} - COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} - DEPENDS mkldnn) +if(WIN32) + SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/lib/mkldnn.dll) +else(WIN32) + SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) + ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} + DEPENDS mkldnn) +endif(WIN32) ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) IF(WITH_C_API) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index dc5427acd4..3da552e319 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -16,59 +16,76 @@ IF(NOT ${WITH_MKLML}) return() ENDIF(NOT ${WITH_MKLML}) -IF(WIN32 OR APPLE) +IF(APPLE) MESSAGE(WARNING - "Windows or Mac is not supported with MKLML in Paddle yet." + "Mac is not supported with MKLML in Paddle yet." "Force WITH_MKLML=OFF") SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE) return() ENDIF() INCLUDE(ExternalProject) - -SET(MKLML_PROJECT "extern_mklml") -IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) - SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) -ENDIF() -MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") -SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") -SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") SET(MKLML_DST_DIR "mklml") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) SET(MKLML_ROOT ${MKLML_INSTALL_DIR}) SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) -SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) -SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) +if(WIN32) + SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) + SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) + SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) + SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) +else() + SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) + SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) + SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) + SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) +endif() SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") -INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) +if(WIN32) + MESSAGE(WARNING + "Please download the MKLML and and put it at " ${THIRD_PARTY_PATH}/install/mklml) +else() + SET(MKLML_PROJECT "extern_mklml") + IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) + ENDIF() + MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") + SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") + SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") -FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(MKLML)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n" - " DESTINATION ${MKLML_DST_DIR})\n") + FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(MKLML)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n" + " DESTINATION ${MKLML_DST_DIR})\n") -ExternalProject_Add( - ${MKLML_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${MKLML_SOURCE_DIR} - DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz - && tar zxf ${MKLML_VER}.tgz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} -) + ExternalProject_Add( + ${MKLML_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${MKLML_SOURCE_DIR} + DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz + && tar zxf ${MKLML_VER}.tgz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} + ) +endif() + + +INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) -ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) +if(NOT WIN32) + ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) +endif() LIST(APPEND external_project_dependencies mklml) IF(WITH_C_API) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index a3599dd798..edfb655541 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -23,9 +23,12 @@ FIND_PACKAGE(PythonLibs ${PY_VERSION}) if(WIN32) execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" -"from distutils import sysconfig as s;import sys;import struct; +"from distutils import sysconfig as s;import sys;import struct;import sysconfig; print(sys.prefix); print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); +print(sysconfig.get_platform()); +print(sysconfig.get_config_var('py_version_nodot')); +print(sysconfig.get_config_var('SOABI')); " RESULT_VARIABLE _PYTHON_SUCCESS OUTPUT_VARIABLE _PYTHON_VALUES @@ -41,6 +44,9 @@ print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) list(GET _PYTHON_VALUES 0 PYTHON_PREFIX) list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX) + list(GET _PYTHON_VALUES 2 SYS_PLATFORM) + list(GET _PYTHON_VALUES 3 PYTHON_SHORT_VERSION_NODOT) + list(GET _PYTHON_VALUES 4 PYTHON_SOABI) # Make sure all directory separators are '/' string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 07e1137e16..7b937c93fe 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" # Used in unit test test_WarpCTCLayer SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE) -SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" - CACHE FILEPATH "Warp-ctc Library" FORCE) -IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) +IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32) SET(USE_OMP OFF) ELSE() SET(USE_OMP ON) ENDIF() +IF(WIN32) + SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git") +ELSE() + SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git") +ENDIF() + ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git" + GIT_REPOSITORY ${WARPCTC_REPOSITORY} PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} -DWITH_GPU=${WITH_GPU} -DWITH_OMP=${USE_OMP} @@ -59,6 +67,18 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) +IF(WIN32) + IF(NOT EXISTS "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}") + add_custom_command(TARGET extern_warpctc POST_BUILD + COMMAND cmake -E copy ${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} ${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} + ) + ENDIF() + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +else(WIN32) + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +ENDIF(WIN32) MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers. diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake index 384c2f9328..42e39fb813 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -13,8 +13,8 @@ # limitations under the License. set(WITH_XBYAK ON) -if(WIN32 OR APPLE) - SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) +if(APPLE) + SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in MacOS" FORCE) return() endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index a8b9dcfcf5..c6fe2e970d 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -267,7 +267,11 @@ function(cc_library TARGET_NAME) list(APPEND cc_library_DEPS dynload_mklml) endif() add_dependencies(${TARGET_NAME} mklml) - target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") + if(WIN32) + target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB}) + else(WIN32) + target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") + endif(WIN32) endif() # remove link to python, see notes at: # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 5aa7a8a752..a5b70b3c33 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -115,20 +115,20 @@ if (NOT PROTOBUF_FOUND OR WIN32) ) endif () -if (NOT CBLAS_FOUND) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") - copy(openblas_lib - SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include - DSTS ${dst_dir} ${dst_dir} - DEPS extern_openblas - ) -elseif (WITH_MKLML) +if (WITH_MKLML) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml") copy(mklml_lib SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} DEPS mklml ) +elseif (NOT CBLAS_FOUND OR WIN32) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") + copy(openblas_lib + SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include + DSTS ${dst_dir} ${dst_dir} + DEPS extern_openblas + ) endif () if (WITH_MKLDNN) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 2ced43f9e6..70d159b4f3 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,7 +84,7 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 86096d4fea..566dc75fda 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,46 +57,43 @@ int main() return 0; }" SSE3_FOUND) -# disable AVX by default on windows -if(NOT WIN32) - # Check AVX - set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) - set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; - }" AVX_FOUND) +# Check AVX +set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) +set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; +}" AVX_FOUND) - # Check AVX 2 - set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) - set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; - }" AVX2_FOUND) +# Check AVX 2 +set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) +set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; +}" AVX2_FOUND) - # Check AVX512F - set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) - set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; - }" AVX512F_FOUND) -endif(NOT WIN32) +# Check AVX512F +set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) +set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; +}" AVX512F_FOUND) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 225dfb3e70..90083f690f 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -15,8 +15,7 @@ function(windows_symbolic TARGET) file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) add_custom_command(OUTPUT ${final_path}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" COMMENT "create hidden file of ${src}.cu") add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 9eaff1f560..de7c845884 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -50,7 +50,7 @@ void AllReduceOpHandle::RunImpl() { // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, // this is a distributed or inter-process call, find a better way. -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (NoDummyInputSize() == 1 && local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) { #else diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 6940250c3f..c3a044d22c 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -215,8 +215,8 @@ class Vector { auto stream = dev_ctx->stream(); void *src = gpu_->ptr(); void *dst = cpu_.data(); - memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, - gpu_->size(), stream); + paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_->size(), stream); dev_ctx->Wait(); } @@ -261,8 +261,8 @@ class Vector { auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); - memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, - gpu_->size(), stream); + paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_->size(), stream); } void ImmutableCPU() const { @@ -284,7 +284,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable memory::AllocationPtr gpu_; + mutable paddle::memory::AllocationPtr gpu_; mutable int flag_; mutable std::mutex mtx_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 6d39bb3c52..2c1648c81f 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,7 +23,8 @@ limitations under the License. */ #include #include -#include "glog/logging.h" // For VLOG() +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 8d0d96d391..f42ee9a697 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -89,12 +89,21 @@ endif() if(WITH_MKL) include_directories("${PADDLE_LIB}/third_party/install/mklml/include") - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} - ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + if(NOT WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + else(WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif(WIN32) set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") if(EXISTS ${MKLDNN_PATH}) include_directories("${MKLDNN_PATH}/include") - set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + if(WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) + else(WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif(WIN32) endif() else() set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 3e8fb83e9d..307c348822 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -17,7 +17,6 @@ limitations under the License. */ #ifdef _WIN32 #include -#include // VirtualLock/VirtualUnlock #else #include // for mlock and munlock #endif diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 257bfc0a3f..95ad67e33e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -44,9 +44,8 @@ endif() register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) - # warpctc_op needs cudnn 7 above -if (WITH_GPU AND NOT WIN32) +if (WITH_GPU) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() @@ -64,9 +63,7 @@ endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) -if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) -endif() +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h index 999fdcff90..7c0fda4169 100644 --- a/paddle/fluid/operators/cum_op.h +++ b/paddle/fluid/operators/cum_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index c600d1e3d7..bf9aef9135 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/operators/math/jit_kernel.h" +#if defined(_WIN32) && defined(_WINSOCKAPI_) +#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ +#endif #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index ccbd05c82a..2e3779ff08 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -17,6 +17,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" +#if defined(_WIN32) +#if defined(__AVX2__) || defined(__AVX__) +inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } +#endif +#endif + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/jit_gen.h b/paddle/fluid/operators/math/jit_gen.h index 6abf3434cc..2bc740e598 100644 --- a/paddle/fluid/operators/math/jit_gen.h +++ b/paddle/fluid/operators/math/jit_gen.h @@ -18,6 +18,9 @@ limitations under the License. */ #include #include "paddle/fluid/platform/macros.h" +#if defined(_WIN32) && defined(_WINSOCKAPI_) +#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ +#endif #define XBYAK_USE_MMAP_ALLOCATOR #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index f9a32bfa4c..1642c17809 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -14,6 +14,10 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_info.h" +#if defined(_WIN32) +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#endif + #ifdef PADDLE_WITH_XBYAK #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" @@ -22,9 +26,8 @@ limitations under the License. */ #ifdef __APPLE__ #include #include - #elif defined(_WIN32) -#define NOMINMAX // msvc max/min macro conflict with std::min/max +#define WIN32_LEAN_AND_MEAN #include #else #include diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 5939c500c9..07159d4a12 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -16,9 +16,7 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) -if (NOT WIN32) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) -endif(NOT WIN32) if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index f3cd3b2bbe..91d9a1ef01 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_R6 +CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); +#endif + #ifdef CUDNN_DNN_ROUTINE_EACH_R7 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); #endif diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 550fe2edee..2f4f8101e4 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ using cudnn_func = decltype(&::__name); \ std::call_once(cudnn_dso_flag, []() { \ cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \ diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index cc5cda6106..15d5168366 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -53,6 +53,12 @@ namespace platform { namespace dynload { static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; +#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) +static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll"; +static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll"; +static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll"; +#endif + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -165,6 +171,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, void* GetCublasDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif @@ -173,6 +181,8 @@ void* GetCublasDsoHandle() { void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); #endif @@ -193,6 +203,8 @@ void* GetCUPTIDsoHandle() { void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif @@ -201,6 +213,8 @@ void* GetCurandDsoHandle() { void* GetWarpCTCDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so"); #endif @@ -225,6 +239,8 @@ void* GetTensorRtDsoHandle() { void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); #endif diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 84fd2ce998..edb4c649ad 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -18,6 +18,12 @@ namespace paddle { namespace platform { namespace dynload { +#ifndef _WIN32 +#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__)) +#else +#define DECLARE_TYPE(__name, ...) decltype(auto) +#endif + void* GetCublasDsoHandle(); void* GetCUDNNDsoHandle(); void* GetCUPTIDsoHandle(); diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index f0a9736623..944b00bae1 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -34,7 +34,7 @@ extern void* mklml_dso_handle; #define DYNAMIC_LOAD_MKLML_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ using mklmlFunc = decltype(&::__name); \ std::call_once(mklml_dso_flag, []() { \ mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \ diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index 5d67658b94..751aa54b1a 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -33,7 +33,7 @@ extern void* tensorrt_dso_handle; #define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ using tensorrt_func = decltype(__name(args...)) (*)(Args...); \ std::call_once(tensorrt_dso_flag, []() { \ tensorrt_dso_handle = \ diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h index 18ed9956f1..bc1977b05d 100644 --- a/paddle/fluid/platform/dynload/warpctc.h +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -34,7 +34,7 @@ extern void* warpctc_dso_handle; #define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ using warpctcFunc = decltype(&::__name); \ std::call_once(warpctc_dso_flag, []() { \ warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \ diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index ad070171df..41388d8959 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -37,6 +37,10 @@ #define GOOGLE_GLOG_DLL_DECL #include // _popen, _pclose #include +#ifdef _WINSOCKAPI_ +/* Prevent inclusion of winsock.h in windows.h */ +#define WIN32_LEAN_AND_MEAN +#endif #include #include // std::accumulate in msvc #ifndef S_ISDIR // windows port for sys/stat.h @@ -55,7 +59,6 @@ static void *dlsym(void *handle, const char *symbol_name) { static void *dlopen(const char *filename, int flag) { std::string file_name(filename); - file_name.replace(0, file_name.size() - 1, '/', '\\'); HMODULE hModule = LoadLibrary(file_name.c_str()); if (!hModule) { throw std::runtime_error(file_name + " not found."); diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt index eabb51d370..af033fa740 100644 --- a/paddle/fluid/train/demo/CMakeLists.txt +++ b/paddle/fluid/train/demo/CMakeLists.txt @@ -35,16 +35,26 @@ add_executable(demo_trainer demo_trainer.cc) if(WITH_MKLDNN) include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include") - set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0) -endif() + if(WIN32) + set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.lib) + else(WIN32) + set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0) + endif(WIN32) +endif(WITH_MKLDNN) if(WITH_MKL) include_directories("${PADDLE_LIB}/third_party/install/mklml/include") - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so) + if(WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib) + else(WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so) + endif(WIN32) else() if(APPLE) set(MATH_LIB cblas) - else(APPLE) + elseif(WIN32) + set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.lib) + else() set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a) endif(APPLE) endif() diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 139176b0d6..078d543ba2 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -48,12 +48,18 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in IF(WIN32) # Python would use the .pyd by default under Windows series platform set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/) - get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY) set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd) - add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR} - DEPENDS paddle_pybind) + if(NOT WITH_MKLDNN) + get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY) + add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR} + DEPENDS paddle_pybind) + else(NOT WITH_MKLDNN) + add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + DEPENDS paddle_pybind) + endif(NOT WITH_MKLDNN) ELSE() set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) add_custom_command(OUTPUT ${FLUID_CORE} diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2dea71d7af..fd788d0929 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -102,6 +102,12 @@ def __bootstrap__(): import sys import os import platform + + if os.name == 'nt': + third_lib_path = os.path.abspath(os.path.dirname(__file__)) + os.sep + '..' + os.sep + 'libs' + os.environ['path'] += ';' + third_lib_path + sys.path.append(third_lib_path) + from . import core in_test = 'unittest' in sys.modules @@ -128,13 +134,12 @@ def __bootstrap__(): 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir', 'pe_profile_fname' + 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') if os.name != 'nt': - read_env_flags.append('warpctc_dir') read_env_flags.append('cpu_deterministic') if core.is_compiled_with_dist(): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 0897920594..da74fd41fc 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -16,6 +16,7 @@ from __future__ import print_function import collections import contextlib +import os import re import six import sys @@ -27,11 +28,18 @@ from .proto import framework_pb2 try: from . import core except ImportError as e: - raise ImportError( - """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" - if you encounters \"libmkldnn.so not found\" errors. If you have python - installed in other directory, replace \"/usr/local/lib\" with your own - directory. The original error is: \n""" + cpt.get_exception_message(e)) + if os.name == 'nt': + raise ImportError( + """NOTE: You may need to run \"set PATH=c:\python27\lib:%PATH%\" + if you encounters \"mkldnn.dll not found\" errors. If you have python + installed in other directory, replace \"c:\python27\lib" with your own + directory. The original error is: \n""" + cpt.get_exception_message(e)) + else: + raise ImportError( + """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" + if you encounters \"libmkldnn.so not found\" errors. If you have python + installed in other directory, replace \"/usr/local/lib\" with your own + directory. The original error is: \n""" + cpt.get_exception_message(e)) except Exception as e: raise e from . import unique_name diff --git a/python/setup.py.in b/python/setup.py.in index 6562046641..f4613dd72d 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -158,27 +158,29 @@ if '${WITH_FLUID_ONLY}'== 'OFF': # put all thirdparty libraries in paddle.libs libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' -if os.name != 'nt': - package_data['paddle.libs']= [] - package_data['paddle.libs']=['libwarpctc' + ext_name] - shutil.copy('${WARPCTC_LIBRARIES}', libs_path) + +package_data['paddle.libs']= [] +package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name] +shutil.copy('${WARPCTC_LIBRARIES}', libs_path) + if '${WITH_MKL}' == 'ON': - shutil.copy('${MKLML_LIB}', libs_path) - shutil.copy('${MKLML_IOMP_LIB}', libs_path) - package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name] + shutil.copy('${MKLML_SHARED_LIB}', libs_path) + shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) + package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name] if '${WITH_MKLDNN}' == 'ON': if '${CMAKE_BUILD_TYPE}' == 'Release': - # only change rpath in Release mode. - # TODO(typhoonzero): use install_name_tool to patch mkl libs once - # we can support mkl on mac. - # - # change rpath of libmkldnn.so.0, add $ORIGIN/ to it. - # The reason is that all thirdparty libraries in the same directory, - # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so. - command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" - if os.system(command) != 0: - raise Exception("patch libmkldnn.so failed, command: %s" % command) - package_data['paddle.libs']+=['libmkldnn.so.0'] + if os.name != 'nt': + # only change rpath in Release mode. + # TODO(typhoonzero): use install_name_tool to patch mkl libs once + # we can support mkl on mac. + # + # change rpath of libmkldnn.so.0, add $ORIGIN/ to it. + # The reason is that all thirdparty libraries in the same directory, + # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so. + command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" + if os.system(command) != 0: + raise Exception("patch libmkldnn.so failed, command: %s" % command) + package_data['paddle.libs']+=['libmkldnn.so.0' if os.name != 'nt' else ('mkldnn' + ext_name)] shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) if '${WITH_NGRAPH}' == 'ON': # only change rpath in Release mode, From 2f55a04ec64e32adb1ca9da53b137c25ebd51c1c Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Tue, 18 Dec 2018 11:51:35 +0800 Subject: [PATCH 407/719] add refer result comparasion test=develop --- paddle/fluid/inference/tests/api/CMakeLists.txt | 13 ++++++++++--- .../inference/tests/api/analyzer_vis_tester.cc | 16 ++++++++++++++++ paddle/fluid/inference/tests/api/tester_helper.h | 1 + 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 5862fedb9a..46ce61b736 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -30,6 +30,13 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename ARGS --infer_model=${install_dir}/model) endfunction() +function(inference_analysis_api_test_with_refer_result target install_dir filename) + inference_analysis_test(${target} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt + --refer_result=${install_dir}/result.txt) +endfunction() + # RNN1 if(NOT APPLE AND WITH_MKLML) set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") @@ -83,14 +90,14 @@ set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}) inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") endif() -inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) +inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) -# mobilenet with transpose +# mobilenet with transpose op set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz") endif() -inference_analysis_api_test(test_analyzer_mobilenet ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) +inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 4700afdc86..a8f7d5c446 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -93,6 +93,22 @@ void profile(bool use_mkldnn = false) { SetInput(&input_slots_all); TestPrediction(reinterpret_cast(&cfg), input_slots_all, &outputs, FLAGS_num_threads); + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + std::string line; + std::ifstream file(FLAGS_refer_result); + std::getline(file, line); + auto refer = ProcessALine(line); + file.close(); + + auto &output = outputs.front(); + size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); + CHECK_EQ(numel, refer.data.size()); + for (size_t i = 0; i < numel; ++i) { + CHECK_LT( + fabs(static_cast(output.data.data())[i] - refer.data[i]), + 1e-5); + } + } } TEST(Analyzer_vis, profile) { profile(); } diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 4c8bce4600..b07949c196 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -36,6 +36,7 @@ DEFINE_string(model_name, "", "model name"); DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_data, "", "data file"); +DEFINE_string(refer_result, "", "reference result for comparison"); DEFINE_int32(batch_size, 1, "batch size."); DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); From acd4b759233b44b7ef52f65d21fff07a56f681ab Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 18 Dec 2018 12:25:25 +0800 Subject: [PATCH 408/719] skip_opt_set support list (#14845) * test=develop * fix tests. test=develop --- .../fluid/transpiler/memory_optimization_transpiler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 95aafec053..5a7d04ed19 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -484,8 +484,11 @@ def memory_optimize(input_program, if level != 0 and level != 1: raise ValueError("only support opt_level 0 or 1.") - if skip_opt_set is not None and not isinstance(skip_opt_set, set): - raise ValueError("only support skip_opt_set as set.") + if skip_opt_set is not None: + if isinstance(skip_opt_set, set) or isinstance(skip_opt_set, list): + skip_opt_set = set(skip_opt_set) + else: + raise ValueError("only support skip_opt_set as set.") global PRINT_LOG PRINT_LOG = print_log if skip_grads: From fb8ae30331f42b6b9ef67c80e0ccb3fffcbf9836 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 12:35:45 +0800 Subject: [PATCH 409/719] fix test=develop --- paddle/fluid/framework/operator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 87f61f3afc..807667e684 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -919,7 +919,7 @@ Scope* OperatorWithKernel::PrepareData( } auto* trans_var = new_scope->Var(var_name); - input_vars[i] = var; + input_vars[i] = trans_var; Tensor out; TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); From 7cd24b13182bcdcbdb455a430d54d70172e73a59 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 18 Dec 2018 13:15:29 +0800 Subject: [PATCH 410/719] add ir memory optimize. (#14530) * follow comments. test=develop * Fix typo * fix compile error. test=develop * merge develop branch. test=develop * Remove set_equal * Polish code * Delete unused functions test=develop * polish code. test=develop * follow comment * polish code. * fix windows compile error. test=develop * fix op handle. * rerun ci. test=develop * rerun ci. test=develop * rerun macci. test=develop * polish code. test=develop * rewrite sort code. test=develop * remove unused code. test=develop * fix tests. test=develop * fix conflict. test=develop * follow comment. test=develop * merge develop branch. test=develop * fix tests. test=develop * remove ToTypeIndex. test=develop * rerun ci. test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 14 +- .../framework/details/analysis_var_pass.cc | 656 ++++++++++++++++++ .../framework/details/analysis_var_pass.h | 120 ++++ .../details/analysis_var_pass_test.cc | 470 +++++++++++++ .../fluid/framework/details/build_strategy.cc | 34 +- .../fluid/framework/details/build_strategy.h | 11 + .../details/early_delete_op_handle.h | 140 ++++ .../details/memory_early_delete_pass.cc | 117 ++++ .../details/memory_early_delete_pass.h | 32 + .../framework/details/memory_reuse_types.cc | 155 +++++ .../framework/details/memory_reuse_types.h | 87 +++ .../details/memory_reuse_types_test.cc | 99 +++ .../details/multi_devices_graph_print_pass.cc | 3 +- .../details/multi_devices_graph_print_pass.h | 5 +- .../fluid/framework/details/op_handle_base.h | 2 +- paddle/fluid/framework/ir/graph.cc | 5 +- paddle/fluid/framework/ir/graph_helper.cc | 3 +- paddle/fluid/framework/ir/graph_helper.h | 1 + paddle/fluid/framework/ir/node.cc | 8 + paddle/fluid/framework/ir/node.h | 5 +- paddle/fluid/framework/parallel_executor.cc | 19 +- paddle/fluid/framework/tensor_test.cc | 16 + paddle/fluid/pybind/pybind.cc | 8 + python/paddle/fluid/__init__.py | 2 +- .../unittests/parallel_executor_test_base.py | 2 + .../unittests/test_ir_memory_optimize_pass.py | 123 ++++ .../memory_optimization_transpiler.py | 61 +- 27 files changed, 2169 insertions(+), 29 deletions(-) create mode 100644 paddle/fluid/framework/details/analysis_var_pass.cc create mode 100644 paddle/fluid/framework/details/analysis_var_pass.h create mode 100644 paddle/fluid/framework/details/analysis_var_pass_test.cc create mode 100644 paddle/fluid/framework/details/early_delete_op_handle.h create mode 100644 paddle/fluid/framework/details/memory_early_delete_pass.cc create mode 100644 paddle/fluid/framework/details/memory_early_delete_pass.h create mode 100644 paddle/fluid/framework/details/memory_reuse_types.cc create mode 100644 paddle/fluid/framework/details/memory_reuse_types.h create mode 100644 paddle/fluid/framework/details/memory_reuse_types_test.cc create mode 100644 python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 97f7713d97..63a68ba3a5 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -50,8 +50,10 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) +cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) - +cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle + all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) @@ -63,7 +65,12 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass) +if (WITH_GPU) + list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) +endif() +cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph) +cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) @@ -84,4 +91,5 @@ cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fuse cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass - fuse_elewise_add_act_pass multi_batch_merge_pass) + fuse_elewise_add_act_pass multi_batch_merge_pass + memory_optimize_pass) diff --git a/paddle/fluid/framework/details/analysis_var_pass.cc b/paddle/fluid/framework/details/analysis_var_pass.cc new file mode 100644 index 0000000000..223b9da3cf --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass.cc @@ -0,0 +1,656 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +DEFINE_bool(enable_subgraph_optimize, false, + "SubGraph also reuse global graph variables, it will reduce the " + "memory occupation" + "but a higher risk of memory reuse error. default disabled."); +DEFINE_string(memory_optimize_debug, "", + "debug the operator output variable when do the variable reuse." + "memory reuse pass." + "only for debug, default disabled."); + +namespace paddle { +namespace framework { +namespace details { + +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + +template +class FilterVariableImpl { + public: + void operator()(const Container& nodes, Callback callback) { + for (auto* node : nodes) { + callback(node); + } + } +}; + +// filter var node for op->inputs/outputs +template +class FilterVariableImpl, Callback> { + public: + void operator()(const std::vector& nodes, Callback callback) { + for (auto* var : nodes) { + if (var->IsVar() && !var->IsCtrlVar()) { + callback(var); + } + } + } +}; + +template +void FilterVariables(const Container& nodes, Callback callback) { + FilterVariableImpl()(nodes, callback); +} + +std::unique_ptr AnalysisVarPass::ApplyImpl( + std::unique_ptr graph) const { + auto nodes = graph->Nodes(); + auto subblock_vars = GetSubBlockVars(nodes); + skip_set_.insert(subblock_vars.begin(), subblock_vars.end()); + + cfg_.reset(new details::ControlFlowGraph(*graph)); + cfg_->LiveVariableAnalysis(); + InitSSAGraphNodes(); + + int reuse_id = 0; + for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) { + auto& op = cfg_->Ops()[idx]; + auto* op_desc = op->Op(); + // some op in graph has no op desc + if (op_desc == nullptr) continue; + if (OpHasSubBlock(op_desc)) { + if (FLAGS_enable_subgraph_optimize) { + SubGraphOptimize(op_desc); + } else { + VLOG(3) << op->Name() + << " has subblock, but disable subgraph optimize. skipped."; + continue; + } + } + + for (auto& var : op->outputs) { + if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { + ir::Node* cache = pool_.NodeMatch(var); + if (var->Name() == FLAGS_memory_optimize_debug) { + VLOG(3) << "start match var " << DebugString(var) << " of op " + << op->Name(); + VLOG(3) << pool_.ToString(); + VLOG(3) << "matched in pool : " + << ((cache == nullptr) ? "False" : "True"); + } + if (cache != nullptr) { + if (var->Name() == cache->Name()) { + VLOG(3) << "The same cache variable is cascade reused." + << var->Name() << " is re-filled to the pool after" + << "the reused op is finished. Current op can not " + << "replace it again. Skip this candidate."; + continue; + } + + int node_idx_in_pool = pool_.GetIndex(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(reuse_id++), DebugString(var), DebugString(cache), + node_idx_in_pool, static_cast(pool_.size())); + // update CFG Graph on the fly. + // reused var maybe re-fill into the pool + cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); + // NOTE(dzhwinter): we need to both update the ProgramDesc + // and IR Graph. because op_desc/var_desc is used in CreateOp, + // CreateVar when running happens. But IR Graph + // define the dependence relationship between nodes. + RenameVarInGraphDesc(var->Name(), cache->Name(), idx); + RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get()); + + pool_.Erase(cache); + } + } + } + // fill the pool + for (auto var : cfg_->LiveIn(op)) { + if (cfg_->LiveOut(op).count(var) == 0) { + ir::Node* var_node = cfg_->GetNodeFromVarName(var, op); + if (var_node == nullptr) continue; + if (NodeCanReused(var_node) && !pool_.Has(var_node)) { + pool_.Insert(var_node, op); + } + } + } + } + graph->ResolveHazard(var_nodes_); + + // For early delete pass. use GraphNodePool load the unlived vars. + // 1. find all deps op for each unlived var in memory pool. + for (auto& op : graph->Nodes()) { + for (auto& var : op->inputs) { + if (pool_.Has(var)) { + pool_.Insert(var, op); + } + } + } + // 2. convert ir node based memory pool to graph node + // because Node* maybe released bettwen passes. + auto& graph_pool = graph->Get(kGraphNodePool); + for (auto it = pool_.begin(); it != pool_.end(); ++it) { + std::unordered_set descs; + for (auto& op : it->second) { + PADDLE_ENFORCE(op->IsOp()); + descs.insert(op->Op()); + } + graph_pool.push_back(std::make_pair(it->first->Name(), descs)); + } + + return graph; +} + +void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const { + // conditional block, while op and their grad op + auto* sub_block_desc = + AttrReader(op_desc->GetAttrMap()).Get("sub_block"); + + // create a mirror block to construct an IR Graph. + ProgramDesc prog; + auto* copy_block = prog.MutableBlock(0); + for (auto* op : sub_block_desc->AllOps()) { + auto* copy_op = copy_block->AppendOp(); + copy_op->CopyFrom(*op); + copy_op->Flush(); + } + + for (auto* var : sub_block_desc->AllVars()) { + auto* copy_var = copy_block->Var(var->Name()); + copy_var->SetDataType(var->GetDataType()); + // only lod tensor can be reused. So ignore the multiple dims case. + copy_var->SetType(var->GetType()); + copy_var->SetShape(var->GetShape()); + copy_var->SetPersistable(var->Persistable()); + } + + ir::Graph sub_graph(prog); + std::unordered_set sub_graph_all_ops; + FilterVariables(sub_graph.Nodes(), [&](ir::Node* var) { + // sub_graph_all_ops.emplace(var); + if (var->IsVar() && !var->IsCtrlVar()) { + sub_graph_all_ops.emplace(var); + } + }); + int sub_reuse_id = 0; + // subgraph nodes is unordered, reuse need to follow the desc order. + // find the right op node through the descs + for (auto* sub_op_desc : sub_block_desc->AllOps()) { + ir::Node* sub_op = nullptr; + for (auto* node : sub_graph_all_ops) { + if (node->Op() == sub_op_desc) { + sub_op = node; + break; + } + } + PADDLE_ENFORCE(sub_op != nullptr); + for (auto* var : sub_op->outputs) { + if (NodeCanReused(var)) { + ir::Node* cache = pool_.NodeMatch(var); + if (cache != nullptr) { + if (var->Var()->GetDataType() != cache->Var()->GetDataType()) { + continue; + } + int node_idx_in_pool = pool_.GetIndex(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(sub_reuse_id++), DebugString(var), + DebugString(cache), node_idx_in_pool, + static_cast(pool_.size())); + // NOTE(dzh): subblock is not in IR graph. Modify the block_desc + // immediately to make the subblock variable reuse strategy take + // effect. Because it is a single op in graph. No need to + // update the ir nodes. + sub_op_desc->Rename(var->Name(), cache->Name()); + if (sub_op_desc->Block()->HasVar(var->Name())) { + sub_op_desc->Block()->RemoveVar(var->Name()); + } + } + } + } + } +} + +std::unordered_set AnalysisVarPass::GetSubBlockVars( + const std::unordered_set& nodes) const { + std::unordered_set vars; + for (auto& op : nodes) { + if (!op->IsOp() || op->Op() == nullptr) continue; + auto* op_desc = op->Op(); + if (OpHasSubBlock(op_desc)) { + auto inputs = op_desc->InputArgumentNames(); + auto outputs = op_desc->OutputArgumentNames(); + vars.insert(inputs.begin(), inputs.end()); + vars.insert(outputs.begin(), outputs.end()); + } + } + return vars; +} + +void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, + size_t idx) const { + for (size_t i = idx; i < cfg_->Ops().size(); ++i) { + auto* op = cfg_->Ops()[i]; + PADDLE_ENFORCE(op->IsOp() && op->Op()); + auto* op_desc = op->Op(); + op_desc->RenameInput(var, cache_var); + op_desc->RenameOutput(var, cache_var); + if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var); + op_desc->Flush(); + } +} + +void AnalysisVarPass::InitSSAGraphNodes() const { + std::unordered_map> all_vars; + if (var_nodes_.empty()) { + for (auto* op : cfg_->Ops()) { + for (auto* node : op->inputs) { + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + for (auto* node : op->outputs) { + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + } + } +} + +void AnalysisVarPass::RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, + size_t idx, ir::Graph* graph) const { + // if replace happens, we need to create a newer version cache_var + // but use the same dims/data_type with var. + PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && + var_nodes_[var].at(0)->Var() != nullptr); + std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); + var_desc->SetName(cache_var); + + for (size_t i = idx; i < cfg_->Ops().size(); ++i) { + auto* op = cfg_->Ops()[i]; + + // redirect the input to the latest version of cache_var + for (auto* node : op->inputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + var_nodes_[cache_var].emplace_back(cache_node); + + // swap node to cache_node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp()); + auto* prev_op = node->inputs[0]; + std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, + cache_node); + cache_node->inputs.emplace_back(prev_op); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + } + } + + // if we need to rename the output, + // always create a newer version of cache_var + for (auto* node : op->outputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + var_nodes_[cache_var].emplace_back(cache_node); + + // swap node to cache node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + cache_node->inputs.emplace_back(op); + std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + } + } + } + + // release node of unused var in graph + for (auto* node : var_nodes_[var]) { + graph->RemoveNode(node); + } + var_nodes_.at(var).clear(); +} + +bool AnalysisVarPass::NodeCanReused(ir::Node* node) const { + if (!node->IsVar() || node->IsCtrlVar()) return false; + auto* desc = node->Var(); + auto type = desc->GetType(); + if (desc->Persistable() || type != proto::VarType::LOD_TENSOR || + desc->GetShape().empty()) { + return false; + } + // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad + std::string name = node->Name(); + if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') + return false; + if (skip_set_.count(name)) return false; + for (auto* op : node->inputs) { + if (op->Op()->HasAttr("force_cpu")) { + // op output force generated in cpu, can not be reused. + return framework::AttrReader(op->Op()->GetAttrMap()) + .Get("force_cpu") == 0; + } + } + return true; +} + +bool AnalysisVarPass::OpHasSubBlock(OpDesc* desc) const { + const AttributeMap& attrs = desc->GetAttrMap(); + for (auto& attr : attrs) { + if (attr.second.type() == typeid(BlockDesc*) || // NOLINT + attr.second.type() == typeid(std::vector)) // NOLINT + return true; + } + return false; +} + +std::vector SortOpLikeDescOrder(const ir::Graph& graph) { + PADDLE_ENFORCE(graph.Has(kAllOpDescs), + "Graph has no attribute of kAllOpDescs."); + // 1. get op desc order + auto& op_descs = graph.Get>(kAllOpDescs); + + // 2. topology sort order + auto nodes = graph.Nodes(); + std::deque ops; + FilterVariables(nodes, [&](ir::Node* op) { + if (op->IsOp() && op->Op() != nullptr) { + ops.emplace_back(op); + } + }); + std::unordered_map op_deps; + std::list ready_ops; + std::unordered_map> pending_ops; + + for (auto* op : ops) { + std::unordered_set preceding_op; + for (auto* in : op->inputs) { + if (in->inputs.empty()) continue; + PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); + preceding_op.emplace(in->inputs[0]); + pending_ops[in->inputs[0]].emplace(op); + } + op_deps[op] = preceding_op.size(); + if (preceding_op.empty()) { + ready_ops.emplace_back(op); + } + } + + // 3. generated op list based desc order and the topology order + std::vector ret; + std::list op_descs_list(op_descs.begin(), op_descs.end()); + + auto update_by_found_node = [&](ir::Node* found_node) { + for (auto* pending_op : pending_ops[found_node]) { + if (--op_deps[pending_op] == 0) { + ready_ops.emplace_back(pending_op); + } + } + ready_ops.remove(found_node); + ret.emplace_back(found_node); + }; + + while (!ready_ops.empty()) { + bool all_of_ready_op_unmatched = true; + for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { + auto op_desc = *it; + ir::Node* found_node = nullptr; + for (auto* op : ready_ops) { + if (IsSameDesc(op->Op(), op_desc)) { + found_node = op; + break; + } + } + + // 3.1 op desc deleted by other pass + if (found_node == nullptr) { + ++it; + continue; + } else { + all_of_ready_op_unmatched = false; + it = op_descs_list.erase(it); + } + update_by_found_node(found_node); + } + + // 3.2 op descs are added by other pass + // preceding op non empty means some new op descs are + // created, but not contained in return node list. + // these new op desc may depend on each other. + std::list prev_ready_ops(ready_ops); + if (all_of_ready_op_unmatched) { + for (auto op : prev_ready_ops) { + update_by_found_node(op); + } + } + } + + PADDLE_ENFORCE(std::all_of( + op_deps.begin(), op_deps.end(), + [&](const std::pair& p) { return p.second == 0; })); + + return ret; +} + +ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { + ops_ = SortOpLikeDescOrder(graph); + ConnectNodes(); +} + +void ControlFlowGraph::BuildCFGGraph() { + // FIXME(dzh): same effect with ConnectNodes, but use the control + // link to build dependency graph, it goes wrong in transformer. + for (ir::Node* op : ops_) { + for (auto& input_var : op->inputs) { + if (!input_var->inputs.empty()) { + PADDLE_ENFORCE( + input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), + "Preceding Op Node of Var Node must be unique"); + auto* pred_op = input_var->inputs[0]; + if (pred_op->Op() != nullptr) { + predecessors_[op].insert(pred_op); + successors_[pred_op].insert(op); + } + } + if (input_var->IsVar() && !input_var->IsCtrlVar()) { + uses_[op].insert(input_var->Name()); + } + } + for (auto& output_var : op->outputs) { + // output var may be used by many op + for (auto* succ_op : output_var->outputs) { + if (succ_op->Op() != nullptr) { + successors_[op].insert(succ_op); + predecessors_[succ_op].insert(op); + } + } + if (output_var->IsVar() && !output_var->IsCtrlVar()) { + defs_[op].insert(output_var->Name()); + } + } + } +} + +void ControlFlowGraph::ConnectNodes() { + for (size_t i = 0; i < ops_.size(); ++i) { + auto& op = ops_[i]; + try { + auto& next_op = ops_.at(i + 1); + successors_[op].insert(next_op); + predecessors_[next_op].insert(op); + } catch (...) { + // do nothing + } + + FilterVariables(op->inputs, + [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); + + FilterVariables(op->outputs, + [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); + } +} + +void ControlFlowGraph::LiveVariableAnalysis() { + // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) + // compute the liveness of for each variable though reversed_ops algorithm. + // It iterates the operators from end to begin, compute the live in/live out + // variable set for each op, then the diff between in/out will be used for + // the variable reuse. For detail refer to + // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf + std::list work_list(ops_.rbegin(), ops_.rend()); + while (!work_list.empty()) { + ir::Node* op = work_list.front(); + work_list.pop_front(); + // get the live_in calculated before. Empty if first. + auto prev_live_in = std::move(live_in_[op]); + for (auto& s : successors_[op]) { + for (auto& var : live_in_[s]) { + live_out_[op].insert(var); + } + } + for (auto& var : uses_[op]) { + live_in_[op].insert(var); + } + for (auto& var : live_out_[op]) { + live_in_[op].insert(var); + } + for (auto& var : defs_[op]) { + live_in_[op].erase(var); + } + + // If the live_in is not changed, then the liveness analysis of + // predecessors is completed. + // + // Otherwise, recalculate the predecessors liveness + if (live_in_[op] != prev_live_in) { + for (auto& pre : predecessors_[op]) { + work_list.push_back(pre); + } + } + } +} + +void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, + int begin_idx) { + // update graph from begin idx to the end + for (size_t i = begin_idx; i != ops_.size(); ++i) { + auto* op = ops_[i]; + if (uses_[op].find(old_node) != uses_[op].end()) { + uses_[op].erase(old_node); + uses_[op].insert(new_node); + } + if (defs_[op].find(old_node) != defs_[op].end()) { + defs_[op].erase(old_node); + defs_[op].insert(new_node); + } + if (live_in_[op].find(old_node) != live_in_[op].end()) { + live_in_[op].erase(old_node); + live_in_[op].insert(new_node); + } + if (live_out_[op].find(old_node) != live_out_[op].end()) { + live_out_[op].erase(old_node); + live_out_[op].insert(new_node); + } + } +} + +const std::set ControlFlowGraph::LiveIn(ir::Node* op) const { + auto it = live_in_.find(op); + PADDLE_ENFORCE( + it != live_in_.end(), + string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); + return it->second; +} + +const std::set ControlFlowGraph::LiveOut(ir::Node* op) const { + auto it = live_out_.find(op); + PADDLE_ENFORCE( + it != live_out_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::set ControlFlowGraph::Use(ir::Node* op) const { + auto it = uses_.find(op); + PADDLE_ENFORCE( + it != uses_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::vector ControlFlowGraph::Ops() const { return ops_; } + +std::vector& ControlFlowGraph::Ops() { return ops_; } + +ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name, + ir::Node* op) const { + // in ssa-graph, different version nodes have same name, + // this function get the latest version var before target op + // It may return nullptr, such as data node. + ir::Node* found_node = nullptr; + for (auto* node : ops_) { + if (node == op) break; + for (auto& output : node->outputs) { + if (output->Name() == name) { + found_node = output; + } + } + } + return found_node; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(analysis_var_pass, paddle::framework::details::AnalysisVarPass) + .RequireGraphAttr(paddle::framework::details::kGraphNodePool) + .RequireGraphAttr(paddle::framework::details::kAllOpDescs); diff --git a/paddle/fluid/framework/details/analysis_var_pass.h b/paddle/fluid/framework/details/analysis_var_pass.h new file mode 100644 index 0000000000..144204beaf --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass.h @@ -0,0 +1,120 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { +constexpr char kAllOpDescs[] = "all_op_descs"; + +std::vector SortOpLikeDescOrder(const ir::Graph& graph); +// sort op in bfs order +std::vector BFSSortGraphOps(const ir::Graph& graph); + +class ControlFlowGraph; + +class AnalysisVarPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; + + private: + // fill the variable map(var_nodes) by version. + void InitSSAGraphNodes() const; + // update program descs + void RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, size_t idx) const; + // update ir nodes + void RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, size_t idx, + ir::Graph* graph) const; + + void SubGraphOptimize(OpDesc* op_desc) const; + // valid a tensor can be reuse or not + bool NodeCanReused(ir::Node* node) const; + // scan subblock and collect the output/input variables. + std::unordered_set GetSubBlockVars( + const std::unordered_set&) const; + // check op has subblock or not + bool OpHasSubBlock(OpDesc* desc) const; + + private: + // Reuse Node Pool, Owned. + mutable OrderedNodePairPool pool_; + // controlflow Graph + mutable std::unique_ptr cfg_; + // skip set + mutable std::unordered_set skip_set_; + // var nodes + mutable std::map> var_nodes_; +}; + +class ControlFlowGraph { + public: + ControlFlowGraph() = default; + // For IR Graph in parallelexecutor + explicit ControlFlowGraph(const ir::Graph& graph); + + void LiveVariableAnalysis(); + + void RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, int begin_idx); + + const std::set LiveIn(ir::Node* op) const; + const std::set LiveOut(ir::Node* op) const; + const std::set Use(ir::Node* op) const; + const std::vector Ops() const; + std::vector& Ops(); + + // for ssa-graph nodes + ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const; + + private: + void BuildCFGGraph(); + void ConnectNodes(); + using NodeListMap = std::unordered_map>; + using VarSetMap = std::map>; + // successors ops use the output variables. + NodeListMap successors_; + // predecessors ops generated input variables. + NodeListMap predecessors_; + // variables lived before run current op. + VarSetMap live_in_; + // variables lived after run current op. + VarSetMap live_out_; + VarSetMap uses_; // op inputs + VarSetMap defs_; // op outputs + + std::vector ops_; // op sequence by topology sort +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/analysis_var_pass_test.cc b/paddle/fluid/framework/details/analysis_var_pass_test.cc new file mode 100644 index 0000000000..9bc4fd33f7 --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass_test.cc @@ -0,0 +1,470 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class DummyOp : public OperatorBase { + public: + DummyOp(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class AssignOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class DummyVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc& op_desc, BlockDesc* block) const override { + auto& inputs = op_desc.Input("X"); + auto type = block->Var(inputs.front())->GetType(); + auto out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetType(type); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(assign, paddle::framework::DummyOp, + paddle::framework::AssignOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +/* + https://en.wikipedia.org/wiki/Live_variable_analysis + Create a customed classical dependency graph, left row is the instruction + number. + 1. a = 1 + 2. b = a + 3. c = a + 4. d = b + c + 5. e = d + + a--------+ + | | + b c + | | + d--------+ + | + e + Then analysis these variable's liveness range + */ + +namespace paddle { +namespace framework { +namespace details { + +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + +inline static ProgramDesc FillProgramDesc() { + ProgramDesc prog; + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"c"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"d"}); + op->SetOutput("Out", {"e"}); + } + return prog; +} + +template +inline static std::string DebugString(const Container& c) { + std::stringstream ss; + for (auto& item : c) { + ss << item << " "; + } + return ss.str(); +} + +TEST(CFGGraph, IRGraph) { + // prepare ir graph + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + ControlFlowGraph cfg(graph); + cfg.LiveVariableAnalysis(); + + // test assign op + ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); + + // test assign op + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); + + // test sum op + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); + ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); + + // test assign op + ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); + ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); +} + +// 1. normal test +TEST(SortOpLikeDescOrder, NormalTest) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto nodes = SortOpLikeDescOrder(graph); + auto op_descs = prog.Block(0).AllOps(); + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 2. remove some op_desc +TEST(SortOpLikeDescOrder, RemoveOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + auto nodes = graph.Nodes(); + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->outputs.back()->Name() == "e") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + graph.RemoveNode(found_node); + graph.RemoveNode(e); + + // other node keeps the same order + auto remain_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < remain_nodes.size(); ++i) { + auto node = remain_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 3. add some op_desc +TEST(SortOpLikeDescOrder, AddOpDesc) { + auto prog = FillProgramDesc(); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + ir::Graph graph(prog); + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // cached desc different with real one + // mimic the intermidiete pass modify the programdesc. + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto op_descs = prog.Block(0).AllOps(); + + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + op_descs.insert(op_descs.begin() + 4, op); + + auto nodes = SortOpLikeDescOrder(graph); + + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 4. add and delete some op_desc +TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // remove sum node + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + auto nodes = graph.Nodes(); + for (auto node : nodes) { + if (node->Name() == "sum") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* c = find_node_in_graph("c"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(c->outputs.begin(), c->outputs.end(), found_node); + ir::Node* pending_op = found_node->outputs[0]->outputs[0]; + graph.RemoveNode(e); + graph.RemoveNode(pending_op); + graph.RemoveNode(found_node); + } + + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + op_descs.insert(op_descs.begin() + 2, op); + + // check the order + auto mynodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < mynodes.size(); ++i) { + auto node = mynodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 5. add and replace some op_desc inplace. +TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + auto op_descs = prog.Block(0).AllOps(); + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + + op_descs.emplace_back(op); + + // replace op_desc inplace + auto nodes = graph.Nodes(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->Op() && node->Name() == "assign") { + if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { + found_node = node; + break; + } + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(e->inputs.begin(), e->inputs.end(), found_node); + graph.RemoveNode(found_node); + } + op_descs.erase(op_descs.begin() + 3); + + auto replace_op = prog.MutableBlock(0)->AppendOp(); + replace_op->SetType("sum"); + replace_op->SetInput("X", {"d", "d1"}); + replace_op->SetOutput("Out", {"e"}); + { + ir::Node* sum2 = graph.CreateOpNode(replace_op); + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + ir::Node* d1 = find_node_in_graph("d1"); + sum2->inputs.emplace_back(d); + sum2->inputs.emplace_back(d1); + sum2->outputs.emplace_back(e); + e->inputs.emplace_back(sum2); + d->outputs.emplace_back(sum2); + d1->outputs.emplace_back(sum2); + } + + op_descs.emplace_back(replace_op); + // compare op order + auto graph_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < graph_nodes.size(); ++i) { + auto node = graph_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index d8526b3f24..779a9ed523 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -14,11 +14,16 @@ limitations under the License. */ #include "paddle/fluid/framework/details/build_strategy.h" +#include +#include + +#include "paddle/fluid/framework/details/memory_reuse_types.h" #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/sequential_execution_pass.h" #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" namespace paddle { @@ -69,6 +74,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } VLOG(1) << "CollectiveContext:" << context->String(); + // NOTE(dzh): memory optimize should be a runtime pass. + // However, after multi_devices_pass, VarHandle, OpHandle is + // the de-fact IR, any reuse on Graph is meaningless. + // A side-effect of that, memory optimize cannot forsee the fetched vars + // , so fetchlist should be set persistable before call the Run interface. + if (strategy.memory_optimize_) { + auto analysis_var_pass = AppendPass("analysis_var_pass"); + } // Convert graph to run on multi-devices. auto multi_devices_pass = AppendPass("multi_devices_pass"); multi_devices_pass->SetNotOwned("strategy", @@ -79,8 +92,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Add a graph print pass to record a graph with device info. if (!strategy_.debug_graphviz_path_.empty()) { auto multi_devices_print_pass = AppendPass("multi_devices_print_pass"); - multi_devices_print_pass->SetNotOwned( - "debug_graphviz_path", &strategy_.debug_graphviz_path_); + const std::string graph_path = + string::Sprintf("%s%s", strategy_.debug_graphviz_path_.c_str(), + "_multi_devices_graph"); + multi_devices_print_pass->Set(kGraphvizPath, + new std::string(graph_path)); multi_devices_print_pass->Set( "graph_printer", new details::GraphvizSSAGraphPrinter); } @@ -127,7 +143,6 @@ std::unique_ptr BuildStrategy::Apply( CreatePassesFromStrategy(false); std::unique_ptr graph(new ir::Graph(main_program)); - for (std::shared_ptr &pass : pass_builder_->AllPasses()) { if (pass->Type() == "multi_devices_pass") { pass->Erase("places"); @@ -145,6 +160,17 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif + } else if (pass->Type() == "analysis_var_pass") { + const std::vector *all_op_descs = + new std::vector(main_program.Block(0).AllOps()); + graph->Set>(kAllOpDescs, + all_op_descs); // take ownership + graph->Set(kGraphNodePool, + new GraphNodePool); // take ownership + + pass->Erase(kAllOpDescs); + pass->SetNotOwned>(kAllOpDescs, all_op_descs); + } else if (pass->Type() == "sequential_execution_pass") { LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; @@ -166,6 +192,7 @@ std::unique_ptr BuildStrategy::Apply( } return graph; } + } // namespace details } // namespace framework } // namespace paddle @@ -176,6 +203,7 @@ USE_PASS(multi_batch_merge_pass); USE_PASS(multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); +USE_PASS(analysis_var_pass); USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index c97be16957..29396501dc 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -60,8 +60,15 @@ struct BuildStrategy { kCustomized = 2, }; + enum class OptimizeStrategy { + // To be Implemented,bruteforce, recursive compute unused var names. + kBruteForce = 0, + kControlFlowGraph = 1, // use cfg_graph algorithm, faster speed. + }; + ReduceStrategy reduce_{ReduceStrategy::kAllReduce}; GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice}; + OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph}; std::string debug_graphviz_path_{""}; @@ -69,6 +76,10 @@ struct BuildStrategy { bool enable_data_balance_{false}; + bool memory_optimize_{false}; + + bool memory_early_delete_{false}; + bool enable_sequential_execution_{false}; bool fuse_broadcast_op_{false}; diff --git a/paddle/fluid/framework/details/early_delete_op_handle.h b/paddle/fluid/framework/details/early_delete_op_handle.h new file mode 100644 index 0000000000..c8382d34b7 --- /dev/null +++ b/paddle/fluid/framework/details/early_delete_op_handle.h @@ -0,0 +1,140 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { +namespace details { + +class EarlyDeleteOpHandle : public OpHandleBase { + public: + EarlyDeleteOpHandle(ir::Node* node, const Scope* scope, + const platform::Place& place, + const std::vector& names, + GarbageCollector* gc) + : OpHandleBase(node), + scope_(scope), + place_(place), + names_(names), + gc_(gc) { +#ifdef PADDLE_WITH_CUDA + if (IsStreamGarabageCollector()) { + auto gpu_place = boost::get(place); + PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } +#endif + } + ~EarlyDeleteOpHandle() { +#ifdef PADDLE_WITH_CUDA + if (IsStreamGarabageCollector()) { + auto gpu_place = boost::get(dev_ctx_->GetPlace()); + PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); + PADDLE_ENFORCE(cudaEventDestroy(event_)); + } +#endif + } + + std::string Name() const override { return "early_delete"; } + + protected: + void RunImpl() override { + std::vector> tensors; + auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + for (auto& var_name : names_) { + auto* var = local_scope->FindVar(var_name); + PADDLE_ENFORCE(var != nullptr, + string::Sprintf("Local Scope not has var %s", var_name)); + if (var->IsType()) { + tensors.emplace_back(var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType()) { + tensors.emplace_back(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); + } else if (var->IsType()) { + LoDTensorArray* tensor_array = var->GetMutable(); + for (auto& tensor : *tensor_array) { + tensors.emplace_back(tensor.MoveMemoryHolder()); + } + } + } + if (!tensors.empty()) { + ClearTensors(tensors); + } + } + + private: + void ClearTensors( + const std::vector>& tensors) { + if (platform::is_cpu_place(place_)) { + ClearCPUTensors(tensors); + } else { + ClearGPUTensors(tensors); + } + } + + void ClearCPUTensors( + const std::vector>& tensors) { + auto* gc = dynamic_cast(gc_); + if (gc != nullptr) { + gc->Add(tensors); + } + } + + void ClearGPUTensors( + const std::vector>& tensors) { +#ifdef PADDLE_WITH_CUDA + auto* gc = dynamic_cast(gc_); + if (gc != nullptr) { + auto compute_stream = dev_ctx_->stream(); + auto callback_stream = gc->stream(); + auto callback_func = [=]() { + PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + }; + gc_->Add(tensors, callback_func); + } else { + gc_->Add(tensors); + } + } + + bool IsStreamGarabageCollector() const { + return dynamic_cast(gc_) != nullptr; +#endif + } + + const Scope* scope_; + const platform::Place place_; + std::vector names_; + GarbageCollector* gc_; +#ifdef PADDLE_WITH_CUDA + platform::CUDADeviceContext* dev_ctx_; + cudaEvent_t event_; +#endif +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.cc b/paddle/fluid/framework/details/memory_early_delete_pass.cc new file mode 100644 index 0000000000..06a2451c13 --- /dev/null +++ b/paddle/fluid/framework/details/memory_early_delete_pass.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_early_delete_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) { + std::queue queue; + queue.push(var_in); + do { + auto* var = queue.front(); + queue.pop(); + for (auto* op : var->PendingOps()) { + auto* compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) { + return compute_op; + } + for (auto* out_var : op->Outputs()) { + queue.push(out_var); + } + } + } while (!queue.empty()); + return nullptr; +} + +std::unique_ptr MemoryEarlyDeletePass::ApplyImpl( + std::unique_ptr graph) const { + auto& graph_pool = Get(kGraphNodePool); + auto& gcs = Get(kGarbageCollector); + + std::unordered_map> unlived_vars; + unlived_vars.reserve(graph_pool.size()); + for (auto& pair : graph_pool) { + unlived_vars.insert(std::make_pair(pair.first, pair.second)); + } + + auto compare_and_insert_early_delete_op = [&]( + OpHandleBase* op, const std::vector& vars) { + if (unlived_vars.empty()) return; + // unlived vars can be deleted after the last used op has finished. + auto* compute_op = dynamic_cast(op); + const auto& places = Get>(kAllPlaces); + for (auto& var : vars) { + auto* var_handle = dynamic_cast(var); + auto var_name = var->Node()->Name(); + auto& var_place = var_handle->place_; + if (unlived_vars.count(var_name) == 0) continue; + if (!unlived_vars[var_name].empty()) { + if (compute_op != nullptr && + unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) { + unlived_vars[var_name].erase(compute_op->Node()->Op()); + } + continue; + } + + if (var_handle == nullptr || !var_handle->Node()->IsVar() || + var_handle->Node()->IsCtrlVar()) + continue; + + // shameless copyed from reference count pass. + if (compute_op == nullptr) { + // use next computation op scope + compute_op = FindNextComputationOpHandle(var_handle); + } + auto* early_delete_node = + graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation); + GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get(); + auto* early_delete_handle = new EarlyDeleteOpHandle( + early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc); + if (compute_op->Outputs().empty()) { + auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + compute_op->AddOutput(dep_var); + graph->Get(kGraphDepVars).emplace(dep_var); + } + early_delete_handle->AddInput(compute_op->Outputs().front()); + VLOG(5) << "Add early delete op " << var_name << " to Operator" + << compute_op->Name(); + } + }; + + auto all_ops = ir::FilterByNodeWrapper(*graph); + for (auto& op : all_ops) { + compare_and_insert_early_delete_op(op, op->Inputs()); + compare_and_insert_early_delete_op(op, op->Outputs()); + } + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(memory_early_delete_pass, + paddle::framework::details::MemoryEarlyDeletePass) + .RequireGraphAttr(paddle::framework::details::kGraphNodePool) + .RequireGraphAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.h b/paddle/fluid/framework/details/memory_early_delete_pass.h new file mode 100644 index 0000000000..8215aa1b2b --- /dev/null +++ b/paddle/fluid/framework/details/memory_early_delete_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/details/early_delete_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class MemoryEarlyDeletePass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types.cc b/paddle/fluid/framework/details/memory_reuse_types.cc new file mode 100644 index 0000000000..2b9ff518b9 --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types.cc @@ -0,0 +1,155 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include +#include +#include + +namespace paddle { +namespace framework { +namespace details { + +size_t NodeSizeInBytes(ir::Node* n) { + auto* desc = FindVarDescInBlock(n); + auto shape = desc->GetShape(); + size_t type_size = SizeOfType(desc->GetDataType()); + int size = 1; + for (auto& s : shape) { + size *= s; + } + return type_size * std::abs(size); +} + +std::string DebugStringImpl(VarDesc* var) { + std::stringstream ss; + ss << var->Name(); + ss << "["; + try { + auto shape = var->GetShape(); + for (size_t i = 0; i < shape.size(); ++i) { + if (i != shape.size() - 1) { + ss << shape[i] << ","; + } else { + ss << shape[i]; + } + } + ss << "]"; + } catch (...) { + ss << "Var has no VarDesc !!! Name:" << var->Name(); + } + return ss.str(); +} + +std::string DebugString(ir::Node* var) { + return DebugStringImpl(FindVarDescInBlock(var)); +} +// return DebugString(var->Var()); } + +// NOTE(dzh): based ir node, if a large node has been reused +// by a small size node, then next time it appear in pool, it will +// have the small size. Find the original node shap from blockdesc. +VarDesc* FindVarDescInBlock(ir::Node* n) { + PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1); + BlockDesc* block = n->inputs[0]->Op()->Block(); + PADDLE_ENFORCE(block->HasVar(n->Name()), + string::Sprintf("Block do not has var %s", n->Name())); + return block->FindVar(n->Name()); +} + +struct NodeComparator { + bool operator()(ir::Node* lhs, ir::Node* rhs) const { + auto* lhs_desc = FindVarDescInBlock(lhs); + auto* rhs_desc = FindVarDescInBlock(rhs); + auto lhs_shape = lhs_desc->GetShape(); + auto rhs_shape = rhs_desc->GetShape(); + if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || + (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { + return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs); + } else { + return false; + } + } +}; + +void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) { + PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); + PADDLE_ENFORCE(op->IsOp()); + if (mark_table_.count(var->Name()) != 0) { + mark_table_[var->Name()]->second.insert(op); + return; + } + + auto* var_desc = FindVarDescInBlock(var); + auto var_shape = var_desc->GetShape(); + int batch_size = static_cast(var_shape[0]); + + NodeComparator compare_node; + Iter it = nodes_.begin(); + while (it != nodes_.end()) { + auto* cache_desc = FindVarDescInBlock(it->first); + int cache_batch_size = cache_desc->GetShape()[0]; + if ((cache_batch_size == -1 && batch_size == -1) || + (cache_batch_size != -1 && batch_size != -1)) { + if (compare_node(it->first, var)) { + ++it; + } else { + break; + } + } else if (cache_batch_size == -1 && batch_size != -1) { + ++it; + } else if (cache_batch_size != -1 && batch_size == -1) { + break; + } + } + + it = + nodes_.insert(it, std::make_pair(var, std::unordered_set{op})); + mark_table_[var->Name()] = it; +} + +int OrderedNodePairPool::GetIndex(ir::Node* var) { + return std::distance(nodes_.begin(), mark_table_[var->Name()]); +} + +ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const { + ir::Node* found_node = nullptr; + NodeComparator compare_node; + + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + if (compare_node(var, it->first)) { + found_node = it->first; + break; + } + } + return found_node; +} + +void OrderedNodePairPool::Erase(ir::Node* var) { + PADDLE_ENFORCE(mark_table_.count(var->Name())); + nodes_.erase(mark_table_[var->Name()]); + mark_table_.erase(var->Name()); +} + +std::string OrderedNodePairPool::ToString() const { + std::stringstream ss; + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + ss << DebugString(it->first) << " "; + } + return ss.str(); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types.h b/paddle/fluid/framework/details/memory_reuse_types.h new file mode 100644 index 0000000000..9a9c1d948e --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types.h @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +constexpr char kFetchedVars[] = "fetched_vars"; +constexpr char kGraphNodePool[] = "graph_node_pool"; + +// NOTE(dzh): Variable and the operators use the var. +// for early delete pass. +// Because analysis var pass build base on ir::Node, which maybe released +// or modified between passes, so we use OpDesc* to mark ops. +using GraphNodePool = std::vector< + std::pair /* ops */>>; + +// NOTE(dzh): by default, it sort node in ascend order(by node bytes size). +// in fluid, -1 means the batch_size is determined in runtime. +// the node batch_size equal -1 always ranking in the front than the node not. +// For example, +// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. +// O(1) insert, delete +class OrderedNodePairPool { + public: + using NodePair = std::pair>; + using Iter = typename std::list::iterator; + using ConstIter = typename std::list::const_iterator; + + void Insert(ir::Node* var, ir::Node* op); + + void Erase(ir::Node* var); + + bool Has(ir::Node* var) { return mark_table_.count(var->Name()); } + + ir::Node* NodeMatch(ir::Node* var) const; + // map store non-const iterator, can not promise const + int GetIndex(ir::Node* var); + // pool all node to string + std::string ToString() const; + + Iter begin() { return nodes_.begin(); } + Iter end() { return nodes_.end(); } + ConstIter begin() const { return nodes_.begin(); } + ConstIter end() const { return nodes_.end(); } + size_t size() const { return nodes_.size(); } + + private: + // for searching. + std::unordered_map mark_table_; + // node swap pairs. var -> ops dep var + std::list nodes_; +}; + +// node memory size in bytes +size_t NodeSizeInBytes(ir::Node* n); + +std::string DebugString(ir::Node* var); + +// std::string DebugString(VarDesc* var); +VarDesc* FindVarDescInBlock(ir::Node* n); + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types_test.cc b/paddle/fluid/framework/details/memory_reuse_types_test.cc new file mode 100644 index 0000000000..d2fabf5ce0 --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types_test.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include +#include +#include +#include +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +namespace details { + +TEST(OrderedNodePairPool, Normal) { + OrderedNodePairPool pool; + std::vector> nodes; + + // clang-format off + std::vector> shapes = {{-1, 10}, + {-1, 20}, + {1, 2}, + {5, 2}, + {10, 20}, + {-1, 2, 5}, + {-1, 1, 5}, + {-1, 1}}; + // clang-format on + const int COUNT = shapes.size(); + ProgramDesc prog; + BlockDesc* block_desc = prog.MutableBlock(0); + auto* op_desc = block_desc->AppendOp(); + op_desc->SetType("dummy"); + std::unique_ptr op = ir::CreateNodeForTest(op_desc); + + for (int i = 0; i < COUNT; ++i) { + auto desc = block_desc->Var(std::to_string(i)); + desc->SetShape(shapes[i]); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + + for (auto& node : nodes) { + pool.Insert(node.get(), op.get()); + } + + // assert its order and interface. + std::cout << pool.ToString() << std::endl; + pool.Erase(nodes.front().get()); + std::cout << pool.ToString() << std::endl; + + ASSERT_EQ(pool.size(), static_cast(COUNT - 1)); + ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0); + + { + auto v1 = block_desc->Var("11"); + v1->SetShape({-1, 256, 56, 56}); + std::unique_ptr node1 = ir::CreateNodeForTest(v1); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(cache, nullptr); + } + { + auto v2 = block_desc->Var("12"); + v2->SetShape({-1, 2, 5}); + std::unique_ptr node1 = ir::CreateNodeForTest(v2); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5] + } + { + auto v3 = block_desc->Var("13"); + v3->SetShape({2, 5}); + std::unique_ptr node1 = ir::CreateNodeForTest(v3); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2] + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc index 8f92f0948d..c203073845 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc @@ -85,4 +85,5 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, } // namespace paddle REGISTER_PASS(multi_devices_print_pass, - paddle::framework::details::SSAGraghBuilderWithPrinter); + paddle::framework::details::SSAGraghBuilderWithPrinter) + .RequirePassAttr(paddle::framework::details::kGraphvizPath); diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h index c00685fa16..b06c87a5c1 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -24,6 +25,8 @@ namespace paddle { namespace framework { namespace details { +constexpr char kGraphvizPath[] = "debug_graphviz_path"; + class SSAGraphPrinter { public: virtual ~SSAGraphPrinter() {} @@ -40,7 +43,7 @@ class SSAGraghBuilderWithPrinter : public ir::Pass { std::unique_ptr ApplyImpl( std::unique_ptr graph) const override { std::unique_ptr fout( - new std::ofstream(Get("debug_graphviz_path"))); + new std::ofstream(Get(kGraphvizPath))); PADDLE_ENFORCE(fout->good()); Get("graph_printer").Print(*graph, *fout); return graph; diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index ba12ca3c61..b1a82e8771 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -25,7 +25,7 @@ namespace paddle { namespace framework { namespace details { -constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; +constexpr char kLocalExecScopeName[] = "@LOCAL_SCOPE@"; // Wraps ir::Node and provide helper utilities. // It's responsible for populating necessary fields of ir::Node. diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8679118fe2..8670dcfed7 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -162,7 +162,10 @@ void Graph::ResolveHazard( (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0]; const auto &read_ops = (*it_old)->outputs; - PADDLE_ENFORCE(write_op, "The write_op should not be empty."); + PADDLE_ENFORCE( + write_op, + string::Sprintf("The write_op of var %s should not be empty.", + (*it_new)->Name())); // Add write after write dependence ir::Node *upstream_op = diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index d2d28793c4..d99f856d8f 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include DEFINE_string(print_sub_graph_dir, "", @@ -121,7 +122,7 @@ std::map> BuildOperationAdjList( } size_t GraphNum(const Graph &graph) { - std::unordered_set nodes = graph.Nodes(); + std::unordered_set nodes(graph.Nodes()); std::unordered_set visited_nodes; visited_nodes.reserve(nodes.size()); std::deque q_nodes; diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index 8d92c40668..be525151f9 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -24,6 +24,7 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { + // Test if the graph contains circle. bool HasCircle(const Graph &graph); diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index eac67108e2..45d81b9373 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -30,6 +30,14 @@ std::unique_ptr CreateNodeForTest(const std::string &name, return std::unique_ptr(new Node(name, type)); } +std::unique_ptr CreateNodeForTest(VarDesc *var_desc) { + return std::unique_ptr(new Node(var_desc)); +} + +std::unique_ptr CreateNodeForTest(OpDesc *op_desc) { + return std::unique_ptr(new Node(op_desc)); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d2a393b3f1..89dcc677b5 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include #include - #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/macros.h" @@ -125,6 +124,8 @@ class Node { friend class Graph; friend std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type); + friend std::unique_ptr CreateNodeForTest(VarDesc* var_desc); + friend std::unique_ptr CreateNodeForTest(OpDesc* op_desc); explicit Node(const std::string& name, Type type) : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} @@ -152,7 +153,9 @@ class Node { std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type); +std::unique_ptr CreateNodeForTest(VarDesc* var_desc); +std::unique_ptr CreateNodeForTest(OpDesc* op_desc); } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index eb4baa06b5..7e3fe02eaf 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" +#include #include #include #include @@ -93,6 +94,7 @@ class ParallelExecutorPrivate { } } + BuildStrategy build_strategy_; std::vector places_; std::vector local_scopes_; Scope *global_scope_; // not owned @@ -169,6 +171,14 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; + + if (build_strategy_.memory_early_delete_) { + auto early_delete_pass = + ir::PassRegistry::Instance().Get("memory_early_delete_pass"); + early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + graph = early_delete_pass->Apply(std::move(graph)); + } + VLOG(10) << "MemoryEarlyDeletePass Applied."; } return graph; @@ -189,6 +199,7 @@ ParallelExecutor::ParallelExecutor( : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; + member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; @@ -245,7 +256,6 @@ ParallelExecutor::ParallelExecutor( build_strategy.Apply(main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_); #endif - auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { graph = member_->PrepareGCAndRefCnts(std::move(graph), @@ -280,10 +290,12 @@ ParallelExecutor::ParallelExecutor( if (exec_strategy.type_ == ExecutionStrategy::kDefault) { member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graph))); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graph))); } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( @@ -423,5 +435,6 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle +USE_PASS(memory_early_delete_pass); USE_PASS(reference_count_pass); USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index a0a9a57360..83dea86390 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -74,6 +74,22 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), platform::CPUPlace()); EXPECT_EQ(p1, p2); + + float* p3 = nullptr; + float* p4 = nullptr; + // set src_tensor a different type but smaller size. + // memory block is supposed to be unchanged. + auto* tmp = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CPUPlace()); + p3 = reinterpret_cast(tmp); + EXPECT_EQ(p1, p3); + + // set src_tensor a different type but bigger size. + // memory block is supposed to be changed. + auto* tmp2 = src_tensor.mutable_data( + framework::make_ddim({2, 2, 3}), platform::CPUPlace()); + p4 = reinterpret_cast(tmp2); + EXPECT_NE(p1, p4); } // Not sure if it's desired, but currently, Tensor type can be changed. { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 017598e170..737ae2dd9c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -960,6 +960,14 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. Default False)DOC") + .def_property( + "memory_optimize", + [](const BuildStrategy &self) { return self.memory_optimize_; }, + [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }) + .def_property( + "memory_early_delete", + [](const BuildStrategy &self) { return self.memory_early_delete_; }, + [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; }) .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(true); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2dea71d7af..b00510d443 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -150,7 +150,7 @@ def __bootstrap__(): read_env_flags += [ 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', - 'cudnn_exhaustive_search', 'selected_gpus' + 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus' ] core.init_gflags([sys.argv[0]] + diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 86f861674c..e2a9fc183e 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -39,6 +39,7 @@ class TestParallelExecutorBase(unittest.TestCase): seed=None, use_parallel_executor=True, use_reduce=False, + use_ir_memory_optimize=False, fuse_elewise_add_act_ops=False, optimizer=fluid.optimizer.Adam, use_fast_executor=False, @@ -82,6 +83,7 @@ class TestParallelExecutorBase(unittest.TestCase): build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.memory_optimize = use_ir_memory_optimize build_strategy.enable_sequential_execution = enable_sequential_execution if use_cuda and core.is_compiled_with_cuda(): build_strategy.remove_unnecessary_lock = True diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py new file mode 100644 index 0000000000..6ca65c5d3b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py @@ -0,0 +1,123 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from parallel_executor_test_base import TestParallelExecutorBase +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import paddle +import paddle.dataset.mnist as mnist +import unittest +import os + +MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio" + + +def _feed_data_helper(use_feed): + if use_feed: + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + else: + reader = fluid.layers.open_files( + filenames=[MNIST_RECORDIO_FILE], + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + reader = fluid.layers.io.double_buffer(reader) + img, label = fluid.layers.read_file(reader) + return img, label + + +def simple_fc_net(use_feed): + x, y = _feed_data_helper(use_feed) + hidden_layer = 4 + for _ in range(hidden_layer): + x = fluid.layers.fc(input=x, size=20, act='relu') + y_predict = fluid.layers.fc(input=x, size=10, act='softmax') + cost = fluid.layers.cross_entropy(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + return avg_cost + + +def fc_with_inplace_net(use_feed): + x, y = _feed_data_helper(use_feed) + fc = fluid.layers.fc(input=x, size=20, act='relu') + fc = fluid.layers.fc(input=fc, size=10, act='relu') + reshape = fluid.layers.reshape(x=fc, shape=[-1, 2, 5]) + reshape = fluid.layers.reshape(x=reshape, shape=[-1, 5, 2]) + y_predict = fluid.layers.fc(input=reshape, size=10, act='softmax') + cost = fluid.layers.cross_entropy(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + return avg_cost + + +class TestMNIST(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + # Convert mnist to recordio file + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(mnist.train(), batch_size=4) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=[784]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_file( + MNIST_RECORDIO_FILE, reader, feeder) + + def _dummy_data(self): + np.random.seed(5) + img = np.random.random(size=[32, 784]).astype(np.float32) + label = np.ones(shape=[32, 1], dtype='int64') + return img, label + + def _compare_ir_and_python_memory_optimize(self, model, use_cuda): + if use_cuda and not core.is_compiled_with_cuda(): + return + + img, label = self._dummy_data() + first_loss0, last_loss0 = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + memory_opt=False, + use_ir_memory_optimize=False) + first_loss1, last_loss1 = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + memory_opt=False, + use_ir_memory_optimize=True) + for loss in zip(first_loss0, first_loss1): + self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + for loss in zip(last_loss0, last_loss1): + self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + + def test_simple_fc_net(self): + self._compare_ir_and_python_memory_optimize(simple_fc_net, False) + self._compare_ir_and_python_memory_optimize(simple_fc_net, True) + + def test_fc_with_reshape_net(self): + self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, False) + self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 5a7d04ed19..7b530ba617 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -43,6 +43,7 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"), ("conditional_block", "conditional_block_grad")] PRINT_LOG = False +FLAGS_memory_optimize = "" class OrderedSet(MutableSet): @@ -121,6 +122,7 @@ class ControlFlowGraph(object): self._defs = defaultdict(OrderedSet) self._live_in = defaultdict(OrderedSet) self._live_out = defaultdict(OrderedSet) + self._skip_opt = skip_opt self.pool = [] @@ -144,7 +146,6 @@ class ControlFlowGraph(object): for i in range(self.op_size): self._uses[i].update(self._ops[i].input_arg_names()) self._defs[i].update(self._ops[i].output_arg_names()) - self._live_in[i] = self._uses[i] def _update_graph(self, old_name, new_name, begin_idx=0): for i in range(begin_idx, self.op_size): @@ -177,20 +178,52 @@ class ControlFlowGraph(object): worklist.append(d) def _fill_pool(self, i, is_forward): + def comparator(x, cache): + x_shape = x[1] + cache_shape = cache[1] + x_size = abs(reduce(lambda x, y: x * y, x_shape)) + cache_size = abs(reduce(lambda x, y: x * y, cache_shape)) + if (x_shape[0] == -1 and cache_shape[0] == -1) or \ + (x_shape[0] != -1 and cache_shape[0] != -1) : + return x_size <= cache_size + else: + return False + + def find_var_in_block(x): + known_vars = set() + for op in self._ops: + known_vars.update(op.output_arg_names()) + return x in known_vars + block_desc = self._ops[i].block() in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i]) # NOTE: must sort the in_diff set for cases that get different cache var. # FIXME(typhoonzero): maybe use a "sorted set" is better than this. can_optimize = [ - x for x in in_diff + x for x in sorted(in_diff) if self._check_var_validity(block_desc, x, is_forward) ] if can_optimize: for var_name in can_optimize: cache = (var_name, self._find_var(block_desc, var_name, is_forward).shape()) - if cache not in self.pool: - self.pool.append(cache) + if cache not in self.pool and find_var_in_block(var_name): + i = 0 + while i < len(self.pool): + mycache = self.pool[i] + mysize = mycache[1][0] + cache_size = cache[1][0] + if (mysize == -1 and cache_size == -1) or \ + (mysize != -1 and cache_size != -1): + if comparator(mycache, cache): + i += 1 + else: + break + elif mysize == -1 and cache_size != -1: + i += 1 + elif mysize != -1 and cache_size == -1: + break + self.pool.insert(i, cache) def _get_diff(self, a, b): u = a & b @@ -229,7 +262,7 @@ class ControlFlowGraph(object): def _update_skip_opt_set(self): for i in range(self.op_size): op = self._ops[i] - if op.type() == "fill_constant" and op.attr("force_cpu") == True: + if op.has_attr("force_cpu") and op.attr("force_cpu") == True: self._skip_opt.update(op.output_arg_names()) def release_memory(self, skip_opt_set=None): @@ -281,6 +314,7 @@ class ControlFlowGraph(object): # update skip set to meet users' demand if skip_opt_set: self._skip_opt.update(skip_opt_set) + counter = 0 for i in range(self.op_size): op = self._ops[i] if op.type() in SUB_BLOCK_OPS: @@ -301,6 +335,9 @@ class ControlFlowGraph(object): # If x is both in uses and defs, it can not be optimized! if x in self._uses[i]: continue + if x == FLAGS_memory_optimize: + print("start match var ", x, " of op ", op.type()) + print(self.pool) for index, cache_pair in enumerate(self.pool): cache_var = cache_pair[0] cache_shape = cache_pair[1] @@ -323,15 +360,13 @@ class ControlFlowGraph(object): if not compare_shape(x_shape, cache_shape, level): continue # TODO(qijun): dtype_to_size[x_dtype] and dtype_to_size[cache_dtype] - if x_dtype != cache_dtype: - continue - if PRINT_LOG: - print(("Hit Cache !!!! cache pool index " - "is %d, var name is %s, " - "cached var name is %s, " - "var shape is %s ") % (index, x, cache_var, - str(cache_shape))) + print( + ("!!! %d, %s => %s, cache idx %d, pool size %d" + % (counter, x + str(x_shape), + cache_var + str(cache_shape), index, + len(self.pool)))) + counter += 1 self.pool.pop(index) # Rename the var to the cache var already with # memory allocated in order to reuse the memory. From 001891aea69a96a725c8026a82d5a7dd45ed558f Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 13:15:54 +0800 Subject: [PATCH 411/719] fix code style test=develop --- python/paddle/fluid/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index fd788d0929..ecf75f7282 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -104,7 +104,8 @@ def __bootstrap__(): import platform if os.name == 'nt': - third_lib_path = os.path.abspath(os.path.dirname(__file__)) + os.sep + '..' + os.sep + 'libs' + third_lib_path = os.path.abspath(os.path.dirname( + __file__)) + os.sep + '..' + os.sep + 'libs' os.environ['path'] += ';' + third_lib_path sys.path.append(third_lib_path) From b601f2de8d3d1a336810438c521714749f8a19a6 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 13:38:08 +0800 Subject: [PATCH 412/719] include the mkl fix only test=develop --- CMakeLists.txt | 4 +- cmake/cuda.cmake | 3 - cmake/cudnn.cmake | 1 - cmake/operators.cmake | 2 +- cmake/simd.cmake | 73 ++++++++++--------- paddle/fluid/framework/CMakeLists.txt | 3 +- paddle/fluid/framework/mixed_vector.h | 10 +-- paddle/fluid/framework/op_registry.h | 3 +- .../fluid/memory/detail/system_allocator.cc | 1 + paddle/fluid/operators/CMakeLists.txt | 7 +- paddle/fluid/operators/cum_op.h | 2 - .../elementwise/elementwise_mul_mkldnn_op.cc | 3 - .../operators/math/detail/lstm_cpu_kernel.h | 6 -- paddle/fluid/operators/math/jit_gen.h | 3 - paddle/fluid/platform/dynload/CMakeLists.txt | 2 + paddle/fluid/platform/dynload/cudnn.cc | 4 - .../fluid/platform/dynload/dynamic_loader.cc | 16 ---- python/setup.py.in | 9 +-- 18 files changed, 61 insertions(+), 91 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index efdb451f65..aa9446a694 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,10 +203,10 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream -include(external/warpctc) # download, build, install warpctc if (NOT WIN32) -# there is no official support of nccl, cupti in windows +# there is no official support of warpctc, nccl, cupti in windows +include(external/warpctc) # download, build, install warpctc include(cupti) include(external/gzstream) endif (NOT WIN32) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 5be7be6413..414e92eb27 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -139,12 +139,10 @@ endfunction() message(STATUS "CUDA detected: " ${CUDA_VERSION}) if (${CUDA_VERSION} LESS 7.0) set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) - add_definitions("-DPADDLE_CUDA_BINVER=\"60\"") elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") - add_definitions("-DPADDLE_CUDA_BINVER=\"70\"") elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") @@ -152,7 +150,6 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x # CUDA 8 may complain that sm_20 is no longer supported. Suppress the # warning for now. list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") - add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") endif() include_directories(${CUDA_INCLUDE_DIRS}) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 96a9917e76..09bec347db 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -89,7 +89,6 @@ if(CUDNN_FOUND) if(NOT CUDNN_MAJOR_VERSION) set(CUDNN_VERSION "???") else() - add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"") math(EXPR CUDNN_VERSION "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 70d159b4f3..2ced43f9e6 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,7 +84,7 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 566dc75fda..86096d4fea 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,43 +57,46 @@ int main() return 0; }" SSE3_FOUND) -# Check AVX -set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) -set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; -}" AVX_FOUND) +# disable AVX by default on windows +if(NOT WIN32) + # Check AVX + set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) + set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; + }" AVX_FOUND) -# Check AVX 2 -set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) -set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; -}" AVX2_FOUND) + # Check AVX 2 + set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) + set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; + }" AVX2_FOUND) -# Check AVX512F -set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) -set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; -}" AVX512F_FOUND) + # Check AVX512F + set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) + set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; + }" AVX512F_FOUND) +endif(NOT WIN32) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 90083f690f..225dfb3e70 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -15,7 +15,8 @@ function(windows_symbolic TARGET) file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) add_custom_command(OUTPUT ${final_path}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cc" "${final_path}/.${src}.cu" COMMENT "create hidden file of ${src}.cu") add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index c3a044d22c..6940250c3f 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -215,8 +215,8 @@ class Vector { auto stream = dev_ctx->stream(); void *src = gpu_->ptr(); void *dst = cpu_.data(); - paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, - gpu_->size(), stream); + memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_->size(), stream); dev_ctx->Wait(); } @@ -261,8 +261,8 @@ class Vector { auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); - paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, - gpu_->size(), stream); + memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_->size(), stream); } void ImmutableCPU() const { @@ -284,7 +284,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable paddle::memory::AllocationPtr gpu_; + mutable memory::AllocationPtr gpu_; mutable int flag_; mutable std::mutex mtx_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 2c1648c81f..6d39bb3c52 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,8 +23,7 @@ limitations under the License. */ #include #include -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "glog/logging.h" // For VLOG() +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 307c348822..3e8fb83e9d 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -17,6 +17,7 @@ limitations under the License. */ #ifdef _WIN32 #include +#include // VirtualLock/VirtualUnlock #else #include // for mlock and munlock #endif diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 95ad67e33e..257bfc0a3f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -44,8 +44,9 @@ endif() register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) + # warpctc_op needs cudnn 7 above -if (WITH_GPU) +if (WITH_GPU AND NOT WIN32) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() @@ -63,7 +64,9 @@ endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) +if (NOT WIN32) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) +endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h index 7c0fda4169..999fdcff90 100644 --- a/paddle/fluid/operators/cum_op.h +++ b/paddle/fluid/operators/cum_op.h @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index bf9aef9135..c600d1e3d7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -19,9 +19,6 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/operators/math/jit_kernel.h" -#if defined(_WIN32) && defined(_WINSOCKAPI_) -#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ -#endif #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index 2e3779ff08..ccbd05c82a 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -17,12 +17,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" -#if defined(_WIN32) -#if defined(__AVX2__) || defined(__AVX__) -inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } -#endif -#endif - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/jit_gen.h b/paddle/fluid/operators/math/jit_gen.h index 2bc740e598..6abf3434cc 100644 --- a/paddle/fluid/operators/math/jit_gen.h +++ b/paddle/fluid/operators/math/jit_gen.h @@ -18,9 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/platform/macros.h" -#if defined(_WIN32) && defined(_WINSOCKAPI_) -#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ -#endif #define XBYAK_USE_MMAP_ALLOCATOR #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 07159d4a12..5939c500c9 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -16,7 +16,9 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) +if (NOT WIN32) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +endif(NOT WIN32) if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index 91d9a1ef01..f3cd3b2bbe 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -38,10 +38,6 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); #endif -#ifdef CUDNN_DNN_ROUTINE_EACH_R6 -CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); -#endif - #ifdef CUDNN_DNN_ROUTINE_EACH_R7 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); #endif diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 15d5168366..cc5cda6106 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -53,12 +53,6 @@ namespace platform { namespace dynload { static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; -#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) -static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll"; -static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll"; -static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll"; -#endif - static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -171,8 +165,6 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, void* GetCublasDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif @@ -181,8 +173,6 @@ void* GetCublasDsoHandle() { void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); #endif @@ -203,8 +193,6 @@ void* GetCUPTIDsoHandle() { void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif @@ -213,8 +201,6 @@ void* GetCurandDsoHandle() { void* GetWarpCTCDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so"); #endif @@ -239,8 +225,6 @@ void* GetTensorRtDsoHandle() { void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); #endif diff --git a/python/setup.py.in b/python/setup.py.in index f4613dd72d..ff3aca5714 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -158,11 +158,10 @@ if '${WITH_FLUID_ONLY}'== 'OFF': # put all thirdparty libraries in paddle.libs libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' - -package_data['paddle.libs']= [] -package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name] -shutil.copy('${WARPCTC_LIBRARIES}', libs_path) - +if os.name != 'nt': + package_data['paddle.libs']= [] + package_data['paddle.libs']=['libwarpctc' + ext_name] + shutil.copy('${WARPCTC_LIBRARIES}', libs_path) if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_LIB}', libs_path) shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) From 41790f13662a8a86fe5b6f4e3cee7a35703230a8 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 18 Dec 2018 14:04:40 +0800 Subject: [PATCH 413/719] add ut about nce --- .../unittests/test_nce_remote_table_op.py | 152 ++++-------------- 1 file changed, 33 insertions(+), 119 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py index f08b270d89..e87545cb9c 100644 --- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -88,158 +88,73 @@ class TestListenAndServOp(unittest.TestCase): port = int(f.read().strip()) return port - def _run_nce_op_one_pserver(self, place, port): + def _run_nce_op_two_pserver(self, place, port0, port1): scope = fluid.core.Scope() program = Program() with fluid.scope_guard(scope): with program_guard(program, startup_program=Program()): - x = scope.var('X').get_tensor() + x = scope.var('Input').get_tensor() x_array = np.random.random((4, 8)).astype("float32") * 2 x.set(x_array, place) # create and initialize Param Variable - param = scope.var('W').get_tensor() + param = scope.var('Weight').get_tensor() param_array = np.zeros((5, 8)).astype("float32") * 2 param.set(param_array, place) - path_table = scope.var('PathTable').get_tensor() - path_table_array = np.array( - [(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1), - (0, 2, -1, -1, -1)]).astype( - "int64" - ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) - path_table.set(path_table_array, place) - - path_code = scope.var('PathCode').get_tensor() - path_code_array = np.array( - [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), - (0, 1, -1, -1, -1)]).astype("int64") #np.array to store - path_code.set(path_code_array, place) - - label = scope.var('Label').get_tensor() - label_array = np.array([0, 1, 4, 5]) - label.set(label_array, place) - bias = scope.var('Bias').get_tensor() bias_array = np.random.random((5, 1)).astype("float32") bias.set(bias_array, place) - out = scope.var('Out').get_tensor() - - pre_out = scope.var('PreOut').get_tensor - - w_out = scope.var('W_Out').get_tensor() - w_out.set(param_array, place) - - emaps = ['127.0.0.1:' + str(port)] - table_names = ['table'] - height_sections = [2] - - # create and run sgd operator - hsigmoid_op = Operator( - "hierarchical_sigmoid", - X='X', - W='W', - PathTable='PathTable', - PathCode='PathCode', - Label='Label', - Bias='Bias', - Out='Out', - PreOut='PreOut', - W_Out='W_Out', - remote_prefetch=True, - epmap=emaps, - table_names=table_names, - height_sections=height_sections) - - hsigmoid_op.run(scope, place) - - # get and compare result - result_array = np.array(w_out) - self.assertEqual(list(result_array.shape), [5, 8]) - correct = None - for i in range(5): - if i != 3: - correct = np.full((1, 8), i + 1).astype("float32") - self.assertTrue((result_array[i] == correct).all()) - else: - correct = np.full((1, 8), 0).astype("float32") - self.assertTrue((result_array[i] == correct).all()) - - def _run_nce_op_two_pserver(self, place, port0, port1): - scope = fluid.core.Scope() - program = Program() - with fluid.scope_guard(scope): - with program_guard(program, startup_program=Program()): - x = scope.var('X').get_tensor() - x_array = np.random.random((4, 8)).astype("float32") * 2 - x.set(x_array, place) - # create and initialize Param Variable - param = scope.var('W').get_tensor() - param_array = np.zeros((5, 8)).astype("float32") * 2 - param.set(param_array, place) - - path_table = scope.var('PathTable').get_tensor() - path_table_array = np.array( - [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), - (0, 2, -1, -1, -1)]).astype( - "int64" - ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) - path_table.set(path_table_array, place) - - path_code = scope.var('PathCode').get_tensor() - path_code_array = np.array( - [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), - (0, 1, -1, -1, -1)]).astype("int64") #np.array to store - path_code.set(path_code_array, place) + sample_w = scope.var('SampleWeight').get_tensor() + sample_weight = np.random.random((4, 1)).astype("float32") + sample_w.set(sample_weight, place) label = scope.var('Label').get_tensor() label_array = np.array([0, 1, 4, 5]) label.set(label_array, place) - bias = scope.var('Bias').get_tensor() - bias_array = np.random.random((5, 1)).astype("float32") - bias.set(bias_array, place) + cost = scope.var('Cost').get_tensor() + cost_w = np.zeros((4, 1)).astype("float32") + cost.set(cost_w, place) - out = scope.var('Out').get_tensor() + sample_l = scope.var('SampleLogits').get_tensor() + sample_l_w = np.zeros((4, 3)).astype("float32") + sample_l.set(sample_l_w, place) - pre_out = scope.var('PreOut').get_tensor - - w_out = scope.var('W_Out').get_tensor() - w_out.set(param_array, place) + sample_la = scope.var('SampleLabels').get_tensor() + sample_la_w = np.zeros((4, 3)).astype("float32") + sample_la.set(sample_la_w, place) emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] table_names = ['table', 'table'] height_sections = [2, 3] - # create and run sgd operator - hsigmoid_op = Operator( - "hierarchical_sigmoid", - X='X', - W='W', - PathTable='PathTable', - PathCode='PathCode', + # create and run nce operator + nce_op = Operator( + "nce", + Input='Input', + Weight='Weight', Label='Label', Bias='Bias', - Out='Out', - PreOut='PreOut', - W_Out='W_Out', + Cost='Cost', + SampleLogits='SampleLogits', + SampleLabels='SampleLabels', + num_total_classes=5, + num_neg_samples=2, + sampler=0, + seed=1, + is_sparse=True, remote_prefetch=True, epmap=emaps, table_names=table_names, height_sections=height_sections) - hsigmoid_op.run(scope, place) + + nce_op.run(scope, place) # get and compare result - result_array = np.array(w_out) - self.assertEqual(list(result_array.shape), [5, 8]) - correct = None - for i in range(5): - if i < 2: - correct = np.full((1, 8), i + 1).astype("float32") - self.assertTrue((result_array[i] == correct).all()) - else: - correct = np.full((1, 8), i + 9).astype("float32") - self.assertTrue((result_array[i] == correct).all()) + o_cost = np.array(cost_w) + o_logits = np.array(sample_l) + o_labels = np.array(sample_la) def test_nce_op_remote(self): os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" @@ -257,7 +172,6 @@ class TestListenAndServOp(unittest.TestCase): places.append(core.CUDAPlace(0)) for place in places: - self._run_nce_op_one_pserver(place, port0) self._run_nce_op_two_pserver(place, port0, port1) # raise SIGTERM to pserver From a3fa3f85d7bd4fb948b0401d77d5c60498e5a329 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 18 Dec 2018 15:04:26 +0800 Subject: [PATCH 414/719] Polish code test=develop --- paddle/fluid/platform/enforce.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 3c03a90279..d1dd09f206 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -260,12 +260,12 @@ inline void throw_on_error(T e) { #define PADDLE_JUDGE -#define __PADDLE_UNARY_COMPARE(COND, ...) \ - do { \ - auto cond = COND; \ - if (UNLIKELY(::paddle::platform::is_error(cond))) { \ - ::paddle::platform::throw_on_error(cond, ##__VA_ARGS__); \ - } \ +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); \ + } \ } while (0) #ifndef REPLACE_ENFORCE_GLOG From 69642000dc3a83b3dad5a33052da1eff1f450b6d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 18 Dec 2018 15:09:01 +0800 Subject: [PATCH 415/719] Hide KeyHasher test=develop --- paddle/fluid/framework/scope.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 9a715ac9b9..797d110159 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -38,14 +38,6 @@ bool IsFastEagerDeletionModeEnabled(); class Scope; -namespace inner { -struct KeyHasher { - std::size_t operator()(const std::string& key) const { - return XXH32(key.c_str(), key.size(), 1); - } -}; -} // namespace inner - /** * @brief Scope that manage all variables. * @@ -110,8 +102,13 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable std::unordered_map, - inner::KeyHasher> + struct KeyHasher { + std::size_t operator()(const std::string& key) const { + return XXH32(key.c_str(), key.size(), 1); + } + }; + + mutable std::unordered_map, KeyHasher> vars_; private: From 70981f5d799b5ab1593743b6ec88af6c40698a3b Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 15:30:23 +0800 Subject: [PATCH 416/719] clean test=develop --- paddle/fluid/framework/operator.cc | 36 ++++++++++++------------------ paddle/fluid/framework/operator.h | 16 ++++++------- paddle/fluid/operators/prelu_op.cc | 2 +- 3 files changed, 23 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 807667e684..7d5a6198a0 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -143,14 +143,12 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, for (auto& var_name_item : innames) { std::vector& input_vars = inputs[var_name_item.first]; for (auto& var_name : var_name_item.second) { - LOG(ERROR) << "first in " << var_name_item.first << ":" << var_name; input_vars.push_back(scope.FindVar(var_name)); } } for (auto& var_name_item : outnames) { std::vector& output_vars = outputs[var_name_item.first]; for (auto& var_name : var_name_item.second) { - LOG(ERROR) << "first out " << var_name_item.first << ":" << var_name; output_vars.push_back(scope.FindVar(var_name)); } } @@ -441,22 +439,13 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const { return it->second.empty() ? nullptr : it->second[0]; } -Variable* ExecutionContext::OutputVar(const std::string& name) const { - auto opt = op_.Output(name); - return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); -} - -const Variable* ExecutionContext::FastInputVar(const std::string& name) const { - auto it = ctx_.inputs.find(name); - if (it == ctx_.inputs.end()) return nullptr; - - PADDLE_ENFORCE_LE(it->second.size(), 1UL, - "Operator %s's input %s should contain only one variable.", - op_.Type(), name); - return it->second.empty() ? nullptr : it->second[0]; +const Variable* ExecutionContext::LegacyInputVar( + const std::string& name) const { + auto ipt = op_.Input(name); + return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); } -Variable* ExecutionContext::FastOutputVar(const std::string& name) const { +Variable* ExecutionContext::OutputVar(const std::string& name) const { auto it = ctx_.outputs.find(name); if (it == ctx_.outputs.end()) return nullptr; @@ -466,15 +455,20 @@ Variable* ExecutionContext::FastOutputVar(const std::string& name) const { return it->second.empty() ? nullptr : it->second[0]; } +Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const { + auto opt = op_.Output(name); + return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const { return Input(name); } template <> -const Tensor* ExecutionContext::FastInput( +const Tensor* ExecutionContext::LegacyInput( const std::string& name) const { - return FastInput(name); + return LegacyInput(name); } template <> @@ -502,8 +496,8 @@ Tensor* ExecutionContext::Output(const std::string& name) const { } template <> -Tensor* ExecutionContext::FastOutput(const std::string& name) const { - return FastOutput(name); +Tensor* ExecutionContext::LegacyOutput(const std::string& name) const { + return LegacyOutput(name); } template <> @@ -870,7 +864,6 @@ Scope* OperatorWithKernel::PrepareData( auto& var_name = var_name_item.second[i]; auto* var = scope.FindVar(var_name); input_vars[i] = var; - LOG(ERROR) << "second in " << var_name_item.first << ":" << var_name; // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { @@ -931,7 +924,6 @@ Scope* OperatorWithKernel::PrepareData( for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; output_vars[i] = scope.FindVar(var_name); - LOG(ERROR) << "second out " << var_name_item.first << ":" << var_name; } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0aad91dbee..39190d07b4 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -233,20 +233,20 @@ class ExecutionContext { } template - const T* FastInput(const std::string& name) const { - auto* var = FastInputVar(name); + const T* LegacyInput(const std::string& name) const { + auto* var = LegacyInputVar(name); return var == nullptr ? nullptr : &var->Get(); } template - T* FastOutput(const std::string& name) const { - auto var = FastOutputVar(name); + T* LegacyOutput(const std::string& name) const { + auto var = LegacyOutputVar(name); return var == nullptr ? nullptr : var->GetMutable(); } - const Variable* FastInputVar(const std::string& name) const; + const Variable* LegacyInputVar(const std::string& name) const; - Variable* FastOutputVar(const std::string& name) const; + Variable* LegacyOutputVar(const std::string& name) const; template const std::vector MultiInput(const std::string& name) const { @@ -314,7 +314,7 @@ template <> const Tensor* ExecutionContext::Input(const std::string& name) const; template <> -const Tensor* ExecutionContext::FastInput( +const Tensor* ExecutionContext::LegacyInput( const std::string& name) const; template <> @@ -325,7 +325,7 @@ template <> Tensor* ExecutionContext::Output(const std::string& name) const; template <> -Tensor* ExecutionContext::FastOutput(const std::string& name) const; +Tensor* ExecutionContext::LegacyOutput(const std::string& name) const; template <> std::vector ExecutionContext::MultiOutput( diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index b6155ed3dd..62c55c4f55 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -56,7 +56,7 @@ class PReluOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(ctx.FastInput("X")->type(), + return framework::OpKernelType(ctx.Input("X")->type(), ctx.device_context()); } }; From fa135bbf525fba34e16f9c1e80a35382c5b1c983 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 15:38:33 +0800 Subject: [PATCH 417/719] Fix the mkl build script on windows test=develop --- cmake/external/mklml.cmake | 58 +++++++++---------- .../elementwise/elementwise_mul_mkldnn_op.cc | 3 + .../operators/math/detail/lstm_cpu_kernel.h | 6 ++ paddle/fluid/operators/math/jit_gen.h | 3 + python/CMakeLists.txt | 17 +----- python/setup.py.in | 6 ++ 6 files changed, 48 insertions(+), 45 deletions(-) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 3da552e319..505f8b3834 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -44,40 +44,36 @@ else() endif() SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") -if(WIN32) - MESSAGE(WARNING - "Please download the MKLML and and put it at " ${THIRD_PARTY_PATH}/install/mklml) -else() - SET(MKLML_PROJECT "extern_mklml") - IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) - SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) +IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) + MESSAGE(STATUS "use pre defined download url") + if(WIN32) + SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE) + SET(MKLML_URL "https://github.com/intel/mkl-dnn/releases/download/v0.17/${MKLML_VER}.zip" CACHE STRING "" FORCE) + else() + SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) ENDIF() - MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") - SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") - SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") - - FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(MKLML)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n" - " DESTINATION ${MKLML_DST_DIR})\n") - - ExternalProject_Add( - ${MKLML_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${MKLML_SOURCE_DIR} - DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz - && tar zxf ${MKLML_VER}.tgz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} - ) endif() +SET(MKLML_PROJECT "extern_mklml") +MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") +SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") +SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") + +ExternalProject_Add( + ${MKLML_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${MKLML_SOURCE_DIR} + URL ${MKLML_URL} + DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/include ${MKLML_INC_DIR} && + ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/lib ${MKLML_LIB_DIR} +) INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index c600d1e3d7..bf9aef9135 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/operators/math/jit_kernel.h" +#if defined(_WIN32) && defined(_WINSOCKAPI_) +#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ +#endif #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index ccbd05c82a..2e3779ff08 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -17,6 +17,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" +#if defined(_WIN32) +#if defined(__AVX2__) || defined(__AVX__) +inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } +#endif +#endif + namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/jit_gen.h b/paddle/fluid/operators/math/jit_gen.h index 6abf3434cc..2bc740e598 100644 --- a/paddle/fluid/operators/math/jit_gen.h +++ b/paddle/fluid/operators/math/jit_gen.h @@ -18,6 +18,9 @@ limitations under the License. */ #include #include "paddle/fluid/platform/macros.h" +#if defined(_WIN32) && defined(_WINSOCKAPI_) +#define _WINSOCK2API_ /* Prevent inclusion of winsock2.h */ +#endif #define XBYAK_USE_MMAP_ALLOCATOR #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 078d543ba2..72c0d03e52 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -49,23 +49,12 @@ IF(WIN32) # Python would use the .pyd by default under Windows series platform set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/) set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd) - if(NOT WITH_MKLDNN) - get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY) - add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR} - DEPENDS paddle_pybind) - else(NOT WITH_MKLDNN) - add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - DEPENDS paddle_pybind) - endif(NOT WITH_MKLDNN) ELSE() set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) - add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - DEPENDS paddle_pybind) ENDIF() +add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + DEPENDS paddle_pybind) add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) IF(WIN32) diff --git a/python/setup.py.in b/python/setup.py.in index ff3aca5714..8973d883e4 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -166,6 +166,12 @@ if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_LIB}', libs_path) shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name] +else: + # copy the openblas.dll + if os.name == 'nt': + shutil.copy('${CBLAS_LIBRARIES}', libs_path) + package_data['paddle.libs']+=['openblas' + ext_name] + if '${WITH_MKLDNN}' == 'ON': if '${CMAKE_BUILD_TYPE}' == 'Release': if os.name != 'nt': From f897bd16c0e4deb683075e137e7bfe5890488205 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 15:40:23 +0800 Subject: [PATCH 418/719] clean test=develop --- paddle/fluid/framework/operator.cc | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 7d5a6198a0..8c83748668 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -812,6 +812,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); this->InferShape(&infer_shape_ctx); + // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext + // not Scope. Imperative mode only pass inputs and get outputs. kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx)); if (!transfered_inplace_vars.empty()) { @@ -919,13 +921,6 @@ Scope* OperatorWithKernel::PrepareData( SetTensorToVariable(*var, out, trans_var); } } - for (auto& var_name_item : Outputs()) { - std::vector& output_vars = ctx->outputs[var_name_item.first]; - for (size_t i = 0; i < var_name_item.second.size(); ++i) { - auto& var_name = var_name_item.second[i]; - output_vars[i] = scope.FindVar(var_name); - } - } return new_scope; } From 17fb3253c30f4ebeb8f6058a6e770344e52a0fad Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 15:41:54 +0800 Subject: [PATCH 419/719] keep the mkl win's version inconsistent with Linux's test=develop --- cmake/external/mklml.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 505f8b3834..96676f0be8 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -47,8 +47,8 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) MESSAGE(STATUS "use pre defined download url") if(WIN32) - SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE) - SET(MKLML_URL "https://github.com/intel/mkl-dnn/releases/download/v0.17/${MKLML_VER}.zip" CACHE STRING "" FORCE) + SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE) + SET(MKLML_URL "https://github.com/intel/mkl-dnn/releases/download/v0.16/${MKLML_VER}.zip" CACHE STRING "" FORCE) else() SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) From aed3872c1c5c0c9957f9567071f63a89c1ace455 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 18 Dec 2018 16:17:20 +0800 Subject: [PATCH 420/719] add int cast, test=develop --- python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py index e87545cb9c..5e440bf35d 100644 --- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -141,6 +141,7 @@ class TestListenAndServOp(unittest.TestCase): SampleLabels='SampleLabels', num_total_classes=5, num_neg_samples=2, + custom_neg_classes=list(range(2)), sampler=0, seed=1, is_sparse=True, From dc8847af876e678e23d0c0125bedd5cfae47ec9b Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 18 Dec 2018 08:36:44 +0000 Subject: [PATCH 421/719] add examples and comments test=develop --- paddle/fluid/operators/py_func_op.cc | 77 ++++++++++++++++++---------- python/paddle/fluid/layers/nn.py | 41 +++++++++++++++ 2 files changed, 92 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 1bee3d9351..a2895b5404 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -43,9 +43,12 @@ static py::object *GetPythonCallableObject(size_t i) { return &g_py_callables[i]; } -static std::string PythonObjectToString(const py::object &py_callable) { +static std::string PythonFuncDebugString(const py::object &py_callable) { py::gil_scoped_acquire guard; - return py::str(*py_callable); + std::string wrapper_func_str = py::str(py_callable); + auto inner_func = py_callable.attr("_func"); + std::string inner_func_str = py::str(inner_func); + return inner_func_str + " wrapped by " + wrapper_func_str; } static void CallPythonFunc(py::object *callable, @@ -93,15 +96,29 @@ class PyFuncOpShapeInference : public framework::InferShapeBase { void operator()(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(!ctx->IsRuntime(), "Infer shape cannot be called in runtime."); + + /** + * X or Out can be empty, so that py_func can be more flexible + * to support Python functions with no input or no output + */ PADDLE_ENFORCE(ctx->HasInputs("X") || ctx->HasOutputs("Out"), "Input(X) or Output(Out) must exist"); PADDLE_ENFORCE_GE(ctx->Attrs().Get(kForwardPythonCallableId), 0, "Function id cannot be less than 0"); - // Transverse all outputs - // If name of any output ends with @GRAD, - // set its shape, dtype, lod_level, type to be the same as - // the correponding forward variable + /** + * Traverse all outputs, check if name of any output ends with @GRAD. + * If found, set its shape, dtype, lod_level, type to be the same as + * the corresponding forward variable + * + * Why not get input dims from InferShapeContext? + * Because some variables in forward inputs/outputs may not be needed + * in backward. Those variables are not inside InferShapeContext. + * + * InferShape would be only called in compile time. During runtime, + * the shapes of outputs should be guaranteed by user-defined Python + * functions. + */ auto *op = boost::get(ctx->GetOp()); auto *block = op->Block(); const std::string kGradVarSuffix = framework::kGradVarSuffix; @@ -113,7 +130,7 @@ class PyFuncOpShapeInference : public framework::InferShapeBase { } auto out_name = out_var_desc->Name(); if (out_name == framework::kEmptyVarName || - out_name.size() <= kGradVarSuffix.size()) { + out_name.size() < kGradVarSuffix.size()) { continue; } @@ -152,7 +169,28 @@ class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker { } }; +/** + * There are several benefits when backward op of py_func op is + * still py_func op. + * + * - Less codes are needed, since codes of backward is almost + * the same as forward. + * + * - To support high order derivative, so that py_func is + * infinite-order differentiable + */ class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase { + private: + static std::string DebugString(const std::vector &strs) { + if (strs.empty()) return ""; + std::string ret = strs[0]; + for (size_t i = 1; i < strs.size(); ++i) { + ret += " "; + ret += strs[i]; + } + return ret; + } + public: using framework::GradOpDescMakerBase::GradOpDescMakerBase; @@ -207,21 +245,8 @@ class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase { // But in Python side, if IG is not needed, users can just return None auto bwd_outs = InputGrad("X", false); - if (VLOG_IS_ON(10)) { - std::string in_str = "PyFunc Grad Input: "; - for (auto &in : bwd_ins) { - in_str += in; - in_str += " "; - } - VLOG(10) << in_str; - - std::string out_str = "PyFunc Grad Output: "; - for (auto &out : bwd_outs) { - out_str += out; - out_str += " "; - } - VLOG(10) << out_str; - } + VLOG(10) << "PyFunc Grad Input: " << DebugString(bwd_ins); + VLOG(10) << "PyFunc Grad Output: " << DebugString(bwd_outs); grad_op->SetInput("X", bwd_ins); grad_op->SetOutput("Out", bwd_outs); @@ -245,6 +270,7 @@ class PyFuncOp : public framework::OperatorBase { std::vector inputs(in_arg_names.size()); for (size_t i = 0; i < in_arg_names.size(); ++i) { auto in_var = scope.FindVar(in_arg_names[i]); + // When py_func op is called in backward, in_var may be null if (in_var == nullptr) { continue; } @@ -263,15 +289,14 @@ class PyFuncOp : public framework::OperatorBase { std::vector outputs(out_arg_names.size()); for (size_t i = 0; i < out_arg_names.size(); ++i) { auto *out_var = scope.FindVar(out_arg_names[i]); - auto *out_tensor = + outputs[i] = out_var ? out_var->GetMutable() : nullptr; - outputs[i] = out_tensor; } auto callable_id = static_cast(Attr(kForwardPythonCallableId)); auto *py_callable = GetPythonCallableObject(callable_id); - VLOG(10) << "Call py_func_op with id " << callable_id << ": " - << PythonObjectToString(*py_callable); + VLOG(10) << "Call Python function with id " << callable_id << ": " + << PythonFuncDebugString(*py_callable); CallPythonFunc(py_callable, inputs, &outputs); } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3cd0a2887e..ab3fb1e97e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9243,6 +9243,47 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): Returns: out (Variable|list(Variable)|tuple(Variable)): input :code:`out` + + Examples: + + >>> import paddle.fluid as fluid + >>> import six + >>> + >>> def create_tmp_var(name, dtype, shape): + >>> return fluid.default_main_program().current_block().create_var( + >>> name=name, dtype=dtype, shape=shape) + >>> + >>> # tanh activation has been provided by Paddle C++ op + >>> # Here, we only use tanh to be an example to show the usage + >>> # of py_func + >>> def tanh(x): + >>> return np.tanh(x) + >>> + >>> # forward input x is skipped + >>> def tanh_grad(y, dy): + >>> return np.array(dy) * (1 - np.square(np.array(y))) + >>> + >>> def debug_func(x): + >>> print(x) + >>> + >>> def simple_net(img, label): + >>> hidden = img + >>> for idx in six.moves.range(4): + >>> hidden = fluid.layers.fc(hidden, size=200) + >>> new_hidden = create_tmp_var(name='hidden_{}'.format(idx), + >>> dtype=hidden.dtype, shape=hidden.shape) + >>> + >>> # user-defined layers with forward and backward + >>> hidden = fluid.layers.py_func(func=tanh, x=hidden, + >>> out=new_hidden, backward_func=tanh_grad, + >>> skip_vars_in_backward_input=hidden) + >>> + >>> # user-defined debug layers to print variables + >>> fluid.layers.py_func(func=debug_func, x=hidden, out=None) + >>> + >>> prediction = fluid.layers.fc(hidden, size=10, act='softmax') + >>> loss = fluid.layers.cross_entropy(input=prediction, label=label) + >>> return fluid.layers.mean(loss) """ helper = LayerHelper('py_func', **locals()) if x is None: From b5fa916413aebd0d35af8b3ae04d4d555ecb4629 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 18 Dec 2018 08:38:52 +0000 Subject: [PATCH 422/719] fix bug after merge reyoung optimization, test=develop --- .../fluid/operators/hierarchical_sigmoid_op.h | 1 - .../fluid/operators/math/matrix_bit_code.cc | 35 ------------------- paddle/fluid/operators/math/matrix_bit_code.h | 29 +++++++-------- 3 files changed, 15 insertions(+), 50 deletions(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 802b444d7c..b47bf49ecb 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -71,7 +71,6 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { // server auto height_sections = ctx.Attr>("height_sections"); auto table_names = ctx.Attr>("table_names"); - VLOG(3) << "path type is " << path->type().name(); std::vector real_rows = PathToRows(*path); framework::Scope& local_scope = ctx.scope().NewScope(); auto* ids = local_scope.Var("Ids@Prefetch"); diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index d55e832cc2..d6f51c6e5c 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -84,41 +84,6 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, code_table_.apply_visitor(func); } -template -struct MatrixBitCodeFunctorSelectedRowsAddGrad - : public boost::static_visitor { - const framework::Tensor &tmat_; - framework::SelectedRows *vec_; - - MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat, - framework::SelectedRows *vec) - : tmat_(tmat), vec_(vec) {} - - template - void operator()(const CodeTable &code_table) { - size_t batch_size = tmat_.dims()[0]; - size_t width = tmat_.dims()[1]; - auto *vec_data = vec_->mutable_value()->template data(); - auto *tmat_data = tmat_.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table.get_code(i); - int code_length = code.get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); - int64_t row_index = vec_->GetIndexFromId(static_cast(index)); - vec_data[row_index] += tmat_data[i * width + j]; - } - } - } -}; - -template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, - framework::SelectedRows *vec) { - MatrixBitCodeFunctorSelectedRowsAddGrad func(tmat, vec); - code_table_.apply_visitor(func); -} - template struct MatrixBitCodeFunctorSum : public boost::static_visitor { const framework::Tensor &tmat_; diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 7a084a41e5..c399cb5d44 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -124,11 +124,12 @@ class SimpleCode { template class CustomCode { public: - CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, - const int64_t* ids, int index) { - seq_len_ = ptable.dims()[1]; - ptable_data_ = ptable.data() + seq_len_ * index; - pcode_data_ = pcode.data() + seq_len_ * index; + CustomCode(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids, + int index) { + seq_len_ = path_table.dims()[1]; + path_table_data_ = path_table.data() + seq_len_ * index; + path_code_data_ = path_code.data() + seq_len_ * index; } /** * Here the id of root should be 1 rather than 0, thus the encoding of class c @@ -139,25 +140,25 @@ class CustomCode { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - size_t calc_index(int bit) const { return ptable_data_[bit]; } - bool calc_bit(int bit) const { return pcode_data_[bit]; } + size_t calc_index(int bit) const { return path_table_data_[bit]; } + bool calc_bit(int bit) const { return path_code_data_[bit]; } // NOTE: this function is not thread-safe. int get_length() const { if (length_ < 0) { auto len = seq_len_; - length_ = - static_cast(std::find_if(ptable_data_, ptable_data_ + len, - [](const T& val) { return val < 0; }) - - ptable_data_); + length_ = static_cast( + std::find_if(path_table_data_, path_table_data_ + len, + [](const T& val) { return val < 0; }) - + path_table_data_); } return length_; } private: int64_t seq_len_; - const T* ptable_data_; - const T* pcode_data_; + const T* path_table_data_; + const T* path_code_data_; mutable int length_{-1}; }; @@ -214,7 +215,7 @@ class MatrixBitCodeFunctor { const framework::Tensor& path_code, const int64_t* ids) : num_classes_(static_cast(path_table.dims()[1])), ids_(ids), - code_table_(CustomCodeTable(ptable, pcode, ids)) {} + code_table_(CustomCodeTable(path_table, path_code, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ From fdab7f749e68c86cc732c7570bbe327d630f5dc9 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 17:04:48 +0800 Subject: [PATCH 423/719] fix the setup script issue test=develop --- python/setup.py.in | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/setup.py.in b/python/setup.py.in index 8973d883e4..bfbaa1d015 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -138,8 +138,6 @@ if '${WITH_FLUID_ONLY}'== 'OFF': '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]} -if os.name == 'nt': - package_data['paddle.fluid'] += ['openblas' + ext_name] if '${WITH_FLUID_ONLY}'== 'OFF': package_data['paddle.v2.master']=['libpaddle_master' + ext_name] @@ -162,15 +160,16 @@ if os.name != 'nt': package_data['paddle.libs']= [] package_data['paddle.libs']=['libwarpctc' + ext_name] shutil.copy('${WARPCTC_LIBRARIES}', libs_path) + if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_LIB}', libs_path) shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name] else: - # copy the openblas.dll if os.name == 'nt': - shutil.copy('${CBLAS_LIBRARIES}', libs_path) - package_data['paddle.libs']+=['openblas' + ext_name] + # copy the openblas.dll + shutil.copy(os.path.dirname('${CBLAS_LIBRARIES}') + '/openblas' + ext_name, libs_path) + package_data['paddle.fluid'] += ['openblas' + ext_name] if '${WITH_MKLDNN}' == 'ON': if '${CMAKE_BUILD_TYPE}' == 'Release': From a500dfa579907d8046e40a15e67558c350498976 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 18 Dec 2018 06:27:32 +0000 Subject: [PATCH 424/719] rewrite ddim test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/array.h | 74 ++- paddle/fluid/framework/ddim.cc | 303 ++++-------- paddle/fluid/framework/ddim.h | 148 ++++-- paddle/fluid/framework/dim.h | 441 ++++++------------ paddle/fluid/framework/dlpack_tensor.cc | 6 +- paddle/fluid/framework/dlpack_tensor.h | 2 +- paddle/fluid/framework/unroll_array_ops.h | 169 +++++++ .../fluid/operators/controlflow/logical_op.cc | 2 - paddle/fluid/operators/crop_op.h | 1 - paddle/fluid/operators/cudnn_lstm_op.cu.cc | 1 - .../fluid/operators/detail/strided_memcpy.h | 38 +- .../detection/generate_proposal_labels_op.cc | 2 - .../detection/generate_proposals_op.cc | 6 - .../detection/rpn_target_assign_op.cc | 1 - .../operators/elementwise/elementwise_op.h | 1 - paddle/fluid/operators/expand_op.h | 1 - paddle/fluid/operators/fc_op.cc | 1 - .../fused/fused_embedding_fc_lstm_op.cc | 18 +- paddle/fluid/operators/hinge_loss_op.cc | 1 - paddle/fluid/operators/log_loss_op.cc | 1 - .../fluid/operators/math/math_function_impl.h | 3 - paddle/fluid/operators/math/softmax_impl.h | 1 - .../fluid/operators/modified_huber_loss_op.cc | 1 - paddle/fluid/operators/mul_op.cc | 6 - paddle/fluid/operators/nce_op.cc | 1 - paddle/fluid/operators/norm_op.h | 1 - paddle/fluid/operators/psroi_pool_op.h | 1 - .../sequence_ops/sequence_slice_op.h | 2 - paddle/fluid/operators/strided_memcpy.h | 2 +- 30 files changed, 622 insertions(+), 615 deletions(-) create mode 100644 paddle/fluid/framework/unroll_array_ops.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6d7a69c8c9..023118d740 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -36,7 +36,7 @@ add_subdirectory(details) proto_library(framework_proto SRCS framework.proto) proto_library(async_executor_proto SRCS data_feed.proto) -cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) +cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) diff --git a/paddle/fluid/framework/array.h b/paddle/fluid/framework/array.h index be9efcd749..aa0abc22a6 100644 --- a/paddle/fluid/framework/array.h +++ b/paddle/fluid/framework/array.h @@ -15,34 +15,88 @@ #pragma once #include -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/framework/unroll_array_ops.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace framework { + template class Array { - static_assert(N > 0, "The size of array must be larger than 0"); - public: - HOSTDEVICE Array() {} + static constexpr size_t kSize = N; - HOSTDEVICE explicit Array(const T &val) { - for (size_t i = 0; i < N; ++i) data_[i] = val; + HOSTDEVICE inline Array() = default; + + template + HOSTDEVICE inline explicit Array(const T &val, Args... args) { + UnrollVarArgsAssign::Run(data_, val, args...); } - HOSTDEVICE const T *Get() const { return data_; } + HOSTDEVICE inline void Fill(const T &val) { + UnrollFillConstant::Run(data_, val); + } - HOSTDEVICE T *GetMutable() { return data_; } + HOSTDEVICE inline const T *Get() const { return data_; } - HOSTDEVICE T &operator[](size_t index) { return data_[index]; } + HOSTDEVICE inline T *GetMutable() { return data_; } - HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; } + HOSTDEVICE inline T &operator[](size_t index) { return data_[index]; } + + HOSTDEVICE inline const T &operator[](size_t index) const { + return data_[index]; + } HOSTDEVICE constexpr size_t size() const { return N; } + HOSTDEVICE inline bool operator==(const Array &other) const { + return UnrollCompare::Run(data_, other.data_); + } + + HOSTDEVICE inline bool operator!=(const Array &other) const { + return !(*this == other); + } + private: T data_[N]; }; +template +class Array { + public: + static constexpr size_t kSize = 0; + + HOSTDEVICE inline Array() = default; + + HOSTDEVICE inline void Fill(const T &val) {} + + HOSTDEVICE inline constexpr T *Get() const { return nullptr; } + + // Add constexpr to GetMutable() cause warning in MAC + HOSTDEVICE inline T *GetMutable() { return nullptr; } + + HOSTDEVICE inline T &operator[](size_t index) { +#ifndef __CUDA_ARCH__ + PADDLE_THROW("Array has no element"); +#endif + } + + HOSTDEVICE inline const T &operator[](size_t index) const { +#ifndef __CUDA_ARCH__ + PADDLE_THROW("Array has no element"); +#endif + } + + HOSTDEVICE constexpr size_t size() const { return 0; } + + HOSTDEVICE constexpr bool operator==(const Array &other) const { + return true; + } + + HOSTDEVICE constexpr bool operator!=(const Array &other) const { + return false; + } +}; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc index 05e423b8a5..3640138e18 100644 --- a/paddle/fluid/framework/ddim.cc +++ b/paddle/fluid/framework/ddim.cc @@ -18,201 +18,131 @@ limitations under the License. */ namespace paddle { namespace framework { -/// @cond HIDDEN +template +struct DDimAssignFunctor { + static_assert(std::is_integral::value, "T must be integral type"); + using result_type = void; + explicit DDimAssignFunctor(const T* in) : in_(in) {} -template -Dim make_dim(const int64_t* d) { - return Dim(*d, make_dim(d + 1)); -} + template + inline void operator()(Dim& dim) { // NOLINT + UnrollAssign::Run(in_, dim.data()); + } + + const T* in_; +}; -template <> -Dim<0> make_dim<0>(const int64_t* d) { - return Dim<0>(*d); +DDim::DDim(const int* d, int n) : rank_(n) { + this->apply_visitor(DDimAssignFunctor(d)); } -void make_ddim(DDim& ddim, const int64_t* dims, int n) { - switch (n) { - case 0: - ddim = make_dim<0>(dims); - break; - case 1: - ddim = make_dim<1>(dims); - break; - case 2: - ddim = make_dim<2>(dims); - break; - case 3: - ddim = make_dim<3>(dims); - break; - case 4: - ddim = make_dim<4>(dims); - break; - case 5: - ddim = make_dim<5>(dims); - break; - case 6: - ddim = make_dim<6>(dims); - break; - case 7: - ddim = make_dim<7>(dims); - break; - case 8: - ddim = make_dim<8>(dims); - break; - case 9: - ddim = make_dim<9>(dims); - break; - default: - PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions."); - } +DDim::DDim(const int64_t* d, int n) : rank_(n) { + this->apply_visitor(DDimAssignFunctor(d)); } -/// @endcond +template +Dim make_dim(const int64_t* d) { + Dim ret; + for (int i = 0; i < N; ++i) ret[i] = d[i]; + return ret; +} DDim make_ddim(std::initializer_list dims) { - DDim result(make_dim(0)); - make_ddim(result, dims.begin(), dims.size()); - return result; + return DDim(dims.begin(), dims.size()); } DDim make_ddim(const std::vector& dims) { - DDim result(make_dim(0)); - make_ddim(result, &dims[0], dims.size()); - return result; + return DDim(dims.data(), dims.size()); } DDim make_ddim(const std::vector& dims) { - std::vector res(dims.size()); - std::transform(dims.begin(), dims.end(), res.begin(), - [](int d) { return static_cast(d); }); - return make_ddim(res); + return DDim(dims.data(), dims.size()); } -/// @cond HIDDEN -// XXX For some reason, putting this in an anonymous namespace causes errors -class DynamicMutableIndexer : public boost::static_visitor { - public: - explicit DynamicMutableIndexer(int idx) : idx_(idx) {} - - template - int64_t& operator()(Dim& dim) const { - return dim[idx_]; - } - - private: - int idx_; -}; - -class DynamicConstIndexer : public boost::static_visitor { - public: - explicit DynamicConstIndexer(int idx) : idx_(idx) {} +struct DDimEqualityVisitor { + explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {} template - int64_t operator()(const Dim& dim) const { - return dim[idx_]; + inline bool operator()(const Dim& self) const { + return UnrollCompare::Run(self.data(), d_); } - private: - int idx_; + const int64_t* d_; }; -/// @endcond - -int64_t& DDim::operator[](int idx) { - return boost::apply_visitor(DynamicMutableIndexer(idx), var); -} - -int64_t DDim::operator[](int idx) const { - return boost::apply_visitor(DynamicConstIndexer(idx), var); +bool DDim::operator==(const DDim& d) const { + return rank_ == d.rank_ && this->apply_visitor(DDimEqualityVisitor(d.data())); } -int DDim::size() const { return arity(*this); } - -bool DDim::operator==(DDim d) const { - if (var.which() != d.getVar().which()) { - return false; - } else { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); +bool DDim::operator!=(const DDim& d) const { return !(*this == d); } - for (unsigned int i = 0; i < v1.size(); i++) { - if (v1[i] != v2[i]) { - return false; - } - } +struct DDimPlusVisitor { + explicit DDimPlusVisitor(const int64_t* d1, const int64_t* d2) + : d1_(d1), d2_(d2) {} - return true; + template + inline void operator()(Dim& self) const { + UnrollAdd::Run(d1_, d2_, self.data()); } -} - -bool DDim::operator!=(DDim d) const { return !(*this == d); } - -DDim DDim::operator+(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); - std::vector v3; - - assert(v1.size() == v2.size()); - - for (unsigned int i = 0; i < v1.size(); i++) { - v3.push_back(v1[i] + v2[i]); - } + const int64_t* d1_; + const int64_t* d2_; +}; - return make_ddim(v3); +DDim DDim::operator+(const DDim& d) const { + PADDLE_ENFORCE(rank_ == d.rank_); + DDim ret; + ret.rank_ = rank_; + ret.apply_visitor(DDimPlusVisitor(data(), d.data())); + return ret; } -DDim DDim::operator*(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); +struct DDimMulVisitor { + explicit DDimMulVisitor(const int64_t* d1, const int64_t* d2) + : d1_(d1), d2_(d2) {} - std::vector v3; - - assert(v1.size() == v2.size()); - - for (unsigned int i = 0; i < v1.size(); i++) { - v3.push_back(v1[i] * v2[i]); + template + inline void operator()(Dim& self) const { + UnrollMul::Run(d1_, d2_, self.data()); } - return make_ddim(v3); + const int64_t* d1_; + const int64_t* d2_; +}; + +DDim DDim::operator*(const DDim& d) const { + PADDLE_ENFORCE(rank_ == d.rank_); + DDim ret; + ret.rank_ = rank_; + ret.apply_visitor(DDimMulVisitor(data(), d.data())); + return ret; } int64_t get(const DDim& ddim, int idx) { return ddim[idx]; } -void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } - -/// @cond HIDDEN -struct VectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; - - explicit VectorizeVisitor(std::vector& v) : vector(v) {} - - template - void operator()(const T& t) { - vector.push_back(t.head); - this->operator()(t.tail); - } - - void operator()(const Dim<0>& t) {} -}; -/// @endcond +void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } // NOLINT std::vector vectorize(const DDim& ddim) { - std::vector result; - VectorizeVisitor visitor(result); - boost::apply_visitor(visitor, ddim); + std::vector result(DDim::kMaxRank); + for (int i = 0; i < ddim.size(); ++i) { + result[i] = ddim[i]; + } + result.resize(ddim.size()); return result; } // NOTE: framework::vectorize converts to type int64_t // which does not fit cudnn inputs. std::vector vectorize2int(const DDim& ddim) { - std::vector temp = vectorize(ddim); - std::vector result(temp.begin(), temp.end()); + std::vector result(DDim::kMaxRank); + for (int i = 0; i < ddim.size(); ++i) { + result[i] = ddim[i]; + } + result.resize(ddim.size()); return result; } -struct ProductVisitor : public boost::static_visitor { +struct ProductVisitor { template int64_t operator()(const Dim& dim) { return product(dim); @@ -220,65 +150,27 @@ struct ProductVisitor : public boost::static_visitor { }; int64_t product(const DDim& ddim) { - ProductVisitor visitor; - return boost::apply_visitor(visitor, ddim); + return ddim.apply_visitor(ProductVisitor()); } -struct SliceVectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; - int begin; - int end; - - SliceVectorizeVisitor(std::vector& v, int b, int e) - : vector(v), begin(b), end(e) { - PADDLE_ENFORCE(begin < end, - "Begin index must be less than end index in ddim slice."); - PADDLE_ENFORCE(begin >= 0, - "Begin index can't be less than zero in ddim slice."); - } - - template - void operator()(const Dim& dim) { - if (begin == 0) { - vector.push_back(dim.head); - } else { - --begin; - } - --end; - if (end > 0) { - this->operator()(dim.tail); - } - } - - void operator()(const Dim<0>& dim) { - PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound."); - } -}; - DDim slice_ddim(const DDim& dim, int begin, int end) { - std::vector vec; - vec.reserve(end - begin); - SliceVectorizeVisitor visitor(vec, begin, end); - boost::apply_visitor(visitor, dim); - return make_ddim(vec); -} - -/// \cond HIDDEN - -struct ArityVisitor : boost::static_visitor { - template - int operator()(Dim) const { - return D; + PADDLE_ENFORCE(begin < end, + "Begin index must be less than end index in ddim slice."); + PADDLE_ENFORCE(begin >= 0, + "Begin index can't be less than zero in ddim slice."); + DDim ret; + ret.rank_ = end - begin; + for (int i = 0; i < ret.rank_; ++i) { + ret[i] = dim[i + begin]; } -}; - -/// \endcond + return ret; +} -int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } +int arity(const DDim& d) { return d.size(); } /// \cond HIDDEN -struct DDimPrinter : boost::static_visitor { +struct DDimPrinter { std::ostream& os; explicit DDimPrinter(std::ostream& os_) : os(os_) {} @@ -291,15 +183,10 @@ struct DDimPrinter : boost::static_visitor { /// \endcond std::ostream& operator<<(std::ostream& os, const DDim& ddim) { - DDimPrinter printer(os); - boost::apply_visitor(printer, ddim); + ddim.apply_visitor(DDimPrinter(os)); return os; } -DDim::DDim(std::initializer_list init_list) { - *this = make_ddim(init_list); -} - DDim flatten_to_2d(const DDim& src, int num_col_dims) { int rank = src.size(); return make_ddim({product(slice_ddim(src, 0, num_col_dims)), @@ -309,21 +196,23 @@ DDim flatten_to_2d(const DDim& src, int num_col_dims) { DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } DDim stride(const DDim& ddim) { - std::vector strides(ddim.size()); + DDim strides; + strides.rank_ = ddim.size(); strides[ddim.size() - 1] = 1; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i + 1]; } - return framework::make_ddim(strides); + return strides; } DDim stride_numel(const framework::DDim& ddim) { - std::vector strides(ddim.size()); + DDim strides; + strides.rank_ = ddim.size(); strides[ddim.size() - 1] = ddim[ddim.size() - 1]; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i]; } - return framework::make_ddim(strides); + return strides; } } // namespace framework diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index f05b5ee3fa..bff710040e 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -18,8 +18,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/dim.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { @@ -29,51 +27,138 @@ namespace framework { * * The number of dimensions must be between [1, 9]. */ -struct DDim { - typedef boost::variant, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, - Dim<7>, Dim<8>, Dim<9>> - DDimVar; - DDimVar var; +class DDim { + public: + constexpr static int kMaxRank = 9; - DDim() : var(Dim<1>()) {} + DDim() : rank_(1) { dim_[0] = 0; } + + DDim(const int* d, int n); + DDim(const int64_t* d, int n); template - explicit DDim(const Dim& in) : var(in) {} + /*implicit*/ DDim(const Dim& in) : rank_(D) { // NOLINT + UnsafeCast() = in; + } - /*implicit*/ DDim(std::initializer_list init_list); + /*implicit*/ DDim(std::initializer_list init_list) + : DDim(init_list.begin(), init_list.size()) {} template - DDim& operator=(const Dim& in) { - var = in; + inline DDim& operator=(const Dim& in) { + rank_ = D; + UnsafeCast() = in; return *this; } - int64_t& operator[](int idx); - int64_t operator[](int idx) const; + inline int64_t& operator[](int idx) { return dim_[idx]; } - template - typename Visitor::result_type apply_visitor(Visitor& visitor) { - return var.apply_visitor(visitor); + inline int64_t operator[](int idx) const { return dim_[idx]; } + + inline int64_t& at(int idx) { + PADDLE_ENFORCE(idx >= 0 && idx < rank_); + return dim_[idx]; } - template - typename Visitor::result_type apply_visitor(Visitor& visitor) const { - return var.apply_visitor(visitor); + inline int64_t at(int idx) const { + PADDLE_ENFORCE(idx >= 0 && idx < rank_); + return dim_[idx]; } - DDimVar getVar() { return var; } + template + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor); + + template + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor) const; + + bool operator==(const DDim& d) const; + + bool operator!=(const DDim& d) const; + + DDim operator+(const DDim& d) const; - bool operator==(DDim d) const; + DDim operator*(const DDim& d) const; - bool operator!=(DDim d) const; + // Make DDim act like std::vector + using iterator = int64_t*; + using const_iterator = const int64_t*; - DDim operator+(DDim d) const; + int64_t* data() { return dim_.data(); } + const int64_t* data() const { return dim_.data(); } - DDim operator*(DDim d) const; + iterator begin() { return data(); } + const_iterator begin() const { return data(); } + iterator end() { return data() + rank_; } + const_iterator end() const { return data() + rank_; } + + int size() const { return rank_; } + + private: + template + inline Dim& UnsafeCast() { + return const_cast&>(const_cast(this)->UnsafeCast()); + } - int size() const; + template + inline const Dim& UnsafeCast() const { + static_assert(M >= 0 && M <= kMaxRank, "Invalid rank"); + auto* p = static_cast(&dim_); + return *reinterpret_cast*>(p); + } + + friend DDim slice_ddim(const DDim& dim, int begin, int end); + friend DDim stride(const DDim& ddim); + friend DDim stride_numel(const DDim& ddim); + + Dim dim_; + int rank_; }; +#define PADDLE_VISIT_DDIM(rank) \ + case rank: \ + return visitor(UnsafeCast()) + +template +typename std::result_of&)>::type DDim::apply_visitor( + Visitor&& visitor) { + switch (rank_) { + PADDLE_VISIT_DDIM(0); + PADDLE_VISIT_DDIM(1); + PADDLE_VISIT_DDIM(2); + PADDLE_VISIT_DDIM(3); + PADDLE_VISIT_DDIM(4); + PADDLE_VISIT_DDIM(5); + PADDLE_VISIT_DDIM(6); + PADDLE_VISIT_DDIM(7); + PADDLE_VISIT_DDIM(8); + PADDLE_VISIT_DDIM(9); + default: + PADDLE_THROW("Invalid rank %d", rank_); + } +} + +template +typename std::result_of&)>::type DDim::apply_visitor( + Visitor&& visitor) const { + switch (rank_) { + PADDLE_VISIT_DDIM(0); + PADDLE_VISIT_DDIM(1); + PADDLE_VISIT_DDIM(2); + PADDLE_VISIT_DDIM(3); + PADDLE_VISIT_DDIM(4); + PADDLE_VISIT_DDIM(5); + PADDLE_VISIT_DDIM(6); + PADDLE_VISIT_DDIM(7); + PADDLE_VISIT_DDIM(8); + PADDLE_VISIT_DDIM(9); + default: + PADDLE_THROW("Invalid rank %d", rank_); + } +} +#undef PADDLE_VISIT_DDIM + /** * \brief Make a DDim from std::vector * @@ -92,7 +177,7 @@ DDim make_ddim(const std::vector& dims); DDim make_ddim(std::initializer_list dims); int64_t get(const DDim& dim, int idx); -void set(DDim& dim, int idx, int val); +void set(DDim& dim, int idx, int val); // NOLINT std::vector vectorize(const DDim& ddim); std::vector vectorize2int(const DDim& ddim); @@ -129,12 +214,3 @@ DDim stride(const DDim& ddim); DDim stride_numel(const DDim& ddim); } // namespace framework } // namespace paddle - -namespace boost { - -template -T get(const paddle::framework::DDim& in) { - return boost::get(in.var); -} - -} // namespace boost diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h index 73f92fa389..3ae60a3119 100644 --- a/paddle/fluid/framework/dim.h +++ b/paddle/fluid/framework/dim.h @@ -16,328 +16,184 @@ #include #include #include +#include #include +#include "paddle/fluid/framework/array.h" #include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { namespace framework { // Statically sized, statically indexed dimension -template -struct Dim { - static constexpr int dimensions = i; +template +class Dim : public Array { + public: + static_assert(N >= 0, "N must be not less than 0"); - template - HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) { - static_assert(sizeof...(_tail) == i - 1, - "Dim initialized with the wrong number of parameters"); - } + static constexpr int kRank = N; + using BaseClass = Array; - HOSTDEVICE - Dim(int64_t _head, const Dim& _tail) : head(_head), tail(_tail) {} + inline Dim(int64_t head, const Dim& tail) { + (*this)[0] = head; + new (this->GetMutable() + 1) Dim(tail); + } - HOSTDEVICE - Dim() : head(0), tail() {} + template + HOSTDEVICE explicit Dim(int64_t head, Args... args) + : BaseClass(head, args...) {} /** Construct a Dim from a linear index and size. Uses Fortran order * indexing. */ - HOSTDEVICE - Dim(int64_t idx, const Dim& size) - : head(idx % size.head), tail(idx / size.head, size.tail) {} + HOSTDEVICE Dim(int64_t idx, const Dim& size); /** Construct a Dim with each dimension set to the given index */ - HOSTDEVICE - Dim(int64_t idx) : head(idx), tail(idx) {} + HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); } - HOSTDEVICE - bool operator==(const Dim& o) const { - return (head == o.head) && (tail == o.tail); - } + HOSTDEVICE Dim() = default; - HOSTDEVICE - bool operator!=(const Dim& o) const { return !(*this == o); } + HOSTDEVICE int64_t* data() { return this->GetMutable(); } - HOSTDEVICE - int64_t& operator[](int idx); - HOSTDEVICE - int64_t operator[](int idx) const; + HOSTDEVICE const int64_t* data() const { return this->Get(); } HOST std::string to_string() const; - - int64_t head; - Dim tail; -}; - -// Base case specialization -template <> -struct Dim<0> { - static constexpr int dimensions = 0; - - HOSTDEVICE - Dim(int64_t _head) {} - - HOSTDEVICE - Dim() {} - - HOSTDEVICE - Dim(int idx, const Dim<0>& size) { -#ifndef __CUDA_ARCH__ - if (idx > 0) { - throw std::invalid_argument("Index out of range."); - } -#else - PADDLE_ASSERT(idx == 0); -#endif - } - - HOSTDEVICE - bool operator==(const Dim<0>& o) const { return true; } - - HOSTDEVICE - bool operator!=(const Dim<0>& o) const { return false; } - - HOSTDEVICE - int64_t& operator[](int idx); - HOSTDEVICE - int64_t operator[](int idx) const; }; -namespace { - -// Helper for accessing Dim classes -template -struct DimGetter { - // Return a copy if Dim is const - template - HOSTDEVICE static int64_t impl(const D& d) { - return DimGetter::impl(d.tail); - } - // Return a reference if Dim is mutable - template - HOSTDEVICE static int64_t& impl(D& d) { - return DimGetter::impl(d.tail); +namespace detail { +template +struct FortranOrderIndexingConstructorFunctor { + HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx, + int64_t* out) { + out[kStart] = (*idx) % in[kStart]; + (*idx) /= in[kStart]; + FortranOrderIndexingConstructorFunctor::Run(in, idx, + out); } }; -// Eureka! We found the element! -template <> -struct DimGetter<0> { - // Return a copy if Dim is const - template - HOSTDEVICE static int64_t impl(const D& d) { - return d.head; - } - // Return a reference if Dim is mutable - template - HOSTDEVICE static int64_t& impl(D& d) { - return d.head; - } +template +struct FortranOrderIndexingConstructorFunctor { + HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx, + int64_t* out) {} }; +} // namespace detail -template -HOSTDEVICE int64_t& indexer(Dim& dim, int idx) { -#ifndef __CUDA_ARCH__ - if (idx < 0) { - throw std::invalid_argument("Tried to access a negative dimension"); - } -#else - PADDLE_ASSERT(idx >= 0); -#endif - if (idx == 0) { - return dim.head; - } - return indexer(dim.tail, idx - 1); +template +HOSTDEVICE Dim::Dim(int64_t idx, const Dim& size) { + detail::FortranOrderIndexingConstructorFunctor<0, N, N == 0>::Run( + size.Get(), &idx, this->GetMutable()); } -template <> -HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) { -#ifndef __CUDA_ARCH__ - throw std::invalid_argument("Invalid index"); -#else - PADDLE_ASSERT(false); -#if CUDA_VERSION < 8000 - // On CUDA versions previous to 8.0, only __shared__ variables - // could be declared as static in the device code. - int64_t head = 0; -#else - static int64_t head = 0; -#endif - return head; -#endif +template +HOSTDEVICE inline int64_t get(const Dim& dim) { + return dim[idx]; } -template -HOSTDEVICE int64_t indexer(const Dim& dim, int idx) { -#ifndef __CUDA_ARCH__ - if (idx < 0) { - throw std::invalid_argument("Tried to access a negative dimension"); - } -#else - PADDLE_ASSERT(idx >= 0); -#endif - if (idx == 0) { - return dim.head; - } - return indexer(dim.tail, idx - 1); -} - -template <> -HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) { -#ifndef __CUDA_ARCH__ - throw std::invalid_argument("Invalid index"); -#else - PADDLE_ASSERT(false); -#if CUDA_VERSION < 8000 - // On CUDA versions previous to 8.0, only __shared__ variables - // could be declared as static in the device code. - int64_t head = 0; -#else - static int64_t head = 0; -#endif - return head; -#endif -} - -} // namespace -// Static access to constant Dim -template -HOSTDEVICE int64_t get(const Dim& d) { - return DimGetter::impl(d); -} - -// Static access to mutable Dim -template -HOSTDEVICE int64_t& get(Dim& d) { - return DimGetter::impl(d); -} - -// Dynamic access to constant Dim -template -HOSTDEVICE int64_t Dim::operator[](int i) const { - return indexer(*this, i); +template +HOSTDEVICE inline int64_t& get(Dim& dim) { // NOLINT + return dim[idx]; } -// Dynamic access to mutable Dim -template -HOSTDEVICE int64_t& Dim::operator[](int i) { - return indexer(*this, i); +template +HOSTDEVICE inline int64_t get(const Dim& dim, int idx) { + return dim[idx]; } -// Dynamic access to constant Dim -inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const { - return indexer(*this, i); -} - -// Dynamic access to mutable Dim -inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) { - return indexer(*this, i); -} - -// Dynamic access to constant Dim -// without std::enable_if will try to instantiate this on get<0>(d) -template -HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim& d, - int i) { - return d[i]; -} - -// Dynamic access to mutable Dim -template -HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim& d, - int i) { - return d[i]; +template +HOSTDEVICE inline int64_t& get(Dim& dim, int idx) { // NOLINT + return dim[idx]; } // Dot product of two dims -template -HOSTDEVICE int64_t linearize(const Dim& a, const Dim& b) { - return a.head * b.head + linearize(a.tail, b.tail); -} - -// Base case dot product of two Dims -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) { - return 0; +template +HOSTDEVICE inline int64_t linearize(const Dim& a, const Dim& b) { + return UnrollProduct::Run(a.Get(), b.Get()); } // Product of a Dim -template -HOSTDEVICE int64_t product(const Dim& a, int prod = 1) { - return prod * a.head * product(a.tail); -} - -// Base case product of a Dim -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) { - return prod; +template +HOSTDEVICE inline int64_t product(const Dim& a) { + return UnrollProduct::Run(a.Get()); } // Is 0 <= idx_i < size_i for all i? -template -HOSTDEVICE bool contained(const Dim& idx, const Dim& size) { - return ((0 <= idx.head) && (idx.head < size.head) && - contained(idx.tail, size.tail)); -} +namespace detail { +template +struct ContainedFunctor { + HOSTDEVICE static inline bool Run(const int64_t* idx, const int64_t* size) { + return (idx[kStart] >= 0 && idx[kStart] < size[kStart]) && + ContainedFunctor::Run(idx, + size); + } +}; + +template +struct ContainedFunctor { + HOSTDEVICE static constexpr inline bool Run(const int64_t* idx, + const int64_t* size) { + return true; + } +}; +} // namespace detail -// Base case of is 0 <= idx_i < size_i ? -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) { - return true; +template +HOSTDEVICE inline bool contained(const Dim& idx, const Dim& size) { + return detail::ContainedFunctor<0, N, N == 0>::Run(idx.Get(), size.Get()); } /** * \brief Compute exclusive prefix-multiply of a Dim. */ -template -HOSTDEVICE Dim ex_prefix_mul(const Dim& src, int mul = 1) { - return Dim(mul, ex_prefix_mul(src.tail, mul * src.head)); -} +namespace detail { +template +struct ExPrefixMulFunctor { + HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) { + kStart == 0 ? out[kStart] = 1 : out[kStart] = + out[kStart - 1] * in[kStart - 1]; + detail::ExPrefixMulFunctor::Run(in, + out); + } +}; + +template +struct ExPrefixMulFunctor { + HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {} +}; +} // namespace detail -///\cond HIDDEN -// Base case of ex_prefix_mul -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) { - return Dim<0>(); +template +HOSTDEVICE inline Dim ex_prefix_mul(const Dim& src) { + Dim ret; + detail::ExPrefixMulFunctor<0, N, N == 0>::Run(src.Get(), ret.GetMutable()); + return ret; } -///\endcond /** * Add two dimensions together */ -template -HOSTDEVICE Dim dim_plus(const Dim& a, const Dim& b) { - return Dim(a.head + b.head, dim_plus(a.tail, b.tail)); +template +HOSTDEVICE inline Dim dim_plus(const Dim& a, const Dim& b) { + Dim ret; + UnrollAdd::Run(a.Get(), b.Get(), ret.GetMutable()); + return ret; } -// Base case -template <> -HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) { - return Dim<0>(); -} - -template -HOSTDEVICE Dim operator+(const Dim& lhs, const Dim& rhs) { +template +HOSTDEVICE inline Dim operator+(const Dim& lhs, const Dim& rhs) { return dim_plus(lhs, rhs); } /** * Multiply two dimensions together */ -template -HOSTDEVICE Dim dim_mult(const Dim& a, const Dim& b) { - return Dim(a.head * b.head, dim_mult(a.tail, b.tail)); -} - -// Base case -template <> -HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) { - return Dim<0>(); +template +HOSTDEVICE inline Dim dim_mult(const Dim& a, const Dim& b) { + Dim ret; + UnrollMul::Run(a.Get(), b.Get(), ret.GetMutable()); + return ret; } template @@ -354,23 +210,32 @@ HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { * \return Dim object the same size as \p size with normalized strides * */ +namespace detail { +template +struct NormalizeStridesFunctor { + HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride, + int64_t* ret) { + ret[kStart] = (size[kStart] == 1 ? 0 : stride[kStart]); + NormalizeStridesFunctor::Run( + size, stride, ret); + } +}; -template -HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { - int norm_stride = size.head == 1 ? 0 : stride.head; - return Dim(norm_stride, normalize_strides(size.tail, stride.tail)); -} - -///\cond HIDDEN +template +struct NormalizeStridesFunctor { + HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride, + int64_t* ret) {} +}; +} // namespace detail -template <> -HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, - const Dim<0>& stride) { - return Dim<0>(); +template +HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { + Dim ret; + detail::NormalizeStridesFunctor<0, N, N == 0>::Run(size.Get(), stride.Get(), + ret.GetMutable()); + return ret; } -///\endcond - /** * Helper function to create a Dim * @@ -379,25 +244,17 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, */ template -HOSTDEVICE Dim make_dim(Args... idxes) { +HOSTDEVICE inline Dim make_dim(Args... idxes) { return Dim(idxes...); } // Allows us to output a Dim -// XXX For some reason, overloading fails to resolve this correctly -template -typename std::enable_if<(i > 1), std::ostream&>::type operator<<( - std::ostream& os, const Dim& d) { - os << d.head << ", " << d.tail; - return os; -} - -// Base case that allows us to output a Dim -// XXX I wish this could be an overload instead of a template -template -typename std::enable_if<(i == 1), std::ostream&>::type operator<<( - std::ostream& os, const Dim& d) { - os << d.head; +template +inline std::ostream& operator<<(std::ostream& os, const Dim& d) { + os << d[0]; + for (int i = 1; i < N; ++i) { + os << ", " << d[i]; + } return os; } @@ -405,25 +262,23 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) { return os; } -template -HOST std::string Dim::to_string() const { +template +HOST std::string Dim::to_string() const { std::stringstream stream; - stream << *this; - return stream.str(); } -template -HOSTDEVICE Dim linear_to_dimension(int linear_index, Dim extents) { - Dim result; +template +HOSTDEVICE Dim linear_to_dimension(int linear_index, const Dim& extents) { + Dim result; - for (int i = 0; i < D - 1; ++i) { + for (int i = 0; i < N - 1; ++i) { result[i] = linear_index % extents[i]; linear_index /= extents[i]; } - result[D - 1] = linear_index; + result[N - 1] = linear_index; return result; } diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 04e3f78afe..5014fcd06a 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -62,7 +62,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) { struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CPUPlace &place) const { - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLCPU; ctx.device_id = 0; return ctx; @@ -70,7 +70,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CUDAPlace &place) const { #ifdef PADDLE_WITH_CUDA - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLGPU; ctx.device_id = place.device; return ctx; @@ -81,7 +81,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { #ifdef PADDLE_WITH_CUDA - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLCPUPinned; ctx.device_id = 0; return ctx; diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h index 0c52bce1ef..e48b0d5c88 100644 --- a/paddle/fluid/framework/dlpack_tensor.h +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -38,7 +38,7 @@ class DLPackTensor { // The shape in DLTensor is defined as int64_t* // Add this member to make TVMTensor init without heap allocation - ShapeType shape_[9]; + ShapeType shape_[DDim::kMaxRank]; }; } // namespace framework diff --git a/paddle/fluid/framework/unroll_array_ops.h b/paddle/fluid/framework/unroll_array_ops.h new file mode 100644 index 0000000000..fb0a89530f --- /dev/null +++ b/paddle/fluid/framework/unroll_array_ops.h @@ -0,0 +1,169 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace framework { + +namespace detail { + +template +struct UnrollFillConstant { + template + HOSTDEVICE inline static void Run(T *data, T val) { + data[kStart] = val; + UnrollFillConstant::Run(data, val); + } +}; + +template +struct UnrollFillConstant { + template + HOSTDEVICE inline static void Run(T *data, T val) {} +}; + +template +struct UnrollAssign { + template + HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) { + d2[kStart] = static_cast(d1[kStart]); + UnrollAssign::Run(d1, d2); + } +}; + +template +struct UnrollAssign { + template + HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {} +}; + +template +struct UnrollVarArgsAssign { + template + HOSTDEVICE inline static void Run(T *d, T val, Args... args) { + static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument"); + d[kStart] = val; + UnrollVarArgsAssign::Run(d, + args...); + } +}; + +template +struct UnrollVarArgsAssign { + HOSTDEVICE inline static void Run(T *d) {} +}; + +template +struct UnrollCompare { + template + HOSTDEVICE inline static bool Run(const T *d1, const T *d2) { + return d1[kStart] == d2[kStart] && + UnrollCompare::Run(d1, d2); + } +}; + +template +struct UnrollCompare { + template + HOSTDEVICE inline constexpr static bool Run(const T *d1, const T *d2) { + return true; + } +}; + +template +struct UnrollAdd { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) { + d3[kStart] = d1[kStart] + d2[kStart]; + UnrollAdd::Run(d1, d2, d3); + } +}; + +template +struct UnrollAdd { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {} +}; + +template +struct UnrollMul { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) { + d3[kStart] = d1[kStart] * d2[kStart]; + UnrollMul::Run(d1, d2, d3); + } +}; + +template +struct UnrollMul { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {} +}; + +template +struct UnrollProduct { + template + HOSTDEVICE inline static T Run(const T *d) { + return d[kStart] * + UnrollProduct::Run(d); + } + + template + HOSTDEVICE inline static T Run(const T *d1, const T *d2) { + return d1[kStart] * d2[kStart] + + UnrollProduct::Run(d1, d2); + } +}; + +template +struct UnrollProduct { + template + HOSTDEVICE inline constexpr static T Run(const T *d) { + return 1; + } + + template + HOSTDEVICE inline constexpr static T Run(const T *d1, const T *d2) { + return 0; + } +}; + +} // namespace detail + +template +using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>; + +template +using UnrollAssign = detail::UnrollAssign<0, N, N == 0>; + +template +using UnrollVarArgsAssign = detail::UnrollVarArgsAssign; + +template +using UnrollCompare = detail::UnrollCompare<0, N, N == 0>; + +template +using UnrollAdd = detail::UnrollAdd<0, N, N == 0>; + +template +using UnrollMul = detail::UnrollMul<0, N, N == 0>; + +template +using UnrollProduct = detail::UnrollProduct<0, N, N == 0>; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index 6446cab5ec..2e7f3edd55 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -86,8 +86,6 @@ class UnaryLogicalOpInferShape : public framework::InferShapeBase { OpComment comment; PADDLE_ENFORCE(context->HasInput("X"), "Input(X) of %s operator must not be null", comment.type); - auto dim_x = context->GetInputDim("X"); - context->SetOutputDim("Out", context->GetInputDim("X")); context->ShareLoD("X", "Out"); } diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 2d7d33bd4f..cfc2cac7be 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -68,7 +68,6 @@ void CropFunction(const framework::ExecutionContext& context) { } out->mutable_data(out_dims, context.GetPlace()); auto x_stride = framework::stride(x->dims()); - auto out_stride = framework::stride(out->dims()); auto offsets = GetOffsets(context); int64_t offset = 0; for (size_t i = 0; i < offsets.size(); ++i) { diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index dd64cc327f..744d149714 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -378,7 +378,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { ->GetMutable(); auto input_dims = input->dims(); - auto weight_dims = weight->dims(); auto init_h_dims = init_h->dims(); auto init_c_dims = init_c->dims(); in_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h index 0b7c470fe7..fc223ce559 100644 --- a/paddle/fluid/operators/detail/strided_memcpy.h +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -27,8 +27,8 @@ struct StridedMemcpyFunctor; template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim<0> src_stride, framework::Dim<0> dst_dim, - framework::Dim<0> dst_stride, T* dst) const { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { auto& cpu_place = boost::get(place); @@ -50,18 +50,18 @@ struct StridedMemcpyFunctor { template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim<1> src_stride, framework::Dim<1> dst_dim, - framework::Dim<1> dst_stride, T* dst) const { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { auto& cpu_place = boost::get(place); - memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head); + memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { #ifdef PADDLE_WITH_CUDA auto& gpu_place = boost::get(place); auto& cuda_ctx = reinterpret_cast(dev_ctx); - memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head, + memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0], cuda_ctx.stream()); #else PADDLE_THROW("Paddle is not compiled with GPU"); @@ -73,19 +73,19 @@ struct StridedMemcpyFunctor { template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim src_stride, framework::Dim dst_dim, - framework::Dim dst_stride, T* dst) const { - for (int64_t i = 0; i < dst_dim.head; ++i) { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { + for (int64_t i = 0; i < dst_dim[0]; ++i) { StridedMemcpyFunctor func; - func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst); - src += src_stride.head; - dst += dst_stride.head; + func(dev_ctx, src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst); + src += src_stride[0]; + dst += dst_stride[0]; } } }; template -struct StridedCopyDimVisitor : public boost::static_visitor { +struct StridedCopyDimVisitor { StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& src_stride, const framework::DDim& dst_stride, T* dst) @@ -95,13 +95,11 @@ struct StridedCopyDimVisitor : public boost::static_visitor { dst_stride_(dst_stride), dst_(dst) {} - template - void operator()(Dim dst_dim) const { - Dim src_stride = boost::get(src_stride_); - Dim dst_stride = boost::get(dst_stride_); - constexpr int dim = Dim::dimensions; - StridedMemcpyFunctor functor; - functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_); + template + void operator()(const framework::Dim& dst_dim) const { + StridedMemcpyFunctor functor; + functor(dev_ctx_, src_, src_stride_.data(), dst_dim.data(), + dst_stride_.data(), dst_); } const platform::DeviceContext& dev_ctx_; diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index fddd688401..a652d4d957 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -64,8 +64,6 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null"); auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); - auto gt_classes_dims = ctx->GetInputDim("GtClasses"); - auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); auto im_info_dims = ctx->GetInputDim("ImInfo"); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 709c2dfc4b..f1975a9a4b 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -53,12 +53,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Variances"), "Input(Variances) shouldn't be null."); - auto scores_dims = ctx->GetInputDim("Scores"); - auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas"); - auto im_info_dims = ctx->GetInputDim("ImInfo"); - auto anchors_dims = ctx->GetInputDim("Anchors"); - auto variances_dims = ctx->GetInputDim("Variances"); - ctx->SetOutputDim("RpnRois", {-1, 4}); ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); } diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 46fff9d338..fd5d75ba52 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -58,7 +58,6 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { auto anchor_dims = ctx->GetInputDim("Anchor"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto im_info_dims = ctx->GetInputDim("ImInfo"); PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, "The rank of Input(Anchor) must be 2."); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 87bf7c6b15..775346c552 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -178,7 +178,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), "Rank of first input must >= rank of second input."); diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 75dbf1d8bf..3394082497 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -77,7 +77,6 @@ class ExpandKernel : public framework::OpKernel { auto& expand_times = context.Attr>("expand_times"); auto* out0 = context.Output("Out"); Eigen::DSizes bcast_dims; - auto x_dims = in0->dims(); for (size_t i = 0; i < expand_times.size(); ++i) { bcast_dims[i] = expand_times[i]; } diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index e80249fc87..7c53e5279d 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -148,7 +148,6 @@ class FCOpKernel : public framework::OpKernel { auto w = ctx.Input("W"); auto bias = ctx.Input("Bias"); auto output = ctx.Output("Out"); - auto in_dims = input->dims(); auto w_dims = w->dims(); auto out_dims = output->dims(); int M = framework::product(out_dims) / out_dims[out_dims.size() - 1]; diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 1eb6523a2d..9344bfe65d 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -242,15 +242,15 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { bool is_reverse = ctx.Attr("is_reverse"); \ bool use_peepholes = ctx.Attr("use_peepholes"); -#define INIT_BASE_SIZES \ - auto ids_dims = ids->dims(); /* T x M*/ \ - auto ids_numel = ids->numel(); /* T x 1*/ \ - auto wh_dims = wh->dims(); /* D x 4D*/ \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const int D3 = D * 3; \ - int64_t row_number = embeddings->dims()[0]; \ - int64_t row_width = embeddings->dims()[1]; \ +#define INIT_BASE_SIZES \ + auto ids_dims = ids->dims(); /* T x M*/ \ + auto ids_numel = framework::product(ids_dims); /* T x 1*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const int D3 = D * 3; \ + int64_t row_number = embeddings->dims()[0]; \ + int64_t row_width = embeddings->dims()[1]; \ const int D4 = wh_dims[1]; #define INIT_BASE_INPUT_DATAS \ diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index 69e7fa4490..f458ce6c83 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -88,7 +88,6 @@ class HingeLossGradOp : public framework::OperatorWithKernel { "Input(Logits@GRAD) should not be null."); auto pred_dims = ctx->GetInputDim("Logits"); - auto lab_dims = ctx->GetInputDim("Labels"); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 9d248e0321..ef1fb83aa6 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -92,7 +92,6 @@ class LogLossGradOp : public framework::OperatorWithKernel { "Output(Predicted@GRAD) should not be null."); auto pred_dims = ctx->GetInputDim("Predicted"); - auto label_dims = ctx->GetInputDim("Labels"); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h index 895a7019aa..d1127ce4a2 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -37,9 +37,6 @@ void Transpose::operator()( for (int i = 0; i < Rank; i++) { permute[i] = axis[i]; } - auto in_dim = in.dims(); - auto out_dim = out->dims(); - auto eigen_in = framework::EigenTensor::From(in); auto eigen_out = framework::EigenTensor::From(*out); auto* dev = context.eigen_device(); diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 9e99e44822..1d9d98b106 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -76,7 +76,6 @@ class SoftmaxFunctor> { void operator()(const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { auto in_dims = X->dims(); - auto out_dims = Y->dims(); const float* in_data = X->data(); float* out_data = Y->data(); const int kBatchDim = 0; diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index 35db4c1ad1..9954e51083 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -87,7 +87,6 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel { "Input(Out@Grad) must not be null."); auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); auto intermediate_dims = ctx->GetInputDim("IntermediateVal"); auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 8a111e6065..154b5f0d08 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -146,12 +146,6 @@ class MulGradOp : public framework::OperatorWithKernel { "Input(Out@GRAD) should not be null"); auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - auto x_mat_dims = framework::flatten_to_2d( - x_dims, ctx->Attrs().Get("x_num_col_dims")); - auto y_mat_dims = framework::flatten_to_2d( - y_dims, ctx->Attrs().Get("y_num_col_dims")); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 9f97f7821d..e58dccea13 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -36,7 +36,6 @@ class NCEOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("Input"); auto label_dims = ctx->GetInputDim("Label"); - auto w_dims = ctx->GetInputDim("Weight"); PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1; if (ctx->HasInput("Bias")) { diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h index d0224177ec..6c95d3f3bf 100644 --- a/paddle/fluid/operators/norm_op.h +++ b/paddle/fluid/operators/norm_op.h @@ -43,7 +43,6 @@ class NormKernel : public framework::OpKernel { out_norm->mutable_data(ctx.GetPlace()); auto xdim = in_x->dims(); - auto ndim = out_norm->dims(); T eps = static_cast(ctx.Attr("epsilon")); int axis = ctx.Attr("axis"); if (axis < 0) axis = xdim.size() + axis; diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h index 1a424728f7..5666613f6e 100644 --- a/paddle/fluid/operators/psroi_pool_op.h +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -41,7 +41,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { int rois_num = rois->dims()[0]; auto in_stride = framework::stride(in_dims); - auto roi_stride = framework::stride(rois->dims()); auto out_stride = framework::stride(out->dims()); const T* input_data = in->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h index 03b59d71cc..4bded0efb9 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h @@ -143,8 +143,6 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { set_zero(ctx.template device_context(), x_grad, static_cast(0)); - auto out_grad_stride = framework::stride(out_grad->dims()); - for (size_t i = 0; i < out_lod[0].size() - 1; ++i) { Tensor out_grad_t = out_grad->Slice(static_cast(out_lod[0][i]), diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index c3d83a06f2..6a99ad9a90 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -40,7 +40,7 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& dst_stride, T* dst) { paddle::operators::detail::StridedCopyDimVisitor func( dev_ctx, src, src_stride, dst_stride, dst); - boost::apply_visitor(func, dst_dim); + dst_dim.apply_visitor(func); } // Strided numel memory copy from src to dst by the specified axis From 19ebd8b4cfffa2ba42c68fa4c761c54e857c6566 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 20:20:19 +0800 Subject: [PATCH 425/719] add ctc support for windows --- CMakeLists.txt | 4 ++-- cmake/external/warpctc.cmake | 30 ++++++++++++++++++++++----- cmake/operators.cmake | 2 +- paddle/fluid/operators/CMakeLists.txt | 4 +--- paddle/fluid/platform/port.h | 1 - python/paddle/fluid/__init__.py | 10 +++++++-- python/paddle/fluid/framework.py | 18 +++++++++++----- python/setup.py.in | 9 ++++---- 8 files changed, 55 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cb646d3ce5..c31f51a3f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,10 +208,10 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream +include(external/warpctc) # download, build, install warpctc if (NOT WIN32) -# there is no official support of warpctc, nccl, cupti in windows -include(external/warpctc) # download, build, install warpctc +# there is no official support of nccl, cupti in windows include(cupti) include(external/gzstream) endif (NOT WIN32) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 07e1137e16..7b937c93fe 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" # Used in unit test test_WarpCTCLayer SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE) -SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" - CACHE FILEPATH "Warp-ctc Library" FORCE) -IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) +IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32) SET(USE_OMP OFF) ELSE() SET(USE_OMP ON) ENDIF() +IF(WIN32) + SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git") +ELSE() + SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git") +ENDIF() + ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git" + GIT_REPOSITORY ${WARPCTC_REPOSITORY} PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} -DWITH_GPU=${WITH_GPU} -DWITH_OMP=${USE_OMP} @@ -59,6 +67,18 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) +IF(WIN32) + IF(NOT EXISTS "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}") + add_custom_command(TARGET extern_warpctc POST_BUILD + COMMAND cmake -E copy ${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} ${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} + ) + ENDIF() + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +else(WIN32) + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +ENDIF(WIN32) MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers. diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 2ced43f9e6..70d159b4f3 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,7 +84,7 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 257bfc0a3f..d9b0c66e57 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -64,9 +64,7 @@ endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) -if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) -endif() +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index ad070171df..c1b81159ac 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -55,7 +55,6 @@ static void *dlsym(void *handle, const char *symbol_name) { static void *dlopen(const char *filename, int flag) { std::string file_name(filename); - file_name.replace(0, file_name.size() - 1, '/', '\\'); HMODULE hModule = LoadLibrary(file_name.c_str()); if (!hModule) { throw std::runtime_error(file_name + " not found."); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index b00510d443..8f3660ca38 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -102,6 +102,13 @@ def __bootstrap__(): import sys import os import platform + + if os.name == 'nt': + third_lib_path = os.path.abspath(os.path.dirname( + __file__)) + os.sep + '..' + os.sep + 'libs' + os.environ['path'] += ';' + third_lib_path + sys.path.append(third_lib_path) + from . import core in_test = 'unittest' in sys.modules @@ -128,13 +135,12 @@ def __bootstrap__(): 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir', 'pe_profile_fname' + 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') if os.name != 'nt': - read_env_flags.append('warpctc_dir') read_env_flags.append('cpu_deterministic') if core.is_compiled_with_dist(): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index d0bd78454d..b5d603d478 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -16,6 +16,7 @@ from __future__ import print_function import collections import contextlib +import os import re import six import sys @@ -27,11 +28,18 @@ from .proto import framework_pb2 try: from . import core except ImportError as e: - raise ImportError( - """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" - if you encounters \"libmkldnn.so not found\" errors. If you have python - installed in other directory, replace \"/usr/local/lib\" with your own - directory. The original error is: \n""" + cpt.get_exception_message(e)) + if os.name == 'nt': + raise ImportError( + """NOTE: You may need to run \"set PATH=c:\python27\lib:%PATH%\" + if you encounters \"mkldnn.dll not found\" errors. If you have python + installed in other directory, replace \"c:\python27\lib" with your own + directory. The original error is: \n""" + cpt.get_exception_message(e)) + else: + raise ImportError( + """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" + if you encounters \"libmkldnn.so not found\" errors. If you have python + installed in other directory, replace \"/usr/local/lib\" with your own + directory. The original error is: \n""" + cpt.get_exception_message(e)) except Exception as e: raise e from . import unique_name diff --git a/python/setup.py.in b/python/setup.py.in index cf8f28bd25..fefe8fbaa7 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -160,10 +160,11 @@ if '${WITH_FLUID_ONLY}'== 'OFF': # put all thirdparty libraries in paddle.libs libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' -if os.name != 'nt': - package_data['paddle.libs']= [] - package_data['paddle.libs']=['libwarpctc' + ext_name] - shutil.copy('${WARPCTC_LIBRARIES}', libs_path) + +package_data['paddle.libs']= [] +package_data['paddle.libs']=['libwarpctc' + ext_name] +shutil.copy('${WARPCTC_LIBRARIES}', libs_path) + if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path) From ed5bd5e58639bfe8e584f4acdce2398701b12853 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 20:23:24 +0800 Subject: [PATCH 426/719] test=develop --- paddle/fluid/platform/dynload/CMakeLists.txt | 2 -- paddle/fluid/platform/dynload/cudnn.h | 2 +- paddle/fluid/platform/dynload/dynamic_loader.cc | 2 ++ paddle/fluid/platform/dynload/dynamic_loader.h | 6 ++++++ paddle/fluid/platform/dynload/mklml.h | 2 +- paddle/fluid/platform/dynload/tensorrt.h | 2 +- paddle/fluid/platform/dynload/warpctc.h | 2 +- 7 files changed, 12 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 5939c500c9..07159d4a12 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -16,9 +16,7 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) -if (NOT WIN32) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) -endif(NOT WIN32) if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 550fe2edee..2f4f8101e4 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ using cudnn_func = decltype(&::__name); \ std::call_once(cudnn_dso_flag, []() { \ cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \ diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index cc5cda6106..eddebfe92a 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -201,6 +201,8 @@ void* GetCurandDsoHandle() { void* GetWarpCTCDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so"); #endif diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 84fd2ce998..edb4c649ad 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -18,6 +18,12 @@ namespace paddle { namespace platform { namespace dynload { +#ifndef _WIN32 +#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__)) +#else +#define DECLARE_TYPE(__name, ...) decltype(auto) +#endif + void* GetCublasDsoHandle(); void* GetCUDNNDsoHandle(); void* GetCUPTIDsoHandle(); diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index c3f9433503..d0619293ac 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -34,7 +34,7 @@ extern void* mklml_dso_handle; #define DYNAMIC_LOAD_MKLML_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ using mklmlFunc = decltype(&::__name); \ std::call_once(mklml_dso_flag, []() { \ mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \ diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index 5d67658b94..751aa54b1a 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -33,7 +33,7 @@ extern void* tensorrt_dso_handle; #define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ using tensorrt_func = decltype(__name(args...)) (*)(Args...); \ std::call_once(tensorrt_dso_flag, []() { \ tensorrt_dso_handle = \ diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h index 18ed9956f1..bc1977b05d 100644 --- a/paddle/fluid/platform/dynload/warpctc.h +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -34,7 +34,7 @@ extern void* warpctc_dso_handle; #define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ using warpctcFunc = decltype(&::__name); \ std::call_once(warpctc_dso_flag, []() { \ warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \ From b73d7d2f21a4010d10b1a2456e5991d77ed5e01e Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 20:27:14 +0800 Subject: [PATCH 427/719] test=develop --- python/setup.py.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py.in b/python/setup.py.in index fefe8fbaa7..22b9537a90 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -162,7 +162,7 @@ if '${WITH_FLUID_ONLY}'== 'OFF': libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' package_data['paddle.libs']= [] -package_data['paddle.libs']=['libwarpctc' + ext_name] +package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name] shutil.copy('${WARPCTC_LIBRARIES}', libs_path) if '${WITH_MKL}' == 'ON': From 63240326027b4b7469c386d16c30273ef014c09a Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 13 Dec 2018 15:41:41 +0800 Subject: [PATCH 428/719] MLP forward backward test=develop --- paddle/fluid/imperative/layer.cc | 6 +- paddle/fluid/imperative/tracer.h | 25 ++++-- paddle/fluid/operators/mul_op.cc | 3 +- paddle/fluid/pybind/imperative.cc | 5 +- python/paddle/fluid/backward.py | 7 +- python/paddle/fluid/framework.py | 3 + python/paddle/fluid/imperative/base.py | 3 +- python/paddle/fluid/imperative/layers.py | 11 ++- python/paddle/fluid/layers/nn.py | 46 +++++++++++ .../fluid/tests/unittests/test_imperative.py | 79 ++++++++++++++++++- 10 files changed, 167 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 6125037680..342cb68ab2 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -188,11 +188,13 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { std::vector ret; for (size_t i = 0; i < input_vars_->size(); ++i) { bool found = false; + VarBase* origin_var = (*input_vars_)[i]; for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { Variable* var = scope->FindVar(outvar); - VarBase* origin_var = (*input_vars_)[i]; std::string orig_var = grad_to_var_->at(outvar); - PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); + if (origin_var->var_desc_->Name() != orig_var) { + continue; + } VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; origin_var->ApplyGrad(scope, var); found = true; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 433d07c0e5..97772dc110 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -43,9 +43,12 @@ void CreateGradOp(const framework::OpDesc& op_desc, class Tracer { public: - explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { + explicit Tracer(framework::BlockDesc* root_block, + framework::BlockDesc* startup_block) + : root_block_(root_block), startup_block_(startup_block) { root_scope_ = new framework::Scope(); scopes_[root_block_] = root_scope_; + scopes_[startup_block_] = root_scope_; } virtual ~Tracer() { delete root_scope_; } @@ -80,6 +83,8 @@ class Tracer { } else { op->pre_ops_->push_back(nullptr); } + VLOG(3) << "input vname " << vname << " " + << var->Get().dims().size(); } *op->output_vars_ = outputs; @@ -98,12 +103,19 @@ class Tracer { outputs[i]->pre_op_ = op; outputs[i]->pre_op_out_idx_ = i; } + + VLOG(3) << "tracer running " << op_desc->Type(); op_base->Run(*scope, platform::CPUPlace()); - framework::OpDesc* grad_op_desc; - auto grad_to_var = new std::unordered_map(); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); - op->grad_op_desc_ = grad_op_desc; - op->grad_to_var_ = grad_to_var; + if (block == startup_block_) { + op->grad_op_desc_ = nullptr; + op->grad_to_var_ = nullptr; + } else { + framework::OpDesc* grad_op_desc; + auto grad_to_var = new std::unordered_map(); + CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); + op->grad_op_desc_ = grad_op_desc; + op->grad_to_var_ = grad_to_var; + } op->block_ = block; } @@ -121,6 +133,7 @@ class Tracer { private: std::map scopes_; framework::BlockDesc* root_block_; + framework::BlockDesc* startup_block_; framework::Scope* root_scope_; }; diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 8a111e6065..271428408c 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -49,7 +49,8 @@ class MulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( y_dims.size(), y_num_col_dims, "The input tensor Y's rank of MulOp should be larger than " - "y_num_col_dims."); + "y_num_col_dims: %ld vs %ld", + y_dims.size(), y_num_col_dims); auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 34e9c897d9..be63fb8778 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -24,8 +24,9 @@ namespace pybind { void BindTracer(pybind11::module *m) { pybind11::class_(*m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block) { - new (&self) imperative::Tracer(root_block); + [](imperative::Tracer &self, framework::BlockDesc *root_block, + framework::BlockDesc *startup_block) { + new (&self) imperative::Tracer(root_block, startup_block); }) .def("trace", &imperative::Tracer::Trace) .def("get_scope", &imperative::Tracer::GetScope, diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index b2c3e7c989..6303be003a 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -489,8 +489,11 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, grad_to_var = dict() op_desc = _create_op_desc_( - "fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, { - "shape": [1], + "fill_constant", + {}, + {"Out": [_append_grad_suffix_(loss.name)]}, + { + "shape": [1], # TODO(panyx0718): This can be loss.shape. "value": 1.0, "dtype": loss.dtype, "force_cpu": False, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index d0bd78454d..bcffa9fe9c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1316,6 +1316,9 @@ class Block(object): def _prepend_op(self, *args, **kwargs): op_desc = self.desc._prepend_op() op = Operator(self, op_desc, *args, **kwargs) + if _in_imperative_mode(): + _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], + [v._ivar for v in op.outputs], self.desc) self.ops.insert(0, op) return op diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index 15d38ddb56..aa48ef71aa 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -28,7 +28,8 @@ def enabled(): def guard(): train = framework.Program() startup = framework.Program() - tracer = core.Tracer(train.current_block().desc) + tracer = core.Tracer(train.current_block().desc, + startup.current_block().desc) with framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._imperative_guard(tracer): diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 1a28f7f4ae..044717c319 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -25,11 +25,9 @@ __all__ = ['PyLayer'] class PyLayer(core.Layer): def __init__(self): - pass + self._built = False def __call__(self, inputs): - # TODO(panyx0718): Support declarative mode as well. - assert base.enabled() if not isinstance(inputs, list) and not isinstance(inputs, tuple): inputs = [inputs] @@ -37,8 +35,15 @@ class PyLayer(core.Layer): for x in inputs: py_var = base.to_variable(x) var_inputs.append(py_var) + if not self._built: + self._build_once(inputs) + self._built = True + outputs = self.forward(var_inputs) return outputs + def _build_once(self, inputs): + pass + def forward(self, inputs): return [] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4d8311a0d3..771d41a5c1 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -29,6 +29,7 @@ from . import utils from .. import unique_name from functools import reduce from .. import core +from ..imperative import layers __all__ = [ 'fc', @@ -9426,3 +9427,48 @@ def huber_loss(input, label, delta): 'Residual': residual}, attrs={'delta': delta}) return out + + +class FC(layers.PyLayer): + def __init__(self, + size, + param_attr=None, + num_flatten_dims=1, + dtype=core.VarDesc.VarType.FP32): + super(FC, self).__init__() + self._size = size + self._num_flatten_dims = num_flatten_dims + self._dtype = dtype + self._helper = LayerHelper('FC', param_attr=param_attr) + + def _build_once(self, inputs): + input_shape = inputs[0].shape + param_shape = [ + reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) + ] + [self._size] + self._w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, inputs): + tmp = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="mul", + inputs={"X": inputs[0], + "Y": self._w}, + outputs={"Out": tmp}, + attrs={ + "x_num_col_dims": self._num_flatten_dims, + "y_num_col_dims": 1 + }) + + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="sum", + inputs={"X": [tmp]}, + outputs={"Out": out}, + attrs={"use_mkldnn": False}) + return out + diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index b5b6305155..0fe69d1bd4 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -12,12 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. +import contextlib import unittest -import sys import numpy as np import paddle.fluid as fluid from paddle.fluid import core +from paddle.fluid.layers.nn import FC + + +@contextlib.contextmanager +def new_program_scope(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield class MyLayer(fluid.imperative.PyLayer): @@ -30,6 +41,23 @@ class MyLayer(fluid.imperative.PyLayer): return [fluid.layers.elementwise_mul(x, x)] +class MLP(fluid.imperative.PyLayer): + def __init__(self): + super(MLP, self).__init__() + self._fc1 = FC(3, + fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + self._fc2 = FC(4, + fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + + def forward(self, inputs): + x = self._fc1(inputs[0]) + x = self._fc2(x) + x = fluid.layers.reduce_sum(x) + return x + + class TestImperative(unittest.TestCase): def test_layer(self): with fluid.imperative.guard(): @@ -39,13 +67,56 @@ class TestImperative(unittest.TestCase): l.forward([]) def test_layer_in_out(self): + np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): l = MyLayer() - x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] + x = l(np_inp)[0] self.assertIsNotNone(x) - sys.stderr.write("%s output: %s\n" % (x, x._numpy())) + dy_out = x._numpy() x._backward() - sys.stderr.write("grad %s\n" % l._x_for_debug._gradient()) + dy_grad = l._x_for_debug._gradient() + + with new_program_scope(): + inp = fluid.layers.data( + name="inp", shape=[3], append_batch_size=False) + l = MyLayer() + x = l(inp)[0] + param_grads = fluid.backward.append_backward( + x, parameter_list=[l._x_for_debug.name])[0] + exe = fluid.Executor(fluid.CPUPlace()) + + static_out, static_grad = exe.run( + feed={inp.name: np_inp}, + fetch_list=[x.name, param_grads[1].name]) + + self.assertTrue(np.allclose(dy_out, static_out)) + self.assertTrue(np.allclose(dy_grad, static_grad)) + + def test_mlp(self): + np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + with fluid.imperative.guard(): + mlp = MLP() + out = mlp(np_inp) + dy_out = out._numpy() + out._backward() + dy_grad = mlp._fc1._w._gradient() + + with new_program_scope(): + inp = fluid.layers.data( + name="inp", shape=[2, 2], append_batch_size=False) + mlp = MLP() + out = mlp(inp) + param_grads = fluid.backward.append_backward( + out, parameter_list=[mlp._fc1._w.name])[0] + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + + static_out, static_grad = exe.run( + feed={inp.name: np_inp}, + fetch_list=[out.name, param_grads[1].name]) + + self.assertTrue(np.allclose(dy_out, static_out)) + self.assertTrue(np.allclose(dy_grad, static_grad)) if __name__ == '__main__': From e0c3c56b0664ee92e5eb86dca810c029e5cd1d67 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 18 Dec 2018 20:29:49 +0800 Subject: [PATCH 429/719] add nce remote ut, test=develop --- .../unittests/test_nce_remote_table_op.py | 68 ++++++++++++++++--- 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py index 5e440bf35d..b5f93f93a1 100644 --- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -27,6 +27,45 @@ from paddle.fluid.op import Operator from paddle.fluid.framework import Program, program_guard +def nce(input, weight, bias, sample_weight, labels, num_classes, + num_sample_class): + samples = [] + sample_labels = [] + batch_size = input.shape[0] + num_true_class = labels.shape[1] + for i in range(batch_size): + w = 1 if sample_weight is None else sample_weight[i] + for label in labels[i]: + samples.append((i, label, True, w)) + sample_labels.append(label) + for num in range(num_sample_class): + samples.append((i, num, False, w)) + sample_labels.append(num) + # forward bias + sample_out = np.zeros(len(samples)).astype(np.float32) + if bias is not None: + for i in range(len(samples)): + sample_out[i] = bias[samples[i][1]] + # forward weight + for i in range(len(samples)): + sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]]) + + # forward activation + sample_out = 1.0 / (1.0 + np.exp(-sample_out)) + # forward cost + out = np.zeros(batch_size).astype(np.float32) + b = 1.0 / num_classes * num_sample_class + + for i in range(len(samples)): + o = sample_out[i] + cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b)) + out[samples[i][0]] += cost * samples[i][3] + return (out[:, np.newaxis], np.array(sample_out).reshape( + batch_size, num_sample_class + num_true_class), + np.array(sample_labels).reshape(batch_size, + num_sample_class + num_true_class)) + + def run_pserver(pserver_id, use_cuda, sync_mode): scope = fluid.core.Scope() program = Program() @@ -94,11 +133,11 @@ class TestListenAndServOp(unittest.TestCase): with fluid.scope_guard(scope): with program_guard(program, startup_program=Program()): x = scope.var('Input').get_tensor() - x_array = np.random.random((4, 8)).astype("float32") * 2 + x_array = np.random.random((4, 8)).astype("float32") x.set(x_array, place) # create and initialize Param Variable param = scope.var('Weight').get_tensor() - param_array = np.zeros((5, 8)).astype("float32") * 2 + param_array = np.zeros((5, 8)).astype("float32") param.set(param_array, place) bias = scope.var('Bias').get_tensor() @@ -110,7 +149,7 @@ class TestListenAndServOp(unittest.TestCase): sample_w.set(sample_weight, place) label = scope.var('Label').get_tensor() - label_array = np.array([0, 1, 4, 5]) + label_array = np.array([[0], [1], [4], [3]]) label.set(label_array, place) cost = scope.var('Cost').get_tensor() @@ -122,7 +161,7 @@ class TestListenAndServOp(unittest.TestCase): sample_l.set(sample_l_w, place) sample_la = scope.var('SampleLabels').get_tensor() - sample_la_w = np.zeros((4, 3)).astype("float32") + sample_la_w = np.zeros((4, 3)).astype("int") sample_la.set(sample_la_w, place) emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] @@ -139,11 +178,12 @@ class TestListenAndServOp(unittest.TestCase): Cost='Cost', SampleLogits='SampleLogits', SampleLabels='SampleLabels', + SampleWeight='SampleWeight', num_total_classes=5, num_neg_samples=2, custom_neg_classes=list(range(2)), sampler=0, - seed=1, + seed=0, is_sparse=True, remote_prefetch=True, epmap=emaps, @@ -153,9 +193,21 @@ class TestListenAndServOp(unittest.TestCase): nce_op.run(scope, place) # get and compare result - o_cost = np.array(cost_w) - o_logits = np.array(sample_l) - o_labels = np.array(sample_la) + o_cost = np.array(scope.var('Cost').get_tensor()) + o_logits = np.array(scope.var('SampleLogits').get_tensor()) + o_labels = np.array(scope.var('SampleLabels').get_tensor()) + + param_array = np.ones((5, 8)).astype("float32") + for i in range(2): + param_array[i] *= param_array[i] * i + 0 * 10 + 1 + for i in range(2, 5): + param_array[i] *= param_array[i] * i + 1 * 10 + 1 + out = nce(x_array, param_array, bias_array, sample_weight, + label_array, 5, 2) + + self.assertAlmostEqual(o_cost.all(), out[0].all(), delta=1e-6) + self.assertAlmostEqual(o_logits.all(), out[1].all(), delta=1e-6) + self.assertAlmostEqual(o_labels.all(), out[2].all(), delta=1e-6) def test_nce_op_remote(self): os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" From 3f9c429ea0b41292db0b80f4a150b62d2e78b192 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 20:31:23 +0800 Subject: [PATCH 430/719] fix lint test=develop --- python/paddle/fluid/layers/nn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 771d41a5c1..d8bc919784 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9471,4 +9471,3 @@ class FC(layers.PyLayer): outputs={"Out": out}, attrs={"use_mkldnn": False}) return out - From b2f789c66dc847d9fbc030a2db218be670e7752f Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 18 Dec 2018 12:47:58 +0000 Subject: [PATCH 431/719] add test transpiler dist test, test=develop --- .../tests/unittests/test_dist_transpiler.py | 43 +++++++++++++++---- .../fluid/transpiler/distribute_transpiler.py | 2 +- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 27575897b5..f572d69277 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -879,29 +879,36 @@ class TestRemoteNce(TestDistLookupTableBase): class TestRemoteHsigmoid(TestDistLookupTableBase): def network_with_table(self, is_sparse, is_distributed): - num_total_classes = 10 + num_total_classes = 3 - input = fluid.layers.data(name="input", shape=[10], dtype="float32") + input = fluid.layers.data(name="input", shape=[1], dtype="float32") label = fluid.layers.data(name="label", shape=[1], dtype="int64") path_table = fluid.layers.data( - name='path_table', shape=[10], dtype='int64') + name='path_table', shape=[3], dtype='int64') path_code = fluid.layers.data( - name='path_code', shape=[10], dtype='int64') + name='path_code', shape=[3], dtype='int64') w_param = fluid.default_main_program().global_block().create_parameter( shape=[num_total_classes, 10], dtype='float32', name='hs_w', initializer=fluid.initializer.ConstantInitializer()) b_param = fluid.default_main_program().global_block().create_parameter( - shape=[num_total_classes, 1], + shape=[3, 1], dtype='float32', name='hs_b', initializer=fluid.initializer.ConstantInitializer()) - cost = fluid.layers.hsigmoid( + emb = fluid.layers.embedding( input=input, + is_sparse=is_sparse, + size=[3, 3], + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(num_total_classes)))) + + cost = fluid.layers.hsigmoid( + input=emb, label=label, - num_classes=non_leaf_num, + num_classes=num_total_classes, path_table=path_table, path_code=path_code, is_custom=True, @@ -918,9 +925,29 @@ class TestRemoteHsigmoid(TestDistLookupTableBase): def transpiler_test_impl(self): trainer, _ = self.get_trainer() + params_to_check = list() for op in trainer.blocks[0].ops: - if op.type == "recv": + if op.type == "hierarchical_sigmoid": + params_to_check = [op.input("W")[0], op.input("Bias")[0]] + for name in ["epmap", "table_names", "epmap"]: + assert op.has_attr(name) + if name == "epmap": + assert op.attr(name)[0] == u'127.0.0.1:6174' + elif name == "table_names": + assert op.attr(name)[0] == u'hierarchical_sigmoid_0.w_0' + else: + assert op.attr(name) == 3 + elif op.type == "lookup_table": + params_to_check.append(op.input("W")[0]) + else: pass + op_count = 0 + for op in trainer.blocks[0].ops: + if op.type == "recv": + assert len(op.output("Out")) == 1 + assert op.output("Out")[0] == u'hierarchical_sigmoid_0.b_0' + op_count += 1 + assert op_count == 1 if __name__ == "__main__": diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 378654ab5b..f5ca3dffb7 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -242,7 +242,7 @@ class DistributeTranspiler(object): def _get_all_remote_sparse_update_op(self, main_program): sparse_update_ops = [] - sparse_update_op_types = ["lookup_table", "nce"] + sparse_update_op_types = ["lookup_table", "nce", "hierarchical_sigmoid"] for op in main_program.global_block().ops: if op.type in sparse_update_op_types and op.attr( 'remote_prefetch') is True: From 19a8d965858173789376248b076fc0339422d313 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 18 Dec 2018 13:18:11 +0000 Subject: [PATCH 432/719] fix nce in test_dist_transpiler, test=develop --- python/paddle/fluid/tests/unittests/test_dist_transpiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 73795a2154..0555db4cba 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -871,8 +871,8 @@ class TestRemoteNce(TestDistLookupTableBase): def transpiler_test_impl(self): trainer, _ = self.get_trainer() - out_vars = ["nce_w.block0", "nce_w.block1"] - in_vars = ["nce_b.block0", "nce_b.block1"] + out_vars = ["nce_w"] + in_vars = ["nce_b"] recv_var_names = [] From f7fb937bfe64a1017f0b4c87706e6655764c775d Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 18 Dec 2018 21:29:47 +0800 Subject: [PATCH 433/719] fix in cmake, test=develop --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6d6fe245d8..950029ed94 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -21,6 +21,8 @@ if(NOT WITH_DISTRIBUTE) LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge) LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) + LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op) + LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op) endif(NOT WITH_DISTRIBUTE) if (NOT ${WITH_GPU}) @@ -32,7 +34,6 @@ endif() list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 -list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 From 6648995f53d69ae1a1b9f0cf4b9fef000fadd54b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 17 Dec 2018 15:32:55 +0000 Subject: [PATCH 434/719] fix build --- paddle/fluid/operators/crf_decoding_op.h | 4 +- .../elementwise/elementwise_mul_mkldnn_op.cc | 2 +- paddle/fluid/operators/fused/fusion_gru_op.cc | 46 ++++++++-------- .../fluid/operators/fused/fusion_lstm_op.cc | 55 +++++++++---------- paddle/fluid/operators/jit/helper.h | 8 +-- paddle/fluid/operators/layer_norm_op.h | 2 +- paddle/fluid/operators/math/fc_compute.h | 5 +- 7 files changed, 61 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 860d71e1fe..9b90ba749b 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -82,8 +82,8 @@ class CRFDecodingOpKernel : public framework::OpKernel { Tensor track; int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); - auto ker = jit::Get( - tag_num); + auto ker = jit::Get, + platform::CPUPlace>(tag_num); ker(static_cast(seq_len), x, w, alpha_value, track_value, tag_num); T max_score = -std::numeric_limits::max(); int max_i = 0; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index 71f4b71330..2ade5818a9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -108,7 +108,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { constexpr int simd_width = 16; int C = c / simd_width; - auto multiply = jit::Get, platform::CPUPlace>(0); #pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index d44a7ad83e..3f27ccb2ef 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -183,29 +183,29 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const jit::gru_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("activation"))); \ - jit::gru_t one_step; \ - auto ComputeH1 = \ - jit::Get(attr); \ - auto ComputeHtPart1 = \ - jit::Get(attr); \ - auto ComputeHtPart2 = \ - jit::Get(attr); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const jit::gru_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("activation"))); \ + jit::gru_t one_step; \ + auto ComputeH1 = \ + jit::Get, platform::CPUPlace>(attr); \ + auto ComputeHtPart1 = \ + jit::Get, platform::CPUPlace>(attr); \ + auto ComputeHtPart2 = \ + jit::Get, platform::CPUPlace>(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index a62f4d18c2..a3f021ed9d 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -236,33 +236,32 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const jit \ - : lstm_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("candidate_activation")), \ - jit::to_kerneltype(ctx.Attr("cell_activation")), \ - use_peepholes); \ - math::jitkernel::lstm_t one_step; \ - one_step.wp = wp_data; \ - one_step.checked = checked_cell_data; \ - auto ComputeC1H1 = \ - jit::Get(attr); \ - auto ComputeCtHt = \ - jit::Get(attr) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const jit::lstm_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("candidate_activation")), \ + jit::to_kerneltype(ctx.Attr("cell_activation")), \ + use_peepholes); \ + jit::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + auto ComputeC1H1 = \ + jit::Get, platform::CPUPlace>(attr); \ + auto ComputeCtHt = \ + jit::Get, platform::CPUPlace>(attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ @@ -434,7 +433,7 @@ class FuisonLSTMKernel : public framework::OpKernel { one_step.ct_1 = cur_prev_c_data; one_step.ct = cur_c_out_data; one_step.ht = cur_h_out_data; - ComputeC1H1(&one_step, &attr); + ComputeCtHt(&one_step, &attr); // move one batch cur_in_data += D4; diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 275170ca2b..38bc7cd8e8 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -32,7 +32,7 @@ inline typename std::enable_if< std::is_same::value && std::is_same::value, typename KernelTuples::func_type>::type -GetJitCode(typename KernelTuples::attr_type attr) { +GetJitCode(const typename KernelTuples::attr_type& attr) { using Func = typename KernelTuples::func_type; using Attr = typename KernelTuples::attr_type; size_t key = JitCodeKey(attr); @@ -68,7 +68,7 @@ inline typename std::enable_if< !std::is_same::value || !std::is_same::value, typename KernelTuples::func_type>::type -GetJitCode(typename KernelTuples::attr_type attr) { +GetJitCode(const typename KernelTuples::attr_type& attr) { return nullptr; } @@ -93,8 +93,8 @@ inline typename KernelTuples::func_type GetRefer() { template -// TODO(TJ): const & attr -typename KernelTuples::func_type Get(typename KernelTuples::attr_type attr) { +typename KernelTuples::func_type Get( + const typename KernelTuples::attr_type& attr) { auto jitfunc = GetJitCode(attr); if (jitfunc) { return jitfunc; diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index bb00ed4729..0651dbf6a9 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -230,7 +230,7 @@ class LayerNormKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(bias->numel(), right); auto ker = - jit::Get( + jit::Get, platform::CPUPlace>( right); ker(x.data(), out.data(), mean->data(), var->data(), scale->data(), bias->data(), static_cast(left), diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index 5e3093c69d..bdb0d8511c 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -31,13 +31,14 @@ inline void FCCompute(const BlasT& blas, const int M, } if (relu) { auto compute = - jit::Get(N); + jit::Get, platform::CPUPlace>(N); for (int i = 0; i < M; i++) { T* dst = Y + i * N; compute(B, dst, dst, N); } } else { - auto compute = jit::Get(N); + auto compute = + jit::Get, platform::CPUPlace>(N); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif From 10c340c9a39e3ca54268cb4d7134bef514ff689c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 18 Dec 2018 14:40:34 +0000 Subject: [PATCH 435/719] fix confilcts --- paddle/fluid/operators/jit/gen/act.cc | 41 ++++++++++++++------------- paddle/fluid/operators/jit/gen/act.h | 1 - 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc index f3332cbefa..391cf57d8a 100644 --- a/paddle/fluid/operators/jit/gen/act.cc +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -21,26 +21,27 @@ namespace operators { namespace jit { namespace gen { -const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f), - REPEAT_8TIMES(2.f), - REPEAT_8TIMES(0.5f), - REPEAT_8TIMES(EXP_HIG), - REPEAT_8TIMES(EXP_LOW), - REPEAT_8TIMES(CEPHES_LOG2EF), - REPEAT_8TIMES(CEPHES_EXP_C1), - REPEAT_8TIMES(CEPHES_EXP_C2), - REPEAT_8TIMES(CEPHES_EXP_P0), - REPEAT_8TIMES(CEPHES_EXP_P1), - REPEAT_8TIMES(CEPHES_EXP_P2), - REPEAT_8TIMES(CEPHES_EXP_P3), - REPEAT_8TIMES(CEPHES_EXP_P4), - REPEAT_8TIMES(CEPHES_EXP_P5), - REPEAT_8TIMES(EXP_MAX_INPUT), - REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX), - REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)}; - -const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)}; -int g_tmp_mem[16] ALIGN32 = {0}; +const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = { + REPEAT_8TIMES(1.f), + REPEAT_8TIMES(2.f), + REPEAT_8TIMES(0.5f), + REPEAT_8TIMES(EXP_HIG), + REPEAT_8TIMES(EXP_LOW), + REPEAT_8TIMES(CEPHES_LOG2EF), + REPEAT_8TIMES(CEPHES_EXP_C1), + REPEAT_8TIMES(CEPHES_EXP_C2), + REPEAT_8TIMES(CEPHES_EXP_P0), + REPEAT_8TIMES(CEPHES_EXP_P1), + REPEAT_8TIMES(CEPHES_EXP_P2), + REPEAT_8TIMES(CEPHES_EXP_P3), + REPEAT_8TIMES(CEPHES_EXP_P4), + REPEAT_8TIMES(CEPHES_EXP_P5), + REPEAT_8TIMES(EXP_MAX_INPUT), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)}; + +const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)}; +int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0}; void VActJitCode::genCode() { int offset = 0; diff --git a/paddle/fluid/operators/jit/gen/act.h b/paddle/fluid/operators/jit/gen/act.h index 63dee7bc0d..c35579c3ad 100644 --- a/paddle/fluid/operators/jit/gen/act.h +++ b/paddle/fluid/operators/jit/gen/act.h @@ -27,7 +27,6 @@ extern const float exp_float_consts[]; extern const int exp_int_0x7f[]; extern int g_tmp_mem[]; -#define ALIGN32 __attribute__((aligned(32))) #define EXP_HIG 88.3762626647949f #define EXP_LOW -88.3762626647949f #define CEPHES_LOG2EF 1.44269504088896341 From 4cc7707d281931d254a377e29c5f9fe37a6a993a Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 18 Dec 2018 14:05:19 +0000 Subject: [PATCH 436/719] add crf_decoding and layer norm intrisic code --- .../fluid/operators/jit/more/CMakeLists.txt | 4 + .../jit/more/intrinsic/CMakeLists.txt | 9 ++ .../jit/more/intrinsic/crf_decoding.cc | 138 ++++++++++++++++++ .../jit/more/intrinsic/crf_decoding.h | 89 +++++++++++ paddle/fluid/operators/jit/more/more.h | 15 -- 5 files changed, 240 insertions(+), 15 deletions(-) create mode 100644 paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt create mode 100644 paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc create mode 100644 paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h delete mode 100644 paddle/fluid/operators/jit/more/more.h diff --git a/paddle/fluid/operators/jit/more/CMakeLists.txt b/paddle/fluid/operators/jit/more/CMakeLists.txt index 5bb78b9304..a740d1a840 100644 --- a/paddle/fluid/operators/jit/more/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/CMakeLists.txt @@ -7,4 +7,8 @@ if(WITH_MKLML) add_subdirectory(mkl) endif() +if(WITH_AVX) + add_subdirectory(intrinsic) +endif() + set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt new file mode 100644 index 0000000000..c4a5013863 --- /dev/null +++ b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt @@ -0,0 +1,9 @@ + +file(GLOB jit_kernel_cc_intrinsic RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") +cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_base) + +set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE) + +# use mkl kernels by name and type +USE_JITKERNEL_MORE(crfdecoding, intrinsic) +USE_JITKERNEL_MORE(layernorm, intrinsic) diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc new file mode 100644 index 0000000000..016fca3868 --- /dev/null +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h" +#include "paddle/fluid/operators/jit/refer/refer.h" +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace more { +namespace mkl { + +template <> +void VMul(const float* x, const float* y, float* z, int n) { + platform::dynload::vsMul(n, x, y, z); +} + +template <> +void VMul(const double* x, const double* y, double* z, int n) { + platform::dynload::vdMul(n, x, y, z); +} + +template <> +void VAdd(const float* x, const float* y, float* z, int n) { + platform::dynload::vsAdd(n, x, y, z); +} + +template <> +void VAdd(const double* x, const double* y, double* z, int n) { + platform::dynload::vdAdd(n, x, y, z); +} + +template <> +void VScal(const float* a, const float* x, float* y, int n) { + if (x == y) { + platform::dynload::cblas_sscal(n, *a, y, 1); + } else { + refer::VScal(a, x, y, n); + } +} + +template <> +void VScal(const double* a, const double* x, double* y, int n) { + if (x == y) { + platform::dynload::cblas_dscal(n, *a, y, 1); + } else { + refer::VScal(a, x, y, n); + } +} + +template <> +void VExp(const float* x, float* y, int n) { + platform::dynload::vsExp(n, x, y); +} + +template <> +void VExp(const double* x, double* y, int n) { + platform::dynload::vdExp(n, x, y); +} + +// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 +template <> +bool VMulKernel::UseMe(int d) const { + return platform::MayIUse(platform::avx512f) && d > 512; +} + +template <> +bool VAddKernel::UseMe(int d) const { + return platform::MayIUse(platform::avx512f) && d > 512; +} + +template <> +bool VScalKernel::UseMe(int d) const { + return platform::MayIUse(platform::avx512f) && d > 512; +} + +template <> +bool VExpKernel::UseMe(int d) const { + return d > 7; +} + +template <> +bool VSigmoidKernel::UseMe(int d) const { + return d > 7; +} + +template <> +bool VTanhKernel::UseMe(int d) const { + return d > 7; +} + +#define AWALYS_USE_ME_WITH_DOUBLE(func) \ + template <> \ + bool func##Kernel::UseMe(int d) const { \ + return true; \ + } + +AWALYS_USE_ME_WITH_DOUBLE(VMul); +AWALYS_USE_ME_WITH_DOUBLE(VAdd); +AWALYS_USE_ME_WITH_DOUBLE(VScal); +AWALYS_USE_ME_WITH_DOUBLE(VExp); +AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); +AWALYS_USE_ME_WITH_DOUBLE(VTanh); + +#undef AWALYS_USE_ME_WITH_DOUBLE +} // namespace mkl +} // namespace more +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace mkl = paddle::operators::jit::more::mkl; + +#define REGISTER_MKL_KERNEL(key, func) \ + REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel, \ + mkl::func##Kernel) + +REGISTER_MKL_KERNEL(vmul, VMul); +REGISTER_MKL_KERNEL(vadd, VAdd); +REGISTER_MKL_KERNEL(vscal, VScal); +REGISTER_MKL_KERNEL(vexp, VExp); +REGISTER_MKL_KERNEL(vsigmoid, VSigmoid); +REGISTER_MKL_KERNEL(vtanh, VTanh); + +#undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h new file mode 100644 index 0000000000..bf209d2f9d --- /dev/null +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/operators/jit/kernel_base.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace more { +namespace mkl { + +template +void VMul(const T* x, const T* y, T* z, int n); + +template +void VAdd(const T* x, const T* y, T* z, int n); + +template +void VScal(const T* a, const T* x, T* y, int n); + +template +void VExp(const T* x, T* y, int n); + +template +void VSigmoid(const T* x, T* y, int n) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(0) - y[i]; + } + VExp(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } +} + +template +void VTanh(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * x[i]; + } + VSigmoid(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * y[i] - static_cast(1); + } +} + +#define DECLARE_MKL_KERNEL(name, tuples) \ + template \ + class name##Kernel : public KernelImpl> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool UseMe(typename tuples::attr_type) const override; \ + } + +// XYZN +DECLARE_MKL_KERNEL(VMul, XYZNTuples); +DECLARE_MKL_KERNEL(VAdd, XYZNTuples); + +// AXYN +DECLARE_MKL_KERNEL(VScal, AXYNTuples); + +// XYN +DECLARE_MKL_KERNEL(VExp, XYNTuples); +DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); +DECLARE_MKL_KERNEL(VTanh, XYNTuples); + +#undef DECLARE_MKL_KERNEL + +} // namespace mkl +} // namespace more +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/more/more.h b/paddle/fluid/operators/jit/more/more.h deleted file mode 100644 index ab99fdc05f..0000000000 --- a/paddle/fluid/operators/jit/more/more.h +++ /dev/null @@ -1,15 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once From b1516783ea98c9770761be59b1de515a5bbfc521 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 18 Dec 2018 15:29:25 +0000 Subject: [PATCH 437/719] enable crf decoding intrinsic code --- paddle/fluid/operators/jit/README.md | 4 + .../jit/more/intrinsic/CMakeLists.txt | 1 - .../jit/more/intrinsic/crf_decoding.cc | 241 ++++++++++-------- .../jit/more/intrinsic/crf_decoding.h | 68 +---- 4 files changed, 150 insertions(+), 164 deletions(-) diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index ce31f18b63..1264bc96ee 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -19,6 +19,10 @@ PaddlePaddle/Paddle/paddle/fluid/ │ ├── ... │ ├── mkl/ │ │ └── ... + │ ├── mkldnn/ + │ │ └── ... + │ ├── intrinsic/ + │ │ └── ... │ └── openblas/ │ └── ... └── refer/ diff --git a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt index c4a5013863..de83d80e77 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt @@ -6,4 +6,3 @@ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE) # use mkl kernels by name and type USE_JITKERNEL_MORE(crfdecoding, intrinsic) -USE_JITKERNEL_MORE(layernorm, intrinsic) diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc index 016fca3868..17b5eaf13d 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc @@ -13,7 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h" -#include "paddle/fluid/operators/jit/refer/refer.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -21,118 +21,151 @@ namespace paddle { namespace operators { namespace jit { namespace more { -namespace mkl { - -template <> -void VMul(const float* x, const float* y, float* z, int n) { - platform::dynload::vsMul(n, x, y, z); -} - -template <> -void VMul(const double* x, const double* y, double* z, int n) { - platform::dynload::vdMul(n, x, y, z); -} - -template <> -void VAdd(const float* x, const float* y, float* z, int n) { - platform::dynload::vsAdd(n, x, y, z); -} - -template <> -void VAdd(const double* x, const double* y, double* z, int n) { - platform::dynload::vdAdd(n, x, y, z); -} - -template <> -void VScal(const float* a, const float* x, float* y, int n) { - if (x == y) { - platform::dynload::cblas_sscal(n, *a, y, 1); - } else { - refer::VScal(a, x, y, n); +namespace intrinsic { + +void CRFDecoding(const int seq_len, const float* x, const float* w, + float* alpha, int* track, int tag_num) { + const int step_size = + platform::MayIUse(platform::avx512f) ? ZMM_FLOAT_BLOCK : YMM_FLOAT_BLOCK; + const int end = tag_num / step_size; + const int rest = tag_num % step_size; + /* Setup the alpha initial value.*/ + int i_offset = 0; + int last_offset = rest - step_size; + for (int i = 0; i <= end; ++i) { +#ifdef __AVX512F__ + // Declare the variable for the content of weights, input and alpha values. + __m512 w_content, x_content, alpha_content; + // Load the relevant data into the variables from un-aligned address. + w_content = _mm512_loadu_ps(w + i_offset); + x_content = _mm512_loadu_ps(x + i_offset); + alpha_content = _mm512_add_ps(w_content, x_content); + // Save the alpha value. + _mm512_storeu_ps(alpha_value + i_offset, alpha_content); +#else + // AVX or AVX2 + // weights, input and alpha values. + __m256 w_content, x_content, alpha_content; + // Load the relevant data into the variables from un-aligned address. + w_content = _mm256_loadu_ps(w + i_offset); + x_content = _mm256_loadu_ps(x + i_offset); + alpha_content = _mm256_add_ps(w_content, x_content); + _mm256_storeu_ps(alpha + i_offset, alpha_content); +#endif + i_offset += step_size; + if (i == end - 1) { + if (rest > 0) { + i_offset += last_offset; + } else { + break; + } + } } -} - -template <> -void VScal(const double* a, const double* x, double* y, int n) { - if (x == y) { - platform::dynload::cblas_dscal(n, *a, y, 1); - } else { - refer::VScal(a, x, y, n); + // Use the column-major strategy to get the location of maximum score. + int seq_offset = 0; + constexpr int state_trans_base_idx = 2; + for (int k = 1; k < seq_len; ++k) { + int j_offset = 0; + for (int j = 0; j <= end; ++j) { +/* Initialize the variables of maximum score and location.*/ +#ifdef __AVX512F__ + __m512 max_score = _mm512_set1_ps(-std::numeric_limits::max()); + __m512i max_j = _mm512_setzero_si512(); +#else + __m256 max_score = _mm256_set1_ps(-std::numeric_limits::max()); + __m256i max_j = _mm256_set1_epi32(0); +#endif + /* Calculate the offset of transition_weights.*/ + int trans_offset = state_trans_base_idx * tag_num + j_offset; + for (int i = 0; i < tag_num; ++i) { +/* Initalize the content of alpha variable with related offset.*/ +#ifdef __AVX512F__ + __m512 alpha_content = _mm512_set1_ps(*(alpha + seq_offset + i)); + /* Obtain the content of weights from un-aligned address.*/ + __m512 w_content = _mm512_loadu_ps(w + trans_offset); + __m512 score_v = _mm512_add_ps(alpha_content, w_content); + __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS); + /* AVX512 instructions.*/ + max_j = _mm512_mask_set1_epi32(max_j, mask, i); + /* Update the max_score value.*/ + max_score = _mm512_max_ps(max_score, score_v); + +#else + __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i); + /* Obtain the content of weights from un-aligned address.*/ + __m256 w_content = _mm256_loadu_ps(w + trans_offset); + __m256 score_v = _mm256_add_ps(alpha_content, w_content); + __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS); +/* According to the mask value, update the index of the max_score.*/ +#ifdef __AVX2__ + max_j = _mm256_or_si256( + _mm256_andnot_si256((__m256i)mask, max_j), + _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i))); +#else + __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); + __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); + __m128i lo_mask = + _mm256_extractf128_si256(*(__m256i*)&mask, 0); // NOLINT + __m128i hi_mask = + _mm256_extractf128_si256(*(__m256i*)&mask, 1); // NOLINT + lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); + hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); + lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); + hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i)); + lo_max_j = _mm_or_si128(lo_mask, lo_max_j); + hi_max_j = _mm_or_si128(hi_mask, hi_max_j); + max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0); + max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1); +#endif + /* Update the max_score value.*/ + max_score = _mm256_max_ps(max_score, score_v); + +#endif + + trans_offset += tag_num; + } +/* Update the alpha and track values. */ +#ifdef __AVX512F__ + __m512 x_content = + _mm512_loadu_ps(x + seq_offset + this->num_ + j_offset); + max_score = _mm512_add_ps(max_score, x_content); + _mm512_storeu_ps(alpha + seq_offset + this->num_ + j_offset, max_score); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(track + seq_offset + + this->num_ + j_offset), + max_j); +#else + __m256 x_content = _mm256_loadu_ps(x + seq_offset + tag_num + j_offset); + max_score = _mm256_add_ps(max_score, x_content); + _mm256_storeu_ps(alpha + seq_offset + tag_num + j_offset, max_score); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(track + seq_offset + tag_num + j_offset), + max_j); +#endif + + /* Calculate the offset of next step*/ + j_offset += step_size; + if (j == end - 1) { + if (rest > 0) { + j_offset += last_offset; + } else { + break; + } + } + } + seq_offset += tag_num; } } -template <> -void VExp(const float* x, float* y, int n) { - platform::dynload::vsExp(n, x, y); -} - -template <> -void VExp(const double* x, double* y, int n) { - platform::dynload::vdExp(n, x, y); -} - -// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 -template <> -bool VMulKernel::UseMe(int d) const { - return platform::MayIUse(platform::avx512f) && d > 512; +bool CRFDecodingKernel::UseMe(int d) const { + return platform::MayIUse(platform::avx); } -template <> -bool VAddKernel::UseMe(int d) const { - return platform::MayIUse(platform::avx512f) && d > 512; -} - -template <> -bool VScalKernel::UseMe(int d) const { - return platform::MayIUse(platform::avx512f) && d > 512; -} - -template <> -bool VExpKernel::UseMe(int d) const { - return d > 7; -} - -template <> -bool VSigmoidKernel::UseMe(int d) const { - return d > 7; -} - -template <> -bool VTanhKernel::UseMe(int d) const { - return d > 7; -} - -#define AWALYS_USE_ME_WITH_DOUBLE(func) \ - template <> \ - bool func##Kernel::UseMe(int d) const { \ - return true; \ - } - -AWALYS_USE_ME_WITH_DOUBLE(VMul); -AWALYS_USE_ME_WITH_DOUBLE(VAdd); -AWALYS_USE_ME_WITH_DOUBLE(VScal); -AWALYS_USE_ME_WITH_DOUBLE(VExp); -AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); -AWALYS_USE_ME_WITH_DOUBLE(VTanh); - -#undef AWALYS_USE_ME_WITH_DOUBLE -} // namespace mkl +} // namespace intrinsic } // namespace more } // namespace jit } // namespace operators } // namespace paddle -namespace mkl = paddle::operators::jit::more::mkl; - -#define REGISTER_MKL_KERNEL(key, func) \ - REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel, \ - mkl::func##Kernel) - -REGISTER_MKL_KERNEL(vmul, VMul); -REGISTER_MKL_KERNEL(vadd, VAdd); -REGISTER_MKL_KERNEL(vscal, VScal); -REGISTER_MKL_KERNEL(vexp, VExp); -REGISTER_MKL_KERNEL(vsigmoid, VSigmoid); -REGISTER_MKL_KERNEL(vtanh, VTanh); +namespace intrinsic = paddle::operators::jit::more::intrinsic; -#undef REGISTER_MKL_KERNEL +REGISTER_JITKERNEL_MORE(crfdecoding, intrinsic, intrinsic::CRFDecodingKernel); diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h index bf209d2f9d..a4081cfc34 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h @@ -21,68 +21,18 @@ namespace paddle { namespace operators { namespace jit { namespace more { -namespace mkl { +namespace intrinsic { -template -void VMul(const T* x, const T* y, T* z, int n); +void CRFDecoding(const int seq_len, const float* x, const float* w, + float* alpha, int* track, int tag_num); -template -void VAdd(const T* x, const T* y, T* z, int n); +class CRFDecodingKernel : public KernelImpl> { + public: + CRFDecodingKernel() { this->func = CRFDecoding; } + bool UseMe(typename CRFDecodingTuples::attr_type) const override; +}; -template -void VScal(const T* a, const T* x, T* y, int n); - -template -void VExp(const T* x, T* y, int n); - -template -void VSigmoid(const T* x, T* y, int n) { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); - y[i] = static_cast(0) - y[i]; - } - VExp(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(1) / (static_cast(1) + y[i]); - } -} - -template -void VTanh(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoid(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} - -#define DECLARE_MKL_KERNEL(name, tuples) \ - template \ - class name##Kernel : public KernelImpl> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(typename tuples::attr_type) const override; \ - } - -// XYZN -DECLARE_MKL_KERNEL(VMul, XYZNTuples); -DECLARE_MKL_KERNEL(VAdd, XYZNTuples); - -// AXYN -DECLARE_MKL_KERNEL(VScal, AXYNTuples); - -// XYN -DECLARE_MKL_KERNEL(VExp, XYNTuples); -DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); -DECLARE_MKL_KERNEL(VTanh, XYNTuples); - -#undef DECLARE_MKL_KERNEL - -} // namespace mkl +} // namespace intrinsic } // namespace more } // namespace jit } // namespace operators From 62eb43ba98931f303127441b0f53f142b12f439f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 20:22:56 +0800 Subject: [PATCH 438/719] convert more test=develop --- paddle/fluid/framework/operator.cc | 35 ++++++++++++++---------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8c83748668..5bee6b41bd 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -142,12 +142,14 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, const Scope& scope) { for (auto& var_name_item : innames) { std::vector& input_vars = inputs[var_name_item.first]; + input_vars.reserve(var_name_item.second.size()); for (auto& var_name : var_name_item.second) { input_vars.push_back(scope.FindVar(var_name)); } } for (auto& var_name_item : outnames) { std::vector& output_vars = outputs[var_name_item.first]; + output_vars.reserve(var_name_item.second.size()); for (auto& var_name : var_name_item.second) { output_vars.push_back(scope.FindVar(var_name)); } @@ -556,30 +558,28 @@ class RuntimeInferShapeContext : public InferShapeContext { bool HasOutput(const std::string& name) const override { // has only one output - const auto& outs = op_.Outputs(); + const auto& outs = ctx_.outputs; auto it = outs.find(name); if (it == outs.end()) { return false; } const auto& out = it->second; - if (out.size() == 0 || out[0] == kEmptyVarName) { + if (out.size() == 0) { return false; } PADDLE_ENFORCE_EQ(out.size(), 1UL, "Output %s should not have more than one outputs", name); - return scope_.FindVar(out[0]) != nullptr; + return out[0] != nullptr; } bool HasInputs(const std::string& name) const override { - if (!op_.HasInputs(name)) { - return false; - } - auto inputs = op_.Inputs(name); - if (inputs.empty()) { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end()) { return false; } - for (auto& input : inputs) { - if (scope_.FindVar(input) == nullptr) { + for (auto& input : it->second) { + if (input == nullptr) { return false; } } @@ -587,15 +587,13 @@ class RuntimeInferShapeContext : public InferShapeContext { } bool HasOutputs(const std::string& name) const override { - if (!op_.HasOutputs(name)) { - return false; - } - auto outputs = op_.Outputs(name); - if (outputs.empty()) { + const auto& outs = ctx_.outputs; + auto it = outs.find(name); + if (it == outs.end()) { return false; } - for (auto& output : outputs) { - if (scope_.FindVar(output) == nullptr) { + for (auto& output : it->second) { + if (output == nullptr) { return false; } } @@ -864,8 +862,7 @@ Scope* OperatorWithKernel::PrepareData( for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; - auto* var = scope.FindVar(var_name); - input_vars[i] = var; + auto* var = input_vars[i]; // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { From 0e0983cc1d9a607ba8a339bbbe9e495e304cd11f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 21:27:04 +0800 Subject: [PATCH 439/719] convert more infer shape --- paddle/fluid/framework/operator.cc | 34 ++++++++++++++++++------------ 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 5bee6b41bd..a7bee3344d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -614,16 +614,19 @@ class RuntimeInferShapeContext : public InferShapeContext { void ShareDim(const std::string& in, const std::string& out, size_t i = 0, size_t j = 0) override { - PADDLE_ENFORCE_LT(i, Inputs(in).size()); - PADDLE_ENFORCE_LT(j, Outputs(out).size()); - const std::string& input_n = Inputs(in)[i]; - const std::string& output_n = Outputs(out)[j]; + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i, + "Inputs %s should have %llu argument", in, i); + PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j, + "Outputs %s should have %llu argument", out, j); + + Variable* in_var = in_it->second[i]; + Variable* out_var = out_it->second[j]; - Variable* in_var = scope_.FindVar(input_n); - Variable* out_var = scope_.FindVar(output_n); PADDLE_ENFORCE(in_var->Type() == out_var->Type(), - "The type of %s and %s is not the same.", output_n, - GetDim(input_n)); + "The type of %s and %s is not the same.", in_var->Type(), + out_var->Type()); if (in_var->IsType()) { auto& in_sele_rows = in_var->Get(); @@ -644,13 +647,16 @@ class RuntimeInferShapeContext : public InferShapeContext { void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, size_t j = 0) const override { - const std::vector& inputs = Inputs(in); - const std::vector& outputs = Outputs(out); - PADDLE_ENFORCE_LT(i, inputs.size()); - PADDLE_ENFORCE_LT(j, outputs.size()); - Variable* in_var = scope_.FindVar(inputs.at(i)); + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i, + "Inputs %s should have %llu argument", in, i); + PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j, + "Outputs %s should have %llu argument", out, j); + + Variable* in_var = in_it->second.at(i); if (!in_var->IsType()) return; - Variable* out_var = scope_.FindVar(outputs.at(j)); + Variable* out_var = out_it->second.at(j); PADDLE_ENFORCE(out_var->IsType(), "The %d-th output of Output(%s) must be LoDTensor.", j, out); auto in_tensor = in_var->Get(); From 52d3903a1208747c2e3c97b90bb0f48e08f7a85b Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 19 Dec 2018 10:03:36 +0800 Subject: [PATCH 440/719] fix test=develop --- paddle/fluid/framework/operator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a7bee3344d..e023d165b0 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -575,7 +575,7 @@ class RuntimeInferShapeContext : public InferShapeContext { bool HasInputs(const std::string& name) const override { const auto& ins = ctx_.inputs; auto it = ins.find(name); - if (it == ins.end()) { + if (it == ins.end() || it->second.empty()) { return false; } for (auto& input : it->second) { @@ -589,7 +589,7 @@ class RuntimeInferShapeContext : public InferShapeContext { bool HasOutputs(const std::string& name) const override { const auto& outs = ctx_.outputs; auto it = outs.find(name); - if (it == outs.end()) { + if (it == outs.end() || it->second.empty()) { return false; } for (auto& output : it->second) { From 2f3b5054ad9a4fb0f62450c6dca912e0c1306471 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 19 Dec 2018 10:31:40 +0800 Subject: [PATCH 441/719] fix build script --- python/setup.py.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/setup.py.in b/python/setup.py.in index bfbaa1d015..521d108b2c 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -156,8 +156,8 @@ if '${WITH_FLUID_ONLY}'== 'OFF': # put all thirdparty libraries in paddle.libs libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' +package_data['paddle.libs']= [] if os.name != 'nt': - package_data['paddle.libs']= [] package_data['paddle.libs']=['libwarpctc' + ext_name] shutil.copy('${WARPCTC_LIBRARIES}', libs_path) @@ -169,7 +169,7 @@ else: if os.name == 'nt': # copy the openblas.dll shutil.copy(os.path.dirname('${CBLAS_LIBRARIES}') + '/openblas' + ext_name, libs_path) - package_data['paddle.fluid'] += ['openblas' + ext_name] + package_data['paddle.libs'] += ['openblas' + ext_name] if '${WITH_MKLDNN}' == 'ON': if '${CMAKE_BUILD_TYPE}' == 'Release': From aa6e9c30becf0215870fd3633684c97a6d614263 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Wed, 19 Dec 2018 03:54:05 +0100 Subject: [PATCH 442/719] [MKL-DNN ]Added transpose/transpose2 Op (#14872) * - Added transpose MKLDNN Op - Few basic UT works - Added 1D transpose - implementing generic mem desc for MKLDNN transpose - Modified trnaspose op to support more dimensional data eg. 5,6..10 - Added is_test attribute to transpose op test=develop * - Added support for MKLDNN::memory::format::any for Transpose MKLDNN op test=develop * - Additional transpose mkldnn op correction to mkldnn layout test=develop * Cosmetic fixes test=develop * - Removed const_cast to obey coding standard test=develop --- paddle/fluid/operators/transpose_mkldnn_op.cc | 124 ++++++++++++++++++ paddle/fluid/operators/transpose_op.cc | 49 ++++++- .../unittests/test_transpose_mkldnn_op.py | 76 +++++++++++ .../tests/unittests/test_transpose_op.py | 13 +- 4 files changed, 258 insertions(+), 4 deletions(-) create mode 100644 paddle/fluid/operators/transpose_mkldnn_op.cc create mode 100644 python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py diff --git a/paddle/fluid/operators/transpose_mkldnn_op.cc b/paddle/fluid/operators/transpose_mkldnn_op.cc new file mode 100644 index 0000000000..37f1cadc7d --- /dev/null +++ b/paddle/fluid/operators/transpose_mkldnn_op.cc @@ -0,0 +1,124 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using framework::DataLayout; + +template +class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + const bool is_test = ctx.Attr("is_test"); + PADDLE_ENFORCE( + is_test == true, + "ConvTransposeMKLDNN works only for inference!. Set is_test = True"); + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + std::vector axis = ctx.Attr>("axis"); + int ndims = axis.size(); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + const T* input_data = input->data(); + + if (ndims == 1) { + output->ShareDataWith(*input); + return; + } + + std::vector nchw_axis(ndims, 0); + for (size_t i = 0; i < nchw_axis.size(); ++i) { + nchw_axis[i] = i; + } + + std::vector nchw_tz = paddle::framework::vectorize2int(input->dims()); + std::string data_format = ctx.Attr("data_format"); + + auto src_md = + input->format() != mkldnn::memory::format::nchw + ? platform::MKLDNNMemDesc(nchw_tz, platform::MKLDNNGetDataType(), + input->format()) + : Axis2MemoryDesc(nchw_tz, nchw_axis); + + this->TransposeKernel(ctx.GetPlace(), Axis2MemoryDesc(nchw_tz, axis), + src_md, output, input_data, nchw_tz, mkldnn_engine); + } + + protected: + mkldnn::memory::desc Axis2MemoryDesc(std::vector& nchw_tz, + std::vector& axis) const { + mkldnn_memory_desc_t mem_fmt; + + mem_fmt.primitive_kind = mkldnn_memory; + mem_fmt.ndims = axis.size(); + for (unsigned int i = 0; i < nchw_tz.size(); ++i) { + mem_fmt.dims[i] = nchw_tz[i]; // logical dimensions (nchw format, + // regardless physical layout) + } + mem_fmt.data_type = mkldnn_f32; + mem_fmt.format = mkldnn_blocked; + + unsigned int total_stride = 1; + for (int i = nchw_tz.size() - 1; i >= 0; --i) { + mem_fmt.layout_desc.blocking.padding_dims[i] = + nchw_tz[i]; // logical dimensions (nchw format, regardless physical + // layout) + mem_fmt.layout_desc.blocking.block_dims[i] = 1; + mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset + mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride; + mem_fmt.layout_desc.blocking.strides[1][axis[i]] = 1; + total_stride *= nchw_tz[axis[i]]; + } + mem_fmt.layout_desc.blocking.offset_padding = 0; // no initial offset + return mem_fmt; + } + + void TransposeKernel(platform::Place place, mkldnn::memory::desc md_o, + mkldnn::memory::desc md_i, Tensor* output, + const T* data_i, std::vector& nchw_dims, + const mkldnn::engine& eng) const { + // Make Memory primitive descriptors + auto mpd_o = mkldnn::memory::primitive_desc(md_o, eng); + auto mpd_i = mkldnn::memory::primitive_desc(md_i, eng); + + auto data_o = output->mutable_data( + place, paddle::memory::Allocator::kDefault, mpd_o.get_size()); + + auto src = mkldnn::memory(mpd_i, (T*)(data_i)); + auto dst = mkldnn::memory(mpd_o, data_o); + + auto r = mkldnn::reorder(src, dst); + mkldnn::stream(mkldnn::stream::kind::eager).submit({r}).wait(); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace, + ops::TransposeMKLDNNOpKernel); +REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace, + ops::TransposeMKLDNNOpKernel); diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index bc1f59bc1a..b3b379d16f 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -16,6 +16,10 @@ limitations under the License. */ #include #include +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { @@ -53,11 +57,32 @@ class TransposeOp : public framework::OperatorWithKernel { } ctx->SetOutputDim("Out", out_dims); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + } +#endif + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), layout_, library_); + } }; class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); AddInput( "X", "(Tensor) The input tensor, tensors with rank up to 6 are supported."); @@ -67,6 +92,16 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { "(vector) A list of values, and the size of the list should be " "the same with the input tensor rank. This operator permutes the input " "tensor's axes according to the values given."); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); AddComment(R"DOC( Transpose Operator. @@ -144,8 +179,18 @@ class Transpose2Op : public TransposeOp { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + } +#endif + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), layout_, library_); } }; diff --git a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py new file mode 100644 index 0000000000..61ac879011 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py @@ -0,0 +1,76 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +from test_transpose_op import TestTransposeOp + + +class TestTransposeMKLDNN(TestTransposeOp): + def init_op_type(self): + self.op_type = "transpose2" + self.use_mkldnn = True + self.is_test = True + return + + def test_check_grad(self): + return + + def test_check_grad_no_input(self): + return + + def test_check_grad_no_filter(self): + return + + +class TestCase0MKLDNN(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (3, ) + self.axis = (0, ) + + +class TestCase1a(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (3, 4, 5) + self.axis = (0, 2, 1) + + +class TestCase1b(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (3, 4, 5) + self.axis = (2, 1, 0) + + +class TestCase2(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (2, 3, 4, 5) + self.axis = (0, 2, 3, 1) + + +class TestCase3(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.axis = (4, 2, 3, 1, 0) + + +class TestCase4(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6, 1) + self.axis = (4, 2, 3, 1, 0, 5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index bbcabb751f..93be9d28da 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -21,15 +21,24 @@ from op_test import OpTest class TestTransposeOp(OpTest): def setUp(self): + self.init_op_type() self.initTestCase() - self.op_type = "transpose2" self.inputs = {'X': np.random.random(self.shape).astype("float32")} - self.attrs = {'axis': list(self.axis)} + self.attrs = { + 'axis': list(self.axis), + 'use_mkldnn': self.use_mkldnn, + 'is_test': self.is_test, + } self.outputs = { 'XShape': np.random.random(self.shape).astype("float32"), 'Out': self.inputs['X'].transpose(self.axis) } + def init_op_type(self): + self.op_type = "transpose2" + self.use_mkldnn = False + self.is_test = False + def test_check_output(self): self.check_output(no_check_set=['XShape']) From 4dd61e7260314faa4b9b8f5a4c5406af013d919e Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 19 Dec 2018 11:07:16 +0800 Subject: [PATCH 443/719] convert GetInputVarPtrs and GetOutputVarPtrs test=develop --- paddle/fluid/framework/op_desc.cc | 31 ++++++++++++++----- paddle/fluid/framework/operator.cc | 36 +++++++++++++++++++++-- paddle/fluid/framework/shape_inference.cc | 22 -------------- paddle/fluid/framework/shape_inference.h | 31 ++++++++++--------- 4 files changed, 74 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index dde642764f..0a3bb586fc 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -110,6 +110,30 @@ class CompileTimeInferShapeContext : public InferShapeContext { } } + std::vector GetInputVarPtrs( + const std::string &name) override { + const std::vector arg_names = Inputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { + return block_.FindVarRecursive(name); + }); + return res; + } + + std::vector GetOutputVarPtrs( + const std::string &name) override { + const std::vector arg_names = Outputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { + return block_.FindVarRecursive(name); + }); + return res; + } + bool IsRuntime() const override; protected: @@ -124,8 +148,6 @@ class CompileTimeInferShapeContext : public InferShapeContext { void SetRepeatedDims(const std::string &name, const std::vector &dims) override; - InferShapeVarPtr GetVarPtr(const std::string &name) override; - const OpDesc &op_; const BlockDesc &block_; }; @@ -696,10 +718,5 @@ proto::VarType::Type CompileTimeInferShapeContext::GetVarType( return block_.FindVarRecursive(name)->GetType(); } -InferShapeVarPtr CompileTimeInferShapeContext::GetVarPtr( - const std::string &name) { - return block_.FindVarRecursive(name); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e023d165b0..4ccef3105c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -691,6 +691,25 @@ class RuntimeInferShapeContext : public InferShapeContext { bool IsRuntime() const override { return true; } + // TODO(paddle-dev): Can this be template? + std::vector GetInputVarPtrs( + const std::string& name) override { + const std::vector& vars = InputVars(name); + std::vector res; + res.reserve(vars.size()); + res.insert(res.begin(), vars.begin(), vars.end()); + return res; + } + + std::vector GetOutputVarPtrs( + const std::string& name) override { + const std::vector& vars = OutputVars(name); + std::vector res; + res.reserve(vars.size()); + res.insert(res.begin(), vars.begin(), vars.end()); + return res; + } + protected: DDim GetDim(const std::string& name) const override { Variable* var = scope_.FindVar(name); @@ -733,11 +752,22 @@ class RuntimeInferShapeContext : public InferShapeContext { return ToVarType(var->Type()); } - InferShapeVarPtr GetVarPtr(const std::string& name) override { - return scope_.FindVar(name); + private: + const std::vector& InputVars(const std::string& name) const { + auto it = ctx_.inputs.find(name); + PADDLE_ENFORCE(it != ctx_.inputs.end(), + "Operator %s does not have the input %s.", op_.Type(), name); + return it->second; + } + + const std::vector& OutputVars(const std::string& name) const { + auto it = ctx_.outputs.find(name); + PADDLE_ENFORCE(it != ctx_.outputs.end(), + "Operator %s does not have the outputs %s.", op_.Type(), + name); + return it->second; } - private: const OperatorBase& op_; const Scope& scope_; const RuntimeContext& ctx_; diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc index ddff2c7c26..0a7cebcc5a 100644 --- a/paddle/fluid/framework/shape_inference.cc +++ b/paddle/fluid/framework/shape_inference.cc @@ -76,28 +76,6 @@ void InferShapeContext::SetReaderDims(const std::string &name, return this->SetRepeatedDims(arg_names[0], dims); } -std::vector InferShapeContext::GetInputVarPtrs( - const std::string &name) { - const std::vector arg_names = Inputs(name); - std::vector res; - res.reserve(arg_names.size()); - std::transform( - arg_names.begin(), arg_names.end(), std::back_inserter(res), - [this](const std::string &name) { return this->GetVarPtr(name); }); - return res; -} - -std::vector InferShapeContext::GetOutputVarPtrs( - const std::string &name) { - const std::vector arg_names = Outputs(name); - std::vector res; - res.reserve(arg_names.size()); - std::transform( - arg_names.begin(), arg_names.end(), std::back_inserter(res), - [this](const std::string &name) { return this->GetVarPtr(name); }); - return res; -} - std::vector InferShapeContext::GetDims( const std::vector &names) const { std::vector ret; diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index d73cca121e..543696d43b 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -33,22 +33,24 @@ class InferShapeContext { virtual bool HasInput(const std::string &name) const = 0; virtual bool HasOutput(const std::string &name) const = 0; - std::vector GetInputsVarType( + virtual std::vector GetInputsVarType( const std::string &name) const; - std::vector GetOutputsVarType( + virtual std::vector GetOutputsVarType( const std::string &name) const; virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0; - DDim GetInputDim(const std::string &name) const; - std::vector GetInputsDim(const std::string &name) const; - std::vector GetReaderDims(const std::string &name) const; - DDim GetInputsElementDim(const std::string &name, int idx) const; + virtual DDim GetInputDim(const std::string &name) const; + virtual std::vector GetInputsDim(const std::string &name) const; + virtual std::vector GetReaderDims(const std::string &name) const; + virtual DDim GetInputsElementDim(const std::string &name, int idx) const; - void SetOutputDim(const std::string &name, const DDim &dim); - void SetOutputsDim(const std::string &name, const std::vector &dims); - void SetReaderDims(const std::string &name, const std::vector &dims); + virtual void SetOutputDim(const std::string &name, const DDim &dim); + virtual void SetOutputsDim(const std::string &name, + const std::vector &dims); + virtual void SetReaderDims(const std::string &name, + const std::vector &dims); virtual AttrReader Attrs() const = 0; virtual const std::vector &Inputs( @@ -67,13 +69,14 @@ class InferShapeContext { virtual bool IsRuntime() const = 0; - std::vector GetInputVarPtrs(const std::string &name); - std::vector GetOutputVarPtrs(const std::string &name); - virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0; + virtual std::vector GetInputVarPtrs( + const std::string &name) = 0; + virtual std::vector GetOutputVarPtrs( + const std::string &name) = 0; // Note: In while op, we need this to be public - void SetDims(const std::vector &names, - const std::vector &dims); + virtual void SetDims(const std::vector &names, + const std::vector &dims); protected: virtual DDim GetDim(const std::string &name) const = 0; From b849157e9d3584a8d4b891340706c181c542deb0 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 19 Dec 2018 11:44:48 +0800 Subject: [PATCH 444/719] Add size enforce (#14919) --- .../distributed/brpc_sendrecvop_utils.cc | 23 ++++++++++++++----- .../fluid/operators/distributed/grpc_serde.cc | 8 +++++++ .../operators/distributed/sendrecvop_utils.h | 9 ++++++-- .../distributed/variable_response.cc | 2 +- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc index 6fed9ba92c..e4604db3a3 100644 --- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #endif #include +#include #include // NOLINT #include "paddle/fluid/framework/data_type.h" @@ -31,7 +32,12 @@ namespace distributed { class IOBufWriter { public: - static void Append(butil::IOBuf* iobuf, int k, const char* v, int64_t vlen) { + static void Append(const std::string& varname, butil::IOBuf* iobuf, int k, + const char* v, int64_t vlen) { + if (vlen >= std::numeric_limits::max() || vlen < 0) { + LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen; + } + iobuf->append(reinterpret_cast(&k), 4); iobuf->append(reinterpret_cast(&vlen), 8); iobuf->append(v, vlen); @@ -87,6 +93,10 @@ class IOBufWriter { int k, const char* v, int64_t vlen, bool in_cuda_pinned, void (*destroy)(void*), void* user_data) { + if (vlen >= std::numeric_limits::max() || vlen < 0) { + LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen; + } + #ifdef PADDLE_WITH_BRPC_RDMA IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, destroy, user_data); @@ -134,7 +144,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, request->set_type(::sendrecv::NCCL_ID); const ncclUniqueId& uid = var->Get(); // TODO(gongwb): use append_zero to avoid data copy. - IOBufWriter::Append(iobuf, + IOBufWriter::Append(name, iobuf, sendrecv::VariableMessage::kSerializedFieldNumber, uid.internal, NCCL_UNIQUE_ID_BYTES); return; @@ -149,7 +159,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, // FIXME(gongwb): it seems that can use zero copy. if (var_is_not_stable) { IOBufWriter::Append( - iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, static_cast(payload->ptr()), payload->memory_size()); } else { if (platform::is_gpu_place(ctx.GetPlace())) { @@ -171,10 +181,11 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, if (var->IsType()) { auto* slr = var->GetMutable(); - size_t rows_memory_size = - slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name()); + size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber, + IOBufWriter::Append(name, iobuf, + ::sendrecv::VariableMessage::kRowsFieldNumber, reinterpret_cast(slr->rows().data()), static_cast(rows_memory_size)); } diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index 299dfe3543..a9dea9cfd2 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif +#include #include // NOLINT #include "google/protobuf/io/coded_stream.h" @@ -102,6 +103,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload->memory_size()); + if (payload->memory_size() >= std::numeric_limits::max()) { + LOG(FATAL) << "AppendZeroCopy varname:" << name + << ", vlen:" << payload->memory_size(); + } // steal reference of tensor data ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows int num_slices = 2; // only SelectedRows have rows buffer @@ -115,7 +120,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, if (var->IsType()) { auto* slr = var->GetMutable(); ProtoEncodeHelper e2(static_cast(buf), 128); + + PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name()); size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); + e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); slices[2] = ::grpc::Slice(e2.size()); memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 33eded0e6c..6a87178be5 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include +#include #include #include "paddle/fluid/framework/data_type.h" @@ -23,9 +24,8 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/port.h" - #include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { @@ -83,6 +83,11 @@ inline framework::proto::VarType::Type ToVarType( } } +template