From 52bc4ee75adf64e449dfdbbdbbe3e41cdc593bdc Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 16 Dec 2018 20:27:17 +0800 Subject: [PATCH 01/13] delay infer scope test=develop --- paddle/fluid/framework/operator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a62afe248b..86e1713b02 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -703,8 +703,6 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - RuntimeInferShapeContext infer_shape_ctx(*this, scope); - this->InferShape(&infer_shape_ctx); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -758,6 +756,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(expected_kernel_key.place_); } + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope); + this->InferShape(&infer_shape_ctx); kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); if (!transfered_inplace_vars.empty()) { From bbff0df320f0f68634a5ae3c4d9507b52a1134f7 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 16 Dec 2018 21:49:25 +0800 Subject: [PATCH 02/13] try cache variables test=develop --- paddle/fluid/framework/ngraph_operator.cc | 15 +++++++- paddle/fluid/framework/operator.cc | 47 ++++++++++++++++------- paddle/fluid/framework/operator.h | 22 ++++++++--- paddle/fluid/framework/type_defs.h | 3 ++ 4 files changed, 66 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index e2cdfc845f..e37f0915c5 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -278,7 +278,20 @@ std::shared_ptr NgraphEngine::backend_ = ngraph::runtime::Backend::create("CPU"); void NgraphEngine::GetNgInputShape(std::shared_ptr op) { - op->RuntimeInferShape(scope_, place_); + RuntimeContext ctx; + for (auto& var_name_item : op->Inputs()) { + std::vector input_vars = ctx.inputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + input_vars.push_back(scope_.FindVar(var_name)); + } + } + for (auto& var_name_item : op->Outputs()) { + std::vector output_vars = ctx.outputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + output_vars.push_back(scope_.FindVar(var_name)); + } + } + op->RuntimeInferShape(scope_, place_, ctx); for (auto& var_name_item : op->Inputs()) { for (auto& var_name : var_name_item.second) { auto* var = scope_.FindVar(var_name); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 86e1713b02..79e3d29a63 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -477,23 +477,22 @@ bool OpSupportGPU(const std::string& op_type) { class RuntimeInferShapeContext : public InferShapeContext { public: - RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) - : op_(op), scope_(scope) {} + RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope, + const RuntimeContext& ctx) + : op_(op), scope_(scope), ctx_(ctx) {} bool HasInput(const std::string& name) const override { // has only one input - const auto& ins = op_.Inputs(); + const auto& ins = ctx_.inputs; auto it = ins.find(name); if (it == ins.end()) { return false; } const auto& in = it->second; - if (in.size() == 0 || in[0] == 
kEmptyVarName) { - return false; - } + if (in.size() == 0) return false; PADDLE_ENFORCE_EQ(in.size(), 1UL, "Input %s should not have more than one inputs", name); - return scope_.FindVar(in[0]) != nullptr; + return in[0] != nullptr; } bool HasOutput(const std::string& name) const override { @@ -678,6 +677,7 @@ class RuntimeInferShapeContext : public InferShapeContext { private: const OperatorBase& op_; const Scope& scope_; + const RuntimeContext& ctx_; }; static void CheckTensorNANOrInf(const std::string& name, @@ -696,8 +696,9 @@ static void CheckTensorNANOrInf(const std::string& name, } void OperatorWithKernel::RuntimeInferShape(const Scope& scope, - const platform::Place& place) const { - RuntimeInferShapeContext infer_shape_ctx(*this, scope); + const platform::Place& place, + const RuntimeContext& ctx) const { + RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx); this->InferShape(&infer_shape_ctx); } @@ -743,10 +744,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, KernelTypeToString(expected_kernel_key)); } + RuntimeContext ctx; // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; auto* transfer_scope = - TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); + PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx); // exec scope is the scope that kernel actually executed on. const Scope& exec_scope = @@ -756,7 +758,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(expected_kernel_key.place_); } - RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope); + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); this->InferShape(&infer_shape_ctx); kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); @@ -797,13 +799,20 @@ void OperatorWithKernel::TransferInplaceVarsBack( } } -Scope* OperatorWithKernel::TryTransferData( +Scope* OperatorWithKernel::PrepareData( const Scope& scope, const OpKernelType& expected_kernel_key, - std::vector* transfered_inplace_vars) const { + std::vector* transfered_inplace_vars, + RuntimeContext* ctx) const { Scope* new_scope = nullptr; for (auto& var_name_item : Inputs()) { - for (auto& var_name : var_name_item.second) { + std::vector& input_vars = ctx->inputs[var_name_item.first]; + input_vars.resize(var_name_item.second.size()); + + for (size_t i = 0; i < var_name_item.second.size(); ++i) { + auto& var_name = var_name_item.second[i]; auto* var = scope.FindVar(var_name); + input_vars[i] = var; + // Only tensor can be tranfer to another device. 
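      // Note (editor's sketch of the flow): the Variable* cached just above in
      // the RuntimeContext is what RuntimeInferShapeContext and the kernel
      // later reuse, avoiding a repeated Scope::FindVar lookup on the hot path.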
if (var == nullptr || !VarIsTensor(*var)) { continue; @@ -851,12 +860,22 @@ Scope* OperatorWithKernel::TryTransferData( } auto* trans_var = new_scope->Var(var_name); + input_vars[i] = var; Tensor out; TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); SetTensorToVariable(*var, out, trans_var); } } + for (auto& var_name_item : Outputs()) { + std::vector& output_vars = ctx->outputs[var_name_item.first]; + output_vars.resize(var_name_item.second.size()); + + for (size_t i = 0; i < var_name_item.second.size(); ++i) { + auto& var_name = var_name_item.second[i]; + output_vars[i] = scope.FindVar(var_name); + } + } return new_scope; } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a6a28a5bc..438ae25398 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -70,6 +70,14 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var); class OperatorBase; class ExecutionContext; +class RuntimeContext { + public: + RuntimeContext() {} + + VariableValueMap inputs; + VariableValueMap outputs; +}; + /** * OperatorBase has the basic elements that Net will call to do computation. * Only CreateOperator from OpRegistry will new Operator directly. User @@ -129,7 +137,8 @@ class OperatorBase { void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } virtual void RuntimeInferShape(const Scope& scope, - const platform::Place& place) const {} + const platform::Place& place, + const RuntimeContext& ctx) const {} protected: std::string type_; @@ -350,8 +359,8 @@ class OperatorWithKernel : public OperatorBase { OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); } - void RuntimeInferShape(const Scope& scope, - const platform::Place& place) const override; + void RuntimeInferShape(const Scope& scope, const platform::Place& place, + const RuntimeContext& ctx) const override; protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; @@ -371,9 +380,10 @@ class OperatorWithKernel : public OperatorBase { * * * transfered_inplace_vars is a output vector. */ - Scope* TryTransferData( - const Scope& scope, const OpKernelType& expected_kernel_key, - std::vector* transfered_inplace_vars) const; + Scope* PrepareData(const Scope& scope, + const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars, + RuntimeContext* ctx) const; void TransferInplaceVarsBack(const Scope& scope, const std::vector& inplace_vars, diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 2de6233a9e..938e2024c3 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -28,8 +28,11 @@ class OperatorBase; class OpDesc; class InferShapeContext; class BlockDesc; +class Variable; using VariableNameMap = std::map>; +// TODO(panyx0718): Replace vector with something like gtl::Vector. 
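+// For illustration (hypothetical names): an op with Input("X") = {"x0", "x1"}
+// gets inputs = {"X": {scope.FindVar("x0"), scope.FindVar("x1")}} in its
+// RuntimeContext, so kernels can reach each Variable* directly instead of
+// re-querying the Scope by name on every access.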
+using VariableValueMap = std::map<std::string, std::vector<Variable*>>;
 
 // The order should be as same as framework.proto
 using Attribute =
From 840e6729e224d867386bdfc9ff12af4b71ee7188 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Mon, 17 Dec 2018 21:27:56 +0800
Subject: [PATCH 03/13] inject context

test=develop
---
 paddle/fluid/framework/ngraph_operator.cc     | 14 +-------
 paddle/fluid/framework/operator.cc            | 36 +++++++++++--------
 paddle/fluid/framework/operator.h             |  9 +++--
 .../fluid/operators/beam_search_decode_op.cc  |  3 +-
 4 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index e37f0915c5..23f681ce88 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -278,19 +278,7 @@ std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
     ngraph::runtime::Backend::create("CPU");
 
 void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
-  RuntimeContext ctx;
-  for (auto& var_name_item : op->Inputs()) {
-    std::vector<Variable*> input_vars = ctx.inputs[var_name_item.first];
-    for (auto& var_name : var_name_item.second) {
-      input_vars.push_back(scope_.FindVar(var_name));
-    }
-  }
-  for (auto& var_name_item : op->Outputs()) {
-    std::vector<Variable*> output_vars = ctx.outputs[var_name_item.first];
-    for (auto& var_name : var_name_item.second) {
-      output_vars.push_back(scope_.FindVar(var_name));
-    }
-  }
+  RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
   op->RuntimeInferShape(scope_, place_, ctx);
   for (auto& var_name_item : op->Inputs()) {
     for (auto& var_name : var_name_item.second) {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 79e3d29a63..461d357527 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -137,6 +137,23 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
   }
 }
 
+RuntimeContext::RuntimeContext(const VariableNameMap& innames,
+                               const VariableNameMap& outnames,
+                               const Scope& scope) {
+  for (auto& var_name_item : innames) {
+    std::vector<Variable*>& input_vars = inputs[var_name_item.first];
+    for (auto& var_name : var_name_item.second) {
+      input_vars.push_back(scope.FindVar(var_name));
+    }
+  }
+  for (auto& var_name_item : outnames) {
+    std::vector<Variable*>& output_vars = outputs[var_name_item.first];
+    for (auto& var_name : var_name_item.second) {
+      output_vars.push_back(scope.FindVar(var_name));
+    }
+  }
+}
+
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
   VLOG(4) << place << " " << DebugStringEx(&scope);
   if (platform::is_gpu_place(place)) {
@@ -704,6 +721,7 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
 
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
+  RuntimeContext ctx(Inputs(), Outputs(), scope);
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
@@ -717,15 +735,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   OpKernelMap& kernels = kernels_iter->second;
 
-  // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
-  // transform functions are ready.
- - // for (auto& candidate : kKernelPriority) { - // Do selection - // } - - auto expected_kernel_key = - this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); + auto expected_kernel_key = this->GetExpectedKernelType( + ExecutionContext(*this, scope, *dev_ctx, ctx)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -744,7 +755,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, KernelTypeToString(expected_kernel_key)); } - RuntimeContext ctx; // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; auto* transfer_scope = @@ -760,7 +770,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); this->InferShape(&infer_shape_ctx); - kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx)); if (!transfered_inplace_vars.empty()) { // there is inplace variable has been transfered. @@ -784,6 +794,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } } + void OperatorWithKernel::TransferInplaceVarsBack( const Scope& scope, const std::vector& inplace_vars, const Scope& transfer_scope) const { @@ -806,7 +817,6 @@ Scope* OperatorWithKernel::PrepareData( Scope* new_scope = nullptr; for (auto& var_name_item : Inputs()) { std::vector& input_vars = ctx->inputs[var_name_item.first]; - input_vars.resize(var_name_item.second.size()); for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; @@ -869,8 +879,6 @@ Scope* OperatorWithKernel::PrepareData( } for (auto& var_name_item : Outputs()) { std::vector& output_vars = ctx->outputs[var_name_item.first]; - output_vars.resize(var_name_item.second.size()); - for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; output_vars[i] = scope.FindVar(var_name); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 438ae25398..e359414d15 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -72,7 +72,8 @@ class ExecutionContext; class RuntimeContext { public: - RuntimeContext() {} + RuntimeContext(const VariableNameMap& innames, + const VariableNameMap& outnames, const Scope& scope); VariableValueMap inputs; VariableValueMap outputs; @@ -165,8 +166,9 @@ class OperatorBase { class ExecutionContext { public: ExecutionContext(const OperatorBase& op, const Scope& scope, - const platform::DeviceContext& device_context) - : op_(op), scope_(scope), device_context_(device_context) {} + const platform::DeviceContext& device_context, + const RuntimeContext& ctx) + : op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {} const OperatorBase& op() const { return op_; } @@ -295,6 +297,7 @@ class ExecutionContext { const OperatorBase& op_; const Scope& scope_; const platform::DeviceContext& device_context_; + const RuntimeContext& ctx_; }; template <> diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index ae9765b761..7f2bde55c9 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -122,7 +122,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& dev_ctx = *pool.Get(dev_place); - framework::ExecutionContext ctx(*this, scope, 
dev_ctx); + framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); + framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); const LoDTensorArray* ids = ctx.Input("Ids"); const LoDTensorArray* scores = ctx.Input("Scores"); From eaf8ba35b519b780629a7108d08ffd3895ac18fe Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 09:42:57 +0800 Subject: [PATCH 04/13] change input test=develop --- paddle/fluid/framework/operator.cc | 50 ++++++++++++++++++++++++++++++ paddle/fluid/framework/operator.h | 33 +++++++++++++++----- paddle/fluid/operators/prelu_op.cc | 2 +- 3 files changed, 76 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 461d357527..87f61f3afc 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -143,12 +143,14 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, for (auto& var_name_item : innames) { std::vector& input_vars = inputs[var_name_item.first]; for (auto& var_name : var_name_item.second) { + LOG(ERROR) << "first in " << var_name_item.first << ":" << var_name; input_vars.push_back(scope.FindVar(var_name)); } } for (auto& var_name_item : outnames) { std::vector& output_vars = outputs[var_name_item.first]; for (auto& var_name : var_name_item.second) { + LOG(ERROR) << "first out " << var_name_item.first << ":" << var_name; output_vars.push_back(scope.FindVar(var_name)); } } @@ -429,11 +431,52 @@ bool ExecutionContext::HasOutput(const std::string& name) const { return var != nullptr; } +const Variable* ExecutionContext::InputVar(const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's input %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? nullptr : it->second[0]; +} + +Variable* ExecutionContext::OutputVar(const std::string& name) const { + auto opt = op_.Output(name); + return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); +} + +const Variable* ExecutionContext::FastInputVar(const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's input %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? nullptr : it->second[0]; +} + +Variable* ExecutionContext::FastOutputVar(const std::string& name) const { + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's output %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? 
nullptr : it->second[0]; +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const { return Input(name); } +template <> +const Tensor* ExecutionContext::FastInput( + const std::string& name) const { + return FastInput(name); +} + template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const { @@ -458,6 +501,11 @@ Tensor* ExecutionContext::Output(const std::string& name) const { return Output(name); } +template <> +Tensor* ExecutionContext::FastOutput(const std::string& name) const { + return FastOutput(name); +} + template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const { @@ -822,6 +870,7 @@ Scope* OperatorWithKernel::PrepareData( auto& var_name = var_name_item.second[i]; auto* var = scope.FindVar(var_name); input_vars[i] = var; + LOG(ERROR) << "second in " << var_name_item.first << ":" << var_name; // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { @@ -882,6 +931,7 @@ Scope* OperatorWithKernel::PrepareData( for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; output_vars[i] = scope.FindVar(var_name); + LOG(ERROR) << "second out " << var_name_item.first << ":" << var_name; } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e359414d15..0aad91dbee 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -191,15 +191,9 @@ class ExecutionContext { return op_.Outputs(name).size(); } - const Variable* InputVar(const std::string& name) const { - auto ipt = op_.Input(name); - return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - } + const Variable* InputVar(const std::string& name) const; - Variable* OutputVar(const std::string& name) const { - auto opt = op_.Output(name); - return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); - } + Variable* OutputVar(const std::string& name) const; const std::vector MultiInputVar( const std::string& name) const { @@ -238,6 +232,22 @@ class ExecutionContext { return var == nullptr ? nullptr : var->GetMutable(); } + template + const T* FastInput(const std::string& name) const { + auto* var = FastInputVar(name); + return var == nullptr ? nullptr : &var->Get(); + } + + template + T* FastOutput(const std::string& name) const { + auto var = FastOutputVar(name); + return var == nullptr ? 
nullptr : var->GetMutable(); + } + + const Variable* FastInputVar(const std::string& name) const; + + Variable* FastOutputVar(const std::string& name) const; + template const std::vector MultiInput(const std::string& name) const { auto names = op_.Inputs(name); @@ -303,6 +313,10 @@ class ExecutionContext { template <> const Tensor* ExecutionContext::Input(const std::string& name) const; +template <> +const Tensor* ExecutionContext::FastInput( + const std::string& name) const; + template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const; @@ -310,6 +324,9 @@ const std::vector ExecutionContext::MultiInput( template <> Tensor* ExecutionContext::Output(const std::string& name) const; +template <> +Tensor* ExecutionContext::FastOutput(const std::string& name) const; + template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const; diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 62c55c4f55..b6155ed3dd 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -56,7 +56,7 @@ class PReluOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), + return framework::OpKernelType(ctx.FastInput("X")->type(), ctx.device_context()); } }; From fb8ae30331f42b6b9ef67c80e0ccb3fffcbf9836 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 12:35:45 +0800 Subject: [PATCH 05/13] fix test=develop --- paddle/fluid/framework/operator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 87f61f3afc..807667e684 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -919,7 +919,7 @@ Scope* OperatorWithKernel::PrepareData( } auto* trans_var = new_scope->Var(var_name); - input_vars[i] = var; + input_vars[i] = trans_var; Tensor out; TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); From 70981f5d799b5ab1593743b6ec88af6c40698a3b Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 15:30:23 +0800 Subject: [PATCH 06/13] clean test=develop --- paddle/fluid/framework/operator.cc | 36 ++++++++++++------------------ paddle/fluid/framework/operator.h | 16 ++++++------- paddle/fluid/operators/prelu_op.cc | 2 +- 3 files changed, 23 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 807667e684..7d5a6198a0 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -143,14 +143,12 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, for (auto& var_name_item : innames) { std::vector& input_vars = inputs[var_name_item.first]; for (auto& var_name : var_name_item.second) { - LOG(ERROR) << "first in " << var_name_item.first << ":" << var_name; input_vars.push_back(scope.FindVar(var_name)); } } for (auto& var_name_item : outnames) { std::vector& output_vars = outputs[var_name_item.first]; for (auto& var_name : var_name_item.second) { - LOG(ERROR) << "first out " << var_name_item.first << ":" << var_name; output_vars.push_back(scope.FindVar(var_name)); } } @@ -441,22 +439,13 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const { return it->second.empty() ? 
nullptr : it->second[0]; } -Variable* ExecutionContext::OutputVar(const std::string& name) const { - auto opt = op_.Output(name); - return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); -} - -const Variable* ExecutionContext::FastInputVar(const std::string& name) const { - auto it = ctx_.inputs.find(name); - if (it == ctx_.inputs.end()) return nullptr; - - PADDLE_ENFORCE_LE(it->second.size(), 1UL, - "Operator %s's input %s should contain only one variable.", - op_.Type(), name); - return it->second.empty() ? nullptr : it->second[0]; +const Variable* ExecutionContext::LegacyInputVar( + const std::string& name) const { + auto ipt = op_.Input(name); + return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); } -Variable* ExecutionContext::FastOutputVar(const std::string& name) const { +Variable* ExecutionContext::OutputVar(const std::string& name) const { auto it = ctx_.outputs.find(name); if (it == ctx_.outputs.end()) return nullptr; @@ -466,15 +455,20 @@ Variable* ExecutionContext::FastOutputVar(const std::string& name) const { return it->second.empty() ? nullptr : it->second[0]; } +Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const { + auto opt = op_.Output(name); + return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const { return Input(name); } template <> -const Tensor* ExecutionContext::FastInput( +const Tensor* ExecutionContext::LegacyInput( const std::string& name) const { - return FastInput(name); + return LegacyInput(name); } template <> @@ -502,8 +496,8 @@ Tensor* ExecutionContext::Output(const std::string& name) const { } template <> -Tensor* ExecutionContext::FastOutput(const std::string& name) const { - return FastOutput(name); +Tensor* ExecutionContext::LegacyOutput(const std::string& name) const { + return LegacyOutput(name); } template <> @@ -870,7 +864,6 @@ Scope* OperatorWithKernel::PrepareData( auto& var_name = var_name_item.second[i]; auto* var = scope.FindVar(var_name); input_vars[i] = var; - LOG(ERROR) << "second in " << var_name_item.first << ":" << var_name; // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { @@ -931,7 +924,6 @@ Scope* OperatorWithKernel::PrepareData( for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; output_vars[i] = scope.FindVar(var_name); - LOG(ERROR) << "second out " << var_name_item.first << ":" << var_name; } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0aad91dbee..39190d07b4 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -233,20 +233,20 @@ class ExecutionContext { } template - const T* FastInput(const std::string& name) const { - auto* var = FastInputVar(name); + const T* LegacyInput(const std::string& name) const { + auto* var = LegacyInputVar(name); return var == nullptr ? nullptr : &var->Get(); } template - T* FastOutput(const std::string& name) const { - auto var = FastOutputVar(name); + T* LegacyOutput(const std::string& name) const { + auto var = LegacyOutputVar(name); return var == nullptr ? 
nullptr : var->GetMutable(); } - const Variable* FastInputVar(const std::string& name) const; + const Variable* LegacyInputVar(const std::string& name) const; - Variable* FastOutputVar(const std::string& name) const; + Variable* LegacyOutputVar(const std::string& name) const; template const std::vector MultiInput(const std::string& name) const { @@ -314,7 +314,7 @@ template <> const Tensor* ExecutionContext::Input(const std::string& name) const; template <> -const Tensor* ExecutionContext::FastInput( +const Tensor* ExecutionContext::LegacyInput( const std::string& name) const; template <> @@ -325,7 +325,7 @@ template <> Tensor* ExecutionContext::Output(const std::string& name) const; template <> -Tensor* ExecutionContext::FastOutput(const std::string& name) const; +Tensor* ExecutionContext::LegacyOutput(const std::string& name) const; template <> std::vector ExecutionContext::MultiOutput( diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index b6155ed3dd..62c55c4f55 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -56,7 +56,7 @@ class PReluOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(ctx.FastInput("X")->type(), + return framework::OpKernelType(ctx.Input("X")->type(), ctx.device_context()); } }; From f897bd16c0e4deb683075e137e7bfe5890488205 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 18 Dec 2018 15:40:23 +0800 Subject: [PATCH 07/13] clean test=develop --- paddle/fluid/framework/operator.cc | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 7d5a6198a0..8c83748668 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -812,6 +812,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); this->InferShape(&infer_shape_ctx); + // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext + // not Scope. Imperative mode only pass inputs and get outputs. 
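+  // The cached RuntimeContext is also handed to the kernel, so
+  // ExecutionContext::InputVar/OutputVar resolve against ctx_ rather than
+  // performing a Scope lookup per access.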
kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx)); if (!transfered_inplace_vars.empty()) { @@ -919,13 +921,6 @@ Scope* OperatorWithKernel::PrepareData( SetTensorToVariable(*var, out, trans_var); } } - for (auto& var_name_item : Outputs()) { - std::vector& output_vars = ctx->outputs[var_name_item.first]; - for (size_t i = 0; i < var_name_item.second.size(); ++i) { - auto& var_name = var_name_item.second[i]; - output_vars[i] = scope.FindVar(var_name); - } - } return new_scope; } From 19ebd8b4cfffa2ba42c68fa4c761c54e857c6566 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 20:20:19 +0800 Subject: [PATCH 08/13] add ctc support for windows --- CMakeLists.txt | 4 ++-- cmake/external/warpctc.cmake | 30 ++++++++++++++++++++++----- cmake/operators.cmake | 2 +- paddle/fluid/operators/CMakeLists.txt | 4 +--- paddle/fluid/platform/port.h | 1 - python/paddle/fluid/__init__.py | 10 +++++++-- python/paddle/fluid/framework.py | 18 +++++++++++----- python/setup.py.in | 9 ++++---- 8 files changed, 55 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cb646d3ce5..c31f51a3f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,10 +208,10 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream +include(external/warpctc) # download, build, install warpctc if (NOT WIN32) -# there is no official support of warpctc, nccl, cupti in windows -include(external/warpctc) # download, build, install warpctc +# there is no official support of nccl, cupti in windows include(cupti) include(external/gzstream) endif (NOT WIN32) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 07e1137e16..7b937c93fe 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" # Used in unit test test_WarpCTCLayer SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE) -SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" - CACHE FILEPATH "Warp-ctc Library" FORCE) -IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) +IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32) SET(USE_OMP OFF) ELSE() SET(USE_OMP ON) ENDIF() +IF(WIN32) + SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git") +ELSE() + SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git") +ENDIF() + ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git" + GIT_REPOSITORY ${WARPCTC_REPOSITORY} PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} -DWITH_GPU=${WITH_GPU} -DWITH_OMP=${USE_OMP} @@ -59,6 +67,18 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) +IF(WIN32) + IF(NOT EXISTS 
"${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}") + add_custom_command(TARGET extern_warpctc POST_BUILD + COMMAND cmake -E copy ${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} ${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} + ) + ENDIF() + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +else(WIN32) + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +ENDIF(WIN32) MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers. diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 2ced43f9e6..70d159b4f3 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,7 +84,7 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 257bfc0a3f..d9b0c66e57 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -64,9 +64,7 @@ endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) -if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) -endif() +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index ad070171df..c1b81159ac 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -55,7 +55,6 @@ static void *dlsym(void *handle, const char *symbol_name) { static void *dlopen(const char *filename, int flag) { std::string file_name(filename); - file_name.replace(0, file_name.size() - 1, '/', '\\'); HMODULE hModule = LoadLibrary(file_name.c_str()); if (!hModule) { throw std::runtime_error(file_name + " not found."); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index b00510d443..8f3660ca38 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -102,6 +102,13 @@ def __bootstrap__(): import sys import os import platform + + if os.name == 'nt': + third_lib_path = os.path.abspath(os.path.dirname( + __file__)) + os.sep + '..' + os.sep + 'libs' + os.environ['path'] += ';' + third_lib_path + sys.path.append(third_lib_path) + from . 
import core
 
     in_test = 'unittest' in sys.modules
 
@@ -128,13 +135,12 @@ def __bootstrap__():
         'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir', 'pe_profile_fname'
+        'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
 
     if os.name != 'nt':
-        read_env_flags.append('warpctc_dir')
         read_env_flags.append('cpu_deterministic')
 
     if core.is_compiled_with_dist():
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index d0bd78454d..b5d603d478 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import collections
 import contextlib
+import os
 import re
 import six
 import sys
@@ -27,11 +28,18 @@ from .proto import framework_pb2
 try:
     from . import core
 except ImportError as e:
-    raise ImportError(
-        """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
-    if you encounters \"libmkldnn.so not found\" errors. If you have python
-    installed in other directory, replace \"/usr/local/lib\" with your own
-    directory. The original error is: \n""" + cpt.get_exception_message(e))
+    if os.name == 'nt':
+        raise ImportError(
+            """NOTE: You may need to run \"set PATH=c:\python27\lib;%PATH%\"
+            if you encounter \"mkldnn.dll not found\" errors. If you have python
+            installed in another directory, replace \"c:\python27\lib\" with your own
+            directory. The original error is: \n""" + cpt.get_exception_message(e))
+    else:
+        raise ImportError(
+            """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
+        if you encounter \"libmkldnn.so not found\" errors. If you have python
+        installed in another directory, replace \"/usr/local/lib\" with your own
+        directory. The original error is: \n""" + cpt.get_exception_message(e))
 except Exception as e:
     raise e
 from . 
import unique_name diff --git a/python/setup.py.in b/python/setup.py.in index cf8f28bd25..fefe8fbaa7 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -160,10 +160,11 @@ if '${WITH_FLUID_ONLY}'== 'OFF': # put all thirdparty libraries in paddle.libs libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' -if os.name != 'nt': - package_data['paddle.libs']= [] - package_data['paddle.libs']=['libwarpctc' + ext_name] - shutil.copy('${WARPCTC_LIBRARIES}', libs_path) + +package_data['paddle.libs']= [] +package_data['paddle.libs']=['libwarpctc' + ext_name] +shutil.copy('${WARPCTC_LIBRARIES}', libs_path) + if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path) From ed5bd5e58639bfe8e584f4acdce2398701b12853 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 18 Dec 2018 20:23:24 +0800 Subject: [PATCH 09/13] test=develop --- paddle/fluid/platform/dynload/CMakeLists.txt | 2 -- paddle/fluid/platform/dynload/cudnn.h | 2 +- paddle/fluid/platform/dynload/dynamic_loader.cc | 2 ++ paddle/fluid/platform/dynload/dynamic_loader.h | 6 ++++++ paddle/fluid/platform/dynload/mklml.h | 2 +- paddle/fluid/platform/dynload/tensorrt.h | 2 +- paddle/fluid/platform/dynload/warpctc.h | 2 +- 7 files changed, 12 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 5939c500c9..07159d4a12 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -16,9 +16,7 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) -if (NOT WIN32) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) -endif(NOT WIN32) if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 550fe2edee..2f4f8101e4 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ struct DynLoad__##__name { \ template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ using cudnn_func = decltype(&::__name); \ std::call_once(cudnn_dso_flag, []() { \ cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \ diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index cc5cda6106..eddebfe92a 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -201,6 +201,8 @@ void* GetCurandDsoHandle() { void* GetWarpCTCDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so"); #endif diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 84fd2ce998..edb4c649ad 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -18,6 +18,12 @@ namespace paddle { namespace platform { namespace dynload { +#ifndef _WIN32 +#define DECLARE_TYPE(__name, ...) 
decltype(__name(__VA_ARGS__))
+#else
+#define DECLARE_TYPE(__name, ...) decltype(auto)
+#endif
+
 void* GetCublasDsoHandle();
 void* GetCUDNNDsoHandle();
 void* GetCUPTIDsoHandle();
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index c3f9433503..d0619293ac 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -34,7 +34,7 @@ extern void* mklml_dso_handle;
 #define DYNAMIC_LOAD_MKLML_WRAP(__name) \
   struct DynLoad__##__name { \
     template <typename... Args> \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       using mklmlFunc = decltype(&::__name); \
       std::call_once(mklml_dso_flag, []() { \
         mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
index 5d67658b94..751aa54b1a 100644
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -33,7 +33,7 @@ extern void* tensorrt_dso_handle;
 #define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
   struct DynLoad__##__name { \
     template <typename... Args> \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       using tensorrt_func = decltype(__name(args...)) (*)(Args...); \
       std::call_once(tensorrt_dso_flag, []() { \
         tensorrt_dso_handle = \
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
index 18ed9956f1..bc1977b05d 100644
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -34,7 +34,7 @@ extern void* warpctc_dso_handle;
 #define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
   struct DynLoad__##__name { \
     template <typename... Args> \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       using warpctcFunc = decltype(&::__name); \
       std::call_once(warpctc_dso_flag, []() { \
         warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
From b73d7d2f21a4010d10b1a2456e5991d77ed5e01e Mon Sep 17 00:00:00 2001
From: peizhilin
Date: Tue, 18 Dec 2018 20:27:14 +0800
Subject: [PATCH 10/13] test=develop
---
 python/setup.py.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/setup.py.in b/python/setup.py.in
index fefe8fbaa7..22b9537a90 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -162,7 +162,7 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
 libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
 
 package_data['paddle.libs']= []
-package_data['paddle.libs']=['libwarpctc' + ext_name]
+package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
 shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
 
 if '${WITH_MKL}' == 'ON':
From aa6e9c30becf0215870fd3633684c97a6d614263 Mon Sep 17 00:00:00 2001
From: Jacek Czaja
Date: Wed, 19 Dec 2018 03:54:05 +0100
Subject: [PATCH 11/13] [MKL-DNN] Added transpose/transpose2 Op (#14872)

* - Added transpose MKLDNN Op

- A few basic UTs work

- Added 1D transpose

- Implemented generic mem desc for MKLDNN transpose

- Modified transpose op to support more dimensional data, e.g. 5, 6, ..., 10 dims

- Added is_test attribute to transpose op

test=develop

* - Added support for MKLDNN::memory::format::any for Transpose MKLDNN op

test=develop

* - Additional transpose mkldnn op correction to mkldnn layout

test=develop

* Cosmetic fixes

test=develop

* - Removed const_cast to obey coding standard

test=develop
---
 paddle/fluid/operators/transpose_mkldnn_op.cc | 124 ++++++++++++++++++
 paddle/fluid/operators/transpose_op.cc        |  49 ++++++-
 .../unittests/test_transpose_mkldnn_op.py     |  76 +++++++++++
 .../tests/unittests/test_transpose_op.py      |  13 +-
 4 files changed, 258 insertions(+), 4 deletions(-)
 create mode 100644 paddle/fluid/operators/transpose_mkldnn_op.cc
 create mode 100644 python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py

diff --git a/paddle/fluid/operators/transpose_mkldnn_op.cc b/paddle/fluid/operators/transpose_mkldnn_op.cc
new file mode 100644
index 0000000000..37f1cadc7d
--- /dev/null
+++ b/paddle/fluid/operators/transpose_mkldnn_op.cc
@@ -0,0 +1,124 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/framework/data_layout_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using framework::DataLayout;
+
+template <typename T>
+class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    const bool is_test = ctx.Attr<bool>("is_test");
+    PADDLE_ENFORCE(
+        is_test == true,
+        "TransposeMKLDNN works only for inference! Set is_test = True");
+    auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    int ndims = axis.size();
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    const T* input_data = input->data<T>();
+
+    if (ndims == 1) {
+      output->ShareDataWith(*input);
+      return;
+    }
+
+    std::vector<int> nchw_axis(ndims, 0);
+    for (size_t i = 0; i < nchw_axis.size(); ++i) {
+      nchw_axis[i] = i;
+    }
+
+    std::vector<int> nchw_tz = paddle::framework::vectorize2int(input->dims());
+    std::string data_format = ctx.Attr<std::string>("data_format");
+
+    auto src_md =
+        input->format() != mkldnn::memory::format::nchw
+            ? 
platform::MKLDNNMemDesc(nchw_tz, platform::MKLDNNGetDataType(), + input->format()) + : Axis2MemoryDesc(nchw_tz, nchw_axis); + + this->TransposeKernel(ctx.GetPlace(), Axis2MemoryDesc(nchw_tz, axis), + src_md, output, input_data, nchw_tz, mkldnn_engine); + } + + protected: + mkldnn::memory::desc Axis2MemoryDesc(std::vector& nchw_tz, + std::vector& axis) const { + mkldnn_memory_desc_t mem_fmt; + + mem_fmt.primitive_kind = mkldnn_memory; + mem_fmt.ndims = axis.size(); + for (unsigned int i = 0; i < nchw_tz.size(); ++i) { + mem_fmt.dims[i] = nchw_tz[i]; // logical dimensions (nchw format, + // regardless physical layout) + } + mem_fmt.data_type = mkldnn_f32; + mem_fmt.format = mkldnn_blocked; + + unsigned int total_stride = 1; + for (int i = nchw_tz.size() - 1; i >= 0; --i) { + mem_fmt.layout_desc.blocking.padding_dims[i] = + nchw_tz[i]; // logical dimensions (nchw format, regardless physical + // layout) + mem_fmt.layout_desc.blocking.block_dims[i] = 1; + mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset + mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride; + mem_fmt.layout_desc.blocking.strides[1][axis[i]] = 1; + total_stride *= nchw_tz[axis[i]]; + } + mem_fmt.layout_desc.blocking.offset_padding = 0; // no initial offset + return mem_fmt; + } + + void TransposeKernel(platform::Place place, mkldnn::memory::desc md_o, + mkldnn::memory::desc md_i, Tensor* output, + const T* data_i, std::vector& nchw_dims, + const mkldnn::engine& eng) const { + // Make Memory primitive descriptors + auto mpd_o = mkldnn::memory::primitive_desc(md_o, eng); + auto mpd_i = mkldnn::memory::primitive_desc(md_i, eng); + + auto data_o = output->mutable_data( + place, paddle::memory::Allocator::kDefault, mpd_o.get_size()); + + auto src = mkldnn::memory(mpd_i, (T*)(data_i)); + auto dst = mkldnn::memory(mpd_o, data_o); + + auto r = mkldnn::reorder(src, dst); + mkldnn::stream(mkldnn::stream::kind::eager).submit({r}).wait(); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace, + ops::TransposeMKLDNNOpKernel); +REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace, + ops::TransposeMKLDNNOpKernel); diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index bc1f59bc1a..b3b379d16f 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -16,6 +16,10 @@ limitations under the License. 
*/ #include #include +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { @@ -53,11 +57,32 @@ class TransposeOp : public framework::OperatorWithKernel { } ctx->SetOutputDim("Out", out_dims); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + } +#endif + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), layout_, library_); + } }; class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); AddInput( "X", "(Tensor) The input tensor, tensors with rank up to 6 are supported."); @@ -67,6 +92,16 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { "(vector) A list of values, and the size of the list should be " "the same with the input tensor rank. This operator permutes the input " "tensor's axes according to the values given."); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); AddComment(R"DOC( Transpose Operator. @@ -144,8 +179,18 @@ class Transpose2Op : public TransposeOp { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + } +#endif + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), layout_, library_); } }; diff --git a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py new file mode 100644 index 0000000000..61ac879011 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py @@ -0,0 +1,76 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +from test_transpose_op import TestTransposeOp + + +class TestTransposeMKLDNN(TestTransposeOp): + def init_op_type(self): + self.op_type = "transpose2" + self.use_mkldnn = True + self.is_test = True + return + + def test_check_grad(self): + return + + def test_check_grad_no_input(self): + return + + def test_check_grad_no_filter(self): + return + + +class TestCase0MKLDNN(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (3, ) + self.axis = (0, ) + + +class TestCase1a(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (3, 4, 5) + self.axis = (0, 2, 1) + + +class TestCase1b(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (3, 4, 5) + self.axis = (2, 1, 0) + + +class TestCase2(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (2, 3, 4, 5) + self.axis = (0, 2, 3, 1) + + +class TestCase3(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.axis = (4, 2, 3, 1, 0) + + +class TestCase4(TestTransposeMKLDNN): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6, 1) + self.axis = (4, 2, 3, 1, 0, 5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index bbcabb751f..93be9d28da 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -21,15 +21,24 @@ from op_test import OpTest class TestTransposeOp(OpTest): def setUp(self): + self.init_op_type() self.initTestCase() - self.op_type = "transpose2" self.inputs = {'X': np.random.random(self.shape).astype("float32")} - self.attrs = {'axis': list(self.axis)} + self.attrs = { + 'axis': list(self.axis), + 'use_mkldnn': self.use_mkldnn, + 'is_test': self.is_test, + } self.outputs = { 'XShape': np.random.random(self.shape).astype("float32"), 'Out': self.inputs['X'].transpose(self.axis) } + def init_op_type(self): + self.op_type = "transpose2" + self.use_mkldnn = False + self.is_test = False + def test_check_output(self): self.check_output(no_check_set=['XShape']) From b849157e9d3584a8d4b891340706c181c542deb0 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 19 Dec 2018 11:44:48 +0800 Subject: [PATCH 12/13] Add size enforce (#14919) --- .../distributed/brpc_sendrecvop_utils.cc | 23 ++++++++++++++----- .../fluid/operators/distributed/grpc_serde.cc | 8 +++++++ .../operators/distributed/sendrecvop_utils.h | 9 ++++++-- .../distributed/variable_response.cc | 2 +- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc index 6fed9ba92c..e4604db3a3 100644 --- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #endif #include +#include #include // NOLINT #include "paddle/fluid/framework/data_type.h" @@ -31,7 +32,12 @@ namespace distributed { class IOBufWriter { public: - static void Append(butil::IOBuf* iobuf, int k, const char* v, int64_t vlen) { + static void Append(const std::string& varname, butil::IOBuf* iobuf, int k, + const char* v, int64_t vlen) { + if (vlen >= std::numeric_limits::max() || vlen < 0) { + LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen; + } + iobuf->append(reinterpret_cast(&k), 4); iobuf->append(reinterpret_cast(&vlen), 8); iobuf->append(v, vlen); @@ -87,6 +93,10 @@ class IOBufWriter { int k, const char* v, int64_t vlen, bool in_cuda_pinned, void (*destroy)(void*), void* user_data) { + if (vlen >= std::numeric_limits::max() || vlen < 0) { + LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen; + } + #ifdef PADDLE_WITH_BRPC_RDMA IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, destroy, user_data); @@ -134,7 +144,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, request->set_type(::sendrecv::NCCL_ID); const ncclUniqueId& uid = var->Get(); // TODO(gongwb): use append_zero to avoid data copy. - IOBufWriter::Append(iobuf, + IOBufWriter::Append(name, iobuf, sendrecv::VariableMessage::kSerializedFieldNumber, uid.internal, NCCL_UNIQUE_ID_BYTES); return; @@ -149,7 +159,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, // FIXME(gongwb): it seems that can use zero copy. if (var_is_not_stable) { IOBufWriter::Append( - iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, static_cast(payload->ptr()), payload->memory_size()); } else { if (platform::is_gpu_place(ctx.GetPlace())) { @@ -171,10 +181,11 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, if (var->IsType()) { auto* slr = var->GetMutable(); - size_t rows_memory_size = - slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name()); + size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber, + IOBufWriter::Append(name, iobuf, + ::sendrecv::VariableMessage::kRowsFieldNumber, reinterpret_cast(slr->rows().data()), static_cast(rows_memory_size)); } diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index 299dfe3543..a9dea9cfd2 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include #endif +#include #include // NOLINT #include "google/protobuf/io/coded_stream.h" @@ -102,6 +103,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload->memory_size()); + if (payload->memory_size() >= std::numeric_limits::max()) { + LOG(FATAL) << "AppendZeroCopy varname:" << name + << ", vlen:" << payload->memory_size(); + } // steal reference of tensor data ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows int num_slices = 2; // only SelectedRows have rows buffer @@ -115,7 +120,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, if (var->IsType()) { auto* slr = var->GetMutable(); ProtoEncodeHelper e2(static_cast(buf), 128); + + PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name()); size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); + e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); slices[2] = ::grpc::Slice(e2.size()); memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 33eded0e6c..6a87178be5 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include +#include #include #include "paddle/fluid/framework/data_type.h" @@ -23,9 +24,8 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/port.h" - #include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { @@ -83,6 +83,11 @@ inline framework::proto::VarType::Type ToVarType( } } +template