diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index 63ec515647..b790fa39fe 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -370,4 +370,48 @@ extern void hl_maxout_backward(real* inGrad, size_t featLen, size_t groups); +/** + * @brief Upsample forward. + * @param[in] inputData input data. + * @param[in] maskData the mask data from MaxPoolWithMaskLayer. + * @param[in] batchSize the batch size of the input. + * @param[in] imgSizeH image height. + * @param[in] imgSizeW image width. + * @param[in] channels the input channels. + * @param[in] outputH the output height. + * @param[in] outputW the output width. + * @param[out] outputData output data. + */ +extern void hl_upsample_forward(real* inputData, + real* maskData, + size_t batchSize, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW, + real* outputData); + +/** + * @brief Upsample backward. + * @param[in] outputGradData the output grad data. + * @param[in] maskData the mask data from MaxPoolWithMaskLayer. + * @param[in] batchSize the batch size of the input. + * @param[in] imgSizeH image height. + * @param[in] imgSizeW image width. + * @param[in] channels the input channels. + * @param[in] outputH the output height. + * @param[in] outputW the output width. + * @param[out] inputGradData the input grad data. 
+ */ +extern void hl_upsample_backward(real* outputGradData, + real* maskData, + size_t batchSize, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW, + real* inputGradData); + #endif // HL_CNN_H_ diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index c39bd3228d..997eed62e0 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -224,4 +224,24 @@ inline void hl_maxout_backward(real* inGrad, size_t featLen, size_t group) {} +inline void hl_upsample_forward(real* inputData, + real* maskData, + size_t batchSize, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW, + real* outputData) {} + +inline void hl_upsample_backward(real* outputGradData, + real* maskData, + size_t batchSize, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW, + real* inputGradData) {} + #endif // HL_CNN_STUB_H_ diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index a4459243e8..bac743a293 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -1028,3 +1028,79 @@ void hl_maxout_backward(real* inGrad, num_kernels, inGrad, outGrad, idData, size, featLen, groups); CHECK_SYNC("hl_maxout_backward failed"); } + +__global__ void upsampleForwardCompute(real* input_data, + real* mask_data, + size_t nthreads, + size_t in_h, + size_t in_w, + size_t out_h, + size_t out_w, + real* output_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < nthreads) { + int offset = index / (in_w * in_h) * out_h * out_w; + int upsample_idx = static_cast(mask_data[index]); + output_data[offset + upsample_idx] = input_data[index]; + } +} + +__global__ void upsampleBackwardCompute(real* out_grad, + real* mask_data, + size_t nthreads, + size_t in_h, + size_t in_w, + size_t out_h, + size_t out_w, + real* input_grad) { + int index = blockIdx.x * 
blockDim.x + threadIdx.x; + if (index < nthreads) { + int offset = index / (in_w * in_h) * out_h * out_w; + int upsample_idx = static_cast(mask_data[index]); + input_grad[index] = out_grad[offset + upsample_idx]; + } +} + +void hl_upsample_forward(real* inputData, + real* maskData, + size_t batchSize, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW, + real* outputData) { + int num_kernels = batchSize * imgSizeH * imgSizeW * channels; + int blocks = (num_kernels + 1024 - 1) / 1024; + upsampleForwardCompute<<>>(inputData, + maskData, + num_kernels, + imgSizeH, + imgSizeW, + outputH, + outputW, + outputData); + CHECK_SYNC("hl_upsample_forward failed"); +} + +void hl_upsample_backward(real* outputGradData, + real* maskData, + size_t batchSize, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW, + real* inputGradData) { + int num_kernels = batchSize * imgSizeH * imgSizeW * channels; + int blocks = (num_kernels + 1024 - 1) / 1024; + upsampleBackwardCompute<<>>(outputGradData, + maskData, + num_kernels, + imgSizeH, + imgSizeW, + outputH, + outputW, + inputGradData); + CHECK_SYNC("hl_upsample_backward failed"); +} diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 64c06687b6..16a118090b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -279,6 +279,21 @@ std::unique_ptr Executor::Prepare( return std::unique_ptr(ctx); } +std::vector> Executor::Prepare( + const ProgramDesc& program, const std::vector& block_ids) { + std::vector> result; + for (auto& bid : block_ids) { + PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); /* validate bid before allocating so a failed check cannot leak ctx */ + auto* ctx = new ExecutorPrepareContext(program, bid); + auto& block = program.Block(bid); + for (auto& op_desc : block.AllOps()) { + ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); + } + result.push_back(std::shared_ptr(ctx)); + } + return result; +} + void 
Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope, bool create_vars) { auto& block = ctx->prog_.Block(ctx->block_id_); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 7173c51c95..d7c99165f0 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -61,6 +61,9 @@ class Executor { static std::unique_ptr Prepare( const ProgramDesc& program, int block_id); + static std::vector> Prepare( + const ProgramDesc& program, const std::vector& block_ids); + void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope = true, bool create_vars = true); diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc index 7703658899..78e1d274a9 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/detail/variable_response.cc @@ -14,8 +14,6 @@ #include "paddle/fluid/operators/detail/variable_response.h" -#include - #include #include #include diff --git a/paddle/fluid/operators/fc_mkldnn_op.cc b/paddle/fluid/operators/fc_mkldnn_op.cc new file mode 100644 index 0000000000..9c704a2949 --- /dev/null +++ b/paddle/fluid/operators/fc_mkldnn_op.cc @@ -0,0 +1,303 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/fc_op.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; + +template +class MKLDNNMD { + public: + explicit MKLDNNMD(const T* in, const T* w, bool bias) + : in{paddle::framework::vectorize2int(in->dims())}, + w{paddle::framework::vectorize2int(w->dims())} { + with_bias_ = bias; + } + + mkldnn::memory::desc dst() const { + return platform::MKLDNNMemDesc({in[0], w[1]}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::nc); + } + + mkldnn::memory::desc src() const { + return is_spatial() + ? platform::MKLDNNMemDesc({in[0], in[1], in[2], in[3]}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::nchw) + : platform::MKLDNNMemDesc({in[0], in[1]}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::nc); + } + + mkldnn::memory::desc weights() const { + return is_spatial() + ? platform::MKLDNNMemDesc({w[1], in[1], in[2], in[3]}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::oihw) + : platform::MKLDNNMemDesc({w[1], in[1]}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::oi); + } + + mkldnn::memory::desc bias() const { + return with_bias_ + ? 
platform::MKLDNNMemDesc({w[1]}, mkldnn::memory::data_type::f32, + mkldnn::memory::format::format_undef) + : platform::MKLDNNMemDesc({}, mkldnn::memory::data_type::f32, + mkldnn::memory::format::format_undef); + } + + private: + bool is_spatial() const { return in.size() > 1 && w.size() > 1; } + + std::vector in; + std::vector w; + bool with_bias_; + bool is_spatial_; +}; + +class MKLDNNMemory { + public: + MKLDNNMemory(MKLDNNMD* t, const mkldnn::engine& e) + : md_{t}, engine_{e} {} + virtual ~MKLDNNMemory() = default; + + template + mkldnn::memory dst(const Output* out) { + return mkldnn::memory({md_->dst(), engine_}, + static_cast(const_cast(out))); + } + + template + mkldnn::memory dst(Output* out) { + return mkldnn::memory({md_->dst(), engine_}, out); + } + + template + mkldnn::memory src(const Input* in) { + return mkldnn::memory({md_->src(), engine_}, + static_cast(const_cast(in))); + } + + template + mkldnn::memory weights(const Weight* w) { + return mkldnn::memory({md_->weights(), engine_}, + static_cast(const_cast(w))); + } + + mkldnn::memory bias() { + return mkldnn::memory(mkldnn::memory::primitive_desc(md_->bias(), engine_)); + } + + private: + MKLDNNMD* md_; + const mkldnn::engine& engine_; +}; + +template +class FCMKLDNNOpKernel : public paddle::framework::OpKernel { + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto input = ctx.Input("Input"); + auto w = ctx.Input("W"); + + PADDLE_ENFORCE(input->dims().size() == 2 || input->dims().size() == 4, + "Input must be with 2 or 4 dimensions, i.e. NCHW"); + PADDLE_ENFORCE(w->dims().size() == 2 || w->dims().size() == 4, + "Weights must be with 2 or 4 dimensions, i.e. 
OI or OIHW"); + + bool with_bias = ctx.Attr("bias_attr"); + MKLDNNMD md(input, w, with_bias); + + std::shared_ptr pd = + FcFwdPrimitiveDesc(md.src(), md.weights(), md.dst(), md.bias(), + with_bias, mkldnn_engine); + + const std::string key = ctx.op().Output("Out"); + const std::string key_fc_pd = key + "@fc_pd"; + + dev_ctx.SetBlob(key_fc_pd, pd); + + MKLDNNMemory mem(&md, mkldnn_engine); + + const T* input_data = input->data(); + const T* w_data = w->data(); + + auto output = ctx.Output("Out"); + T* output_data = output->mutable_data(ctx.GetPlace()); + + auto dst_memory = mem.dst(output_data); + auto src_memory = mem.src(input_data); + auto weights_memory = mem.weights(w_data); + auto bias_memory = mem.bias(); + + auto forward = with_bias ? mkldnn::inner_product_forward( + *pd, src_memory, weights_memory, bias_memory, + dst_memory) + : mkldnn::inner_product_forward( + *pd, src_memory, weights_memory, dst_memory); + + std::vector pipeline = {forward}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + + private: + std::unique_ptr + FcFwdPrimitiveDesc(const mkldnn::memory::desc& src, + const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& dst, + const mkldnn::memory::desc& bias, const bool with_bias, + const mkldnn::engine& engine) const { + auto desc = with_bias + ? 
mkldnn::inner_product_forward::desc( + mkldnn::prop_kind::forward, src, weights, bias, dst) + : mkldnn::inner_product_forward::desc( + mkldnn::prop_kind::forward, src, weights, dst); + + auto pd = new mkldnn::inner_product_forward::primitive_desc(desc, engine); + return std::unique_ptr(pd); + } +}; + +template +class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + T* input_grad_data = nullptr; + T* w_grad_data = nullptr; + + Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); + Tensor* w_grad = ctx.Output(framework::GradVarName("W")); + + if (input_grad) { + input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + } + if (w_grad) { + w_grad_data = w_grad->mutable_data(ctx.GetPlace()); + } + + const Tensor* input = ctx.Input("Input"); + const T* input_data = input->data(); + + const Tensor* w = ctx.Input("W"); + const T* w_data = w->data(); + + const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); + const T* out_grad_data = out_grad->data(); + + bool with_bias = ctx.Attr("bias_attr"); + + MKLDNNMD md(input, w, with_bias); + MKLDNNMemory mem(&md, mkldnn_engine); + + auto dst_memory = mem.dst(out_grad_data); + auto src_memory = mem.src(input_data); + auto weights_memory = mem.weights(w_data); + auto bias_memory = mem.bias(); + + const std::string key = ctx.op().Input("Out"); + const std::string key_fc_pd = key + "@fc_pd"; + + auto pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_fc_pd)); + + PADDLE_ENFORCE(pd != nullptr, "Fail to find key_fc_pd in device context"); + + if (w_grad) { + auto weights_grad_memory = mem.weights(w_grad_data); + + mkldnn::inner_product_backward_weights::primitive_desc bwd_weight_pd = + 
FcBwdWeightsPrimitiveDesc(md.src(), md.weights(), md.dst(), md.bias(), + with_bias, *pd, mkldnn_engine); + + auto bwd_weights_prim = mkldnn::inner_product_backward_weights( + bwd_weight_pd, src_memory, dst_memory, weights_grad_memory, + bias_memory); + + std::vector pipeline{bwd_weights_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + + if (input_grad) { + auto src_grad_memory = mem.src(input_grad_data); + + mkldnn::inner_product_backward_data::primitive_desc bwd_data_pd = + FcBwdDataPrimitiveDesc(md.src(), md.weights(), md.dst(), *pd, + mkldnn_engine); + + auto bwd_data_prim = mkldnn::inner_product_backward_data( + bwd_data_pd, dst_memory, weights_memory, src_grad_memory); + + std::vector pipeline{bwd_data_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + } + + private: + mkldnn::inner_product_backward_weights::primitive_desc + FcBwdWeightsPrimitiveDesc( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights, + const mkldnn::memory::desc& diff_dst, const mkldnn::memory::desc& bias, + const bool with_bias, + const mkldnn::inner_product_forward::primitive_desc& pd, + const mkldnn::engine& engine) const { + auto bwd_weight_desc = with_bias + ? 
mkldnn::inner_product_backward_weights::desc( + src, diff_weights, bias, diff_dst) + : mkldnn::inner_product_backward_weights::desc( + src, diff_weights, bias, diff_dst); + + return mkldnn::inner_product_backward_weights::primitive_desc( + bwd_weight_desc, engine, pd); + } + + mkldnn::inner_product_backward_data::primitive_desc FcBwdDataPrimitiveDesc( + const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& diff_dst, + const mkldnn::inner_product_forward::primitive_desc& pd, + const mkldnn::engine& engine) const { + auto bwd_data_desc = + mkldnn::inner_product_backward_data::desc(diff_src, weights, diff_dst); + return mkldnn::inner_product_backward_data::primitive_desc(bwd_data_desc, + engine, pd); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OP_KERNEL(fc, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::FCMKLDNNOpKernel); + +REGISTER_OP_KERNEL(fc_grad, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::FCMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc new file mode 100644 index 0000000000..381771f157 --- /dev/null +++ b/paddle/fluid/operators/fc_op.cc @@ -0,0 +1,102 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fc_op.h" +#include + +namespace paddle { +namespace operators { + +void FCOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "X(Input) of Fully Connected should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Out(Output) of Fully Connected should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "W(Input) of Fully Connected should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto w_dims = ctx->GetInputDim("W"); + std::vector output_shape({in_dims[0], w_dims[1]}); + + PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4, + "Fully Connected input should be 2-D or 4-D tensor."); + + PADDLE_ENFORCE(w_dims.size() == 2 || w_dims.size() == 4, + "Fully Connected input should be 2-D or 4-D tensor."); + + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->ShareLoD("Input", "Out"); +} + +framework::OpKernelType FCOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library{framework::LibraryType::kMKLDNN}; + framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout, library); +} + +void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto w_dims = ctx->GetInputDim("W"); + + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); + } + if (ctx->HasOutput(framework::GradVarName("W"))) { + ctx->SetOutputDim(framework::GradVarName("W"), w_dims); + } +} + +framework::OpKernelType FCOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library{framework::LibraryType::kMKLDNN}; + framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + + return framework::OpKernelType( + 
framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout, library); +} + +FCOpMaker::FCOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", "(Tensor) The input tensor of fully connected operator. "); + AddInput("W", "(Tensor), The second input tensor of fc op."); + AddOutput("Out", "(Tensor) The output tensor of fully connected operator. "); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("bias_attr", "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddComment(R"DOC( + Fully Connected Operator. + + The fully connected operation calculates the output based on the input, weights and bias attribute. + The size of each dimension of the parameters checked in the infer-shape. + The matrix of bias is generated by the mkldnn framework, when the bias_attr is True. + Additional parametrs are use_mkldnn and bias_attr. + The input(X) size and output(Out) size may be diffrent. + + The fully connected layer only supports MKLDNN version +)DOC"); +} + +} // namespace operators +} // namespace paddle + +REGISTER_OP(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker, fc_grad, + paddle::operators::FCOpGrad); diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h new file mode 100644 index 0000000000..70fa96440d --- /dev/null +++ b/paddle/fluid/operators/fc_op.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class FCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FCOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FCOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index b19add24e2..91a1f226cd 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -45,20 +45,23 @@ static void CreateTensorFromMessageType(framework::Variable *var, } } -static void ParallelExecuteBlocks(const std::vector ¶llel_blkids, - framework::Executor *executor, - framework::ProgramDesc *program, - framework::Scope *scope) { +static void ParallelExecuteBlocks( + const std::vector ¶llel_blkids, framework::Executor *executor, + const std::vector> + &prepared, + framework::ProgramDesc *program, framework::Scope *scope) { std::vector> fs; for (size_t idx : parallel_blkids) { - fs.push_back(framework::Async([&executor, &program, &scope, idx]() { - int run_block = idx; // thread local - try { - executor->Run(*program, scope, run_block, 
false, false); - } catch (std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); - } - })); + fs.push_back( + framework::Async([&executor, &prepared, &program, &scope, idx]() { + int run_block = idx; // thread local + try { + executor->RunPreparedContext(prepared[run_block].get(), scope, + false, false); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + })); } for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); } @@ -96,11 +99,18 @@ class ListenAndServOp : public framework::OperatorBase { auto *block = Attr(kOptimizeBlock); auto *program = block->Program(); - int num_blocks = program->Size(); + size_t num_blocks = program->Size(); PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 blocks"); framework::Executor executor(dev_place); + std::vector block_list; + for (size_t blkid = 1; blkid < num_blocks; ++blkid) + block_list.push_back(blkid); + auto prepared = executor.Prepare(*program, block_list); + prepared.insert( + prepared.begin(), + std::shared_ptr(nullptr)); // TODO(qiao) set proper fields for table lookup and update rpc_service_->SetExecutor(&executor); @@ -153,21 +163,22 @@ class ListenAndServOp : public framework::OperatorBase { // The optimize blocks which have the same parent ID would run parallel // TODO(Yancey1989): need to use ParallelExecutor for future - size_t last_parent_blkid = program->Block(1).Parent(); + int32_t last_parent_blkid = program->Block(1).Parent(); std::vector parallel_blkids; parallel_blkids.push_back(1); double ts = detail::GetTimestamp(); for (size_t blkid = 2; blkid < num_blocks; ++blkid) { if (program->Block(blkid).Parent() != last_parent_blkid) { for (size_t idx : parallel_blkids) VLOG(3) << idx; - ParallelExecuteBlocks(parallel_blkids, &executor, program, + ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program, &recv_scope); parallel_blkids.clear(); last_parent_blkid = program->Block(blkid).Parent(); } 
parallel_blkids.push_back(blkid); } - ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope); + ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program, + &recv_scope); VLOG(3) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)"; @@ -181,7 +192,8 @@ class ListenAndServOp : public framework::OperatorBase { var->GetMutable()->mutable_rows()->clear(); } rpc_service_->SetCond(1); - // FIXME(typhoonzero): use another condition to sync wait clients get. + // NOTE: does not consider barrier request retry in here, we may use + // global barrier id to resolve this. rpc_service_->WaitClientGet(fan_in); sparse_vars.clear(); } // while(true) diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index 3e750ed2d1..d36ed398eb 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -30,19 +30,16 @@ class SplitIdsOpKernel : public framework::OpKernel { PADDLE_THROW("SplitIds do not support GPU kernel"); } - const auto* ids_t = ctx.Input("Ids"); - auto& ids_dims = ids_t->dims(); + auto& ids_dims = ctx.Input("Ids")->dims(); + const T* ids = ctx.Input("Ids")->data(); auto outs = ctx.MultiOutput("Out"); - - const T* ids = ids_t->data(); - const size_t shard_num = outs.size(); std::vector> out_ids; out_ids.resize(outs.size()); // split id by their shard_num. - for (size_t i = 0; i < ids_dims[0]; ++i) { + for (int i = 0; i < ids_dims[0]; ++i) { T id = ids[i]; size_t shard_id = static_cast(id) % shard_num; out_ids[shard_id].push_back(id); diff --git a/paddle/gserver/layers/UpsampleLayer.cpp b/paddle/gserver/layers/UpsampleLayer.cpp new file mode 100644 index 0000000000..3ff5332e64 --- /dev/null +++ b/paddle/gserver/layers/UpsampleLayer.cpp @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. */ + +#include "UpsampleLayer.h" +#include "iostream" + +namespace paddle { + +REGISTER_LAYER(upsample, UpsampleLayer); + +size_t UpsampleLayer::getOutputSize() { + if (upsampleSize_ == 0) { + upsampleSize_ = imgSize_ * scale_ - static_cast(padOutX_); + upsampleSizeY_ = imgSizeY_ * scaleY_ - static_cast(padOutY_); + } + return upsampleSize_ * upsampleSizeY_ * channels_; +} + +bool UpsampleLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2U); + CHECK_EQ(config_.inputs_size(), 2); + const auto& conf = config_.inputs(0).upsample_conf(); + const auto& img_conf = conf.image_conf(); + + imgSizeY_ = + img_conf.has_img_size_y() ? 
img_conf.img_size_y() : img_conf.img_size(); + imgSize_ = img_conf.img_size(); + channels_ = img_conf.channels(); + + CHECK((conf.has_upsample_size()) || (conf.has_scale())) + << "scale or upsample_size is required."; + + if (conf.has_upsample_size()) { + upsampleSize_ = conf.upsample_size(); + upsampleSizeY_ = upsampleSize_; + if (conf.has_upsample_size_y()) { + upsampleSizeY_ = conf.upsample_size_y(); + } + } else { + if (!conf.has_scale_y()) { + scale_ = scaleY_ = conf.scale(); /* scale_y is unset here, so the configured scale applies to both axes */ + CHECK_GT(static_cast(scale_), 1); + } else { + scale_ = conf.scale(); + scaleY_ = conf.scale_y(); + } + padOutX_ = conf.pad_out_x(); + padOutY_ = conf.pad_out_y(); + CHECK(!padOutX_ || scale_ == 2) + << "Output height padding compensation requires scale_ == 2"; + CHECK(!padOutY_ || scaleY_ == 2) + << "Output width padding compensation requires scaleY_ == 2"; + upsampleSize_ = upsampleSizeY_ = 0; + } + return true; +} + +void UpsampleLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr input = getInputValue(0); + MatrixPtr mask = inputLayers_[1]->getOutput("mask").value; + + size_t batchSize = input->getHeight(); + size_t outSize = getOutputSize(); + + CHECK_EQ(input->getWidth(), mask->getWidth()); + CHECK_EQ(mask->getHeight(), batchSize); + resetOutput(batchSize, outSize); + + MatrixPtr output = getOutputValue(); + output->upsampleForward(*input, + *mask, + imgSize_, + imgSizeY_, + channels_, + upsampleSize_, + upsampleSizeY_); +} + +void UpsampleLayer::backward(const UpdateCallback& callback) { + MatrixPtr mask = inputLayers_[1]->getOutput("mask").value; + MatrixPtr inputGrad = getInputGrad(0); + MatrixPtr outputGrad = getOutputGrad(); + inputGrad->upsampleBackward(*outputGrad, + *mask, + imgSize_, + imgSizeY_, + channels_, + upsampleSize_, + upsampleSizeY_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/UpsampleLayer.h b/paddle/gserver/layers/UpsampleLayer.h new file mode 100644 index 0000000000..25efbac5e9 --- /dev/null +++ 
b/paddle/gserver/layers/UpsampleLayer.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "Layer.h" +#include "paddle/math/Matrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +/** + * This layer transpose the pooling process. + * It takes two input, the first input is the input data, and + * the second is the mask data from the max-pool-with-mask layer. 
+ * + */ + +class UpsampleLayer : public Layer { +public: + explicit UpsampleLayer(const LayerConfig& config) : Layer(config) {} + ~UpsampleLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; + + size_t getOutputSize(); + +protected: + size_t scale_, scaleY_; + size_t upsampleSize_, upsampleSizeY_; + size_t padOutX_, padOutY_; + size_t imgSize_, imgSizeY_; + size_t channels_; +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index b578a906c2..9839375c22 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -27,6 +27,7 @@ gserver_test(test_BatchNorm) gserver_test(test_KmaxSeqScore) gserver_test(test_Expand) gserver_test(test_MaxPoolingWithMaskOutput) +gserver_test(test_Upsample) set(PYTHON_PATH ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d diff --git a/paddle/gserver/tests/test_Upsample.cpp b/paddle/gserver/tests/test_Upsample.cpp new file mode 100644 index 0000000000..9d6fa1d130 --- /dev/null +++ b/paddle/gserver/tests/test_Upsample.cpp @@ -0,0 +1,152 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include <gtest/gtest.h> +#include <string> +#include <vector> + +#include "LayerGradUtil.h" +#include "paddle/math/MathUtils.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; + +void setPoolConfig(TestConfig* config, + PoolConfig* pool, + const string& poolType) { + (*config).biasSize = 0; + (*config).layerConfig.set_type("pool"); + (*config).layerConfig.set_num_filters(1); + + int kw = 2, kh = 2; + int pw = 0, ph = 0; + int sw = 2, sh = 2; + pool->set_pool_type(poolType); + pool->set_channels(2); + pool->set_size_x(kw); + pool->set_size_y(kh); + pool->set_start(0); + pool->set_padding(pw); + pool->set_padding_y(ph); + pool->set_stride(sw); + pool->set_stride_y(sh); + + int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); + int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); + pool->set_output_x(ow); + pool->set_output_y(oh); +} + +LayerPtr doOneUpsampleTest(MatrixPtr& inputMat, + const string& poolType, + bool use_gpu, + real* tempGradData) { + /* prepare maxPoolWithMaskLayer: 8x8 image, 2 channels, 2x2 window, stride 2 */ + TestConfig config; + config.inputDefs.push_back({INPUT_DATA, "layer_0", 128, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + PoolConfig* pool = input->mutable_pool_conf(); + + pool->set_img_size(8); + pool->set_img_size_y(8); + setPoolConfig(&config, pool, "max-pool-with-mask"); + config.layerConfig.set_size(pool->output_x() * pool->output_y() * + pool->channels()); + + config.layerConfig.set_name("MaxPoolWithMask"); + + std::vector<DataLayerPtr> dataLayers; + LayerMap layerMap; + vector<Argument> datas; + + initDataLayer(config, + &dataLayers, + &datas, + &layerMap, + "MaxPoolWithMask", + 1, + false, + use_gpu); + + dataLayers[0]->getOutputValue()->copyFrom(*inputMat); + + FLAGS_use_gpu = use_gpu; + std::vector<ParameterPtr> parameters; + LayerPtr maxPoolingWithMaskOutputLayer; + initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer); + maxPoolingWithMaskOutputLayer->forward(PASS_GC); + + /* prepare the upsample layer */ + LayerConfig 
upsampleLayerConfig; + upsampleLayerConfig.set_type("upsample"); + LayerInputConfig* input1 = upsampleLayerConfig.add_inputs(); + upsampleLayerConfig.add_inputs(); + + UpsampleConfig* upsampleConfig = input1->mutable_upsample_conf(); + upsampleConfig->set_scale(2); + ImageConfig* imageConfig = upsampleConfig->mutable_image_conf(); + imageConfig->set_channels(2); + imageConfig->set_img_size(4); + imageConfig->set_img_size_y(4); + upsampleLayerConfig.set_size(2 * 8 * 8); + upsampleLayerConfig.set_name("upsample"); + + for (size_t i = 0; i < 2; i++) { + LayerInputConfig& inputTemp = *(upsampleLayerConfig.mutable_inputs(i)); + inputTemp.set_input_layer_name("MaxPoolWithMask"); + } + + LayerPtr upsampleLayer; + ParameterMap parameterMap; + upsampleLayer = Layer::create(upsampleLayerConfig); + layerMap[upsampleLayerConfig.name()] = upsampleLayer; + upsampleLayer->init(layerMap, parameterMap); + upsampleLayer->setNeedGradient(true); + upsampleLayer->forward(PASS_GC); + upsampleLayer->getOutputGrad()->copyFrom(tempGradData, 128); + upsampleLayer->backward(); + + return upsampleLayer; +} + +TEST(Layer, maxPoolingWithMaskOutputLayerFwd) { + bool useGpu = false; + MatrixPtr inputMat; + MatrixPtr inputGPUMat; + MatrixPtr tempGradMat; + + inputMat = Matrix::create(1, 128, false, useGpu); + inputMat->randomizeUniform(); + + tempGradMat = Matrix::create(1, 128, false, useGpu); + tempGradMat->randomizeUniform(); + real* data = inputMat->getData(); + real* tempGradData = tempGradMat->getData(); + + LayerPtr upsampleLayerCPU = + doOneUpsampleTest(inputMat, "max-pool-with-mask", useGpu, tempGradData); + +#ifdef PADDLE_WITH_CUDA + useGpu = true; + inputGPUMat = Matrix::create(1, 128, false, useGpu); + inputGPUMat->copyFrom(data, 128); + LayerPtr upsampleLayerGPU = doOneUpsampleTest( + inputGPUMat, "max-pool-with-mask", useGpu, tempGradData); + checkMatrixEqual(upsampleLayerCPU->getOutput("").value, + upsampleLayerGPU->getOutput("").value); + + 
checkMatrixEqual(upsampleLayerCPU->getPrev(0)->getOutputGrad(), + upsampleLayerGPU->getPrev(0)->getOutputGrad()); +#endif +} diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 35359d4b5a..0e84cb3739 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1024,6 +1024,66 @@ void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { LOG(INFO) << "the diffCnt is " << diffCnt; } +void GpuMatrix::upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + CHECK(input.useGpu_ == true) << "Matrix type are not equal"; + CHECK(mask.useGpu_ == true) << "Matrix type are not equal"; + + real* inputData = input.getData(); + real* maskData = mask.getData(); + real* outData = data_; + + size_t batch = input.getHeight(); + + CHECK(imgSizeH * imgSizeW * channels == input.getWidth()); + CHECK(imgSizeH * imgSizeW * channels == mask.getWidth()); + CHECK_EQ(batch, this->getHeight()); + CHECK(width_ == outputH * outputW * channels); + hl_upsample_forward(inputData, + maskData, + batch, + imgSizeH, + imgSizeW, + channels, + outputH, + outputW, + outData); +} + +void GpuMatrix::upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + CHECK(outputGrad.useGpu_ == true) << "Matrix type are not equal"; + CHECK(mask.useGpu_ == true) << "Matrix type are not equal"; + + real* outputGradData = outputGrad.getData(); + real* maskData = mask.getData(); + real* inputGradData = data_; + size_t batch = outputGrad.getHeight(); + + CHECK(imgSizeH * imgSizeW == this->getWidth() / channels); + CHECK_EQ(batch, this->getHeight()); + CHECK_EQ(channels * outputH * outputW, outputGrad.getWidth()); + hl_upsample_backward(outputGradData, + maskData, + batch, + imgSizeH, + imgSizeW, + channels, + outputH, + outputW, + inputGradData); +} + void GpuMatrix::maxPoolForward(Matrix& inputMat, size_t 
imgSizeH, size_t imgSizeW, @@ -1986,6 +2046,72 @@ void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) { CHECK_EQ(info, 0); } +void CpuMatrix::upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + real* inputData = input.getData(); + real* maskData = mask.getData(); + real* outData = data_; + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + size_t batch = input.getHeight(); + CHECK(inLength == input.getWidth() / channels); + CHECK_EQ(batch, this->getHeight()); + CHECK_EQ(channels * outLength, this->getWidth()); + + for (size_t k = 0; k < batch; k++) { + for (size_t c = 0; c < channels; c++) { + for (size_t i = 0; i < inLength; i++) { + size_t out_index = static_cast<size_t>(maskData[i]); /* mask holds the flat index inside this channel's output plane */ + if (out_index >= outLength) { + LOG(FATAL) << "upsample index " << out_index << " out of range."; + } + outData[out_index] = inputData[i]; + } + inputData += inLength; + maskData += inLength; + outData += outLength; + } + } +} + +void CpuMatrix::upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + real* outputGradData = outputGrad.getData(); + real* maskData = mask.getData(); + real* inputGradData = data_; + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + size_t batch = outputGrad.getHeight(); + CHECK(inLength == this->getWidth() / channels); + CHECK_EQ(batch, this->getHeight()); + CHECK_EQ(channels * outLength, outputGrad.getWidth()); + + for (size_t k = 0; k < batch; k++) { + for (size_t c = 0; c < channels; c++) { + for (size_t i = 0; i < inLength; i++) { + size_t out_index = static_cast<size_t>(maskData[i]); /* same per-channel flat index as in the forward pass */ + if (out_index >= outLength) { + LOG(FATAL) << "upsample index " << out_index << " out of range."; + } + inputGradData[i] = outputGradData[out_index]; + } + inputGradData += inLength; + maskData += inLength; + outputGradData += 
outLength; + } + } +} + void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 631e69edc1..04e9614eab 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -859,6 +859,26 @@ public: LOG(FATAL) << "Not implemented"; } + virtual void upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + LOG(FATAL) << "Not implemeted"; + } + /** * Pooling forward operation, pick out the largest element * in the sizeX of value, if the maskMatP is not NULL, it will @@ -1420,6 +1440,22 @@ public: void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); + void upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW); + + void upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW); + void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, @@ -1694,6 +1730,22 @@ public: MatrixPtr clone(size_t height, size_t width, bool useGpu = false); + void upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW); + + void upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW); + void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index d699984ff2..d78ee9c9f3 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -323,6 +323,16 @@ message 
ClipConfig { required double max = 2; } +message UpsampleConfig { + required ImageConfig image_conf = 1; + optional uint32 scale = 2 [ default = 2 ]; + optional uint32 scale_y = 3 [ default = 2 ]; + optional bool pad_out_x = 4 [ default = false ]; + optional bool pad_out_y = 5 [ default = false ]; + optional uint32 upsample_size = 6; + optional uint32 upsample_size_y = 7; +} + message ROIPoolConfig { required uint32 pooled_width = 1; required uint32 pooled_height = 2; @@ -359,6 +369,7 @@ message LayerInputConfig { optional ClipConfig clip_conf = 18; optional ScaleSubRegionConfig scale_sub_region_conf = 19; optional ROIPoolConfig roi_pool_conf = 20; + optional UpsampleConfig upsample_conf = 21; } message LayerConfig { diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3d13133bf2..d2e7d58524 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -133,6 +133,8 @@ def fc(input, bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias of this layer. If it is set to None, no bias will be added to the output units. act (str, default None): Activation to be applied to the output of this layer. + use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn + library is installed. Default: False name (str, default None): The name of this layer. 
Returns: @@ -153,38 +155,64 @@ def fc(input, dtype = helper.input_dtype() mul_results = [] - for input_var, param_attr in helper.iter_inputs_and_params(): - input_shape = input_var.shape + if use_mkldnn: + tmp = helper.create_tmp_variable(dtype) + input_shape = input.shape param_shape = [ reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) ] + [size] w = helper.create_parameter( - attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False) - tmp = helper.create_tmp_variable(dtype) + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + is_bias=False) + if bias_attr is None or bias_attr is False: + bias_attr = False + else: + bias_attr = True helper.append_op( - type="mul", - inputs={"X": input_var, - "Y": w}, + type="fc", + inputs={"Input": input, + "W": w}, outputs={"Out": tmp}, - attrs={ - "x_num_col_dims": num_flatten_dims, - "y_num_col_dims": 1, - 'use_mkldnn': use_mkldnn - }) - mul_results.append(tmp) - - # sum - if len(mul_results) == 1: - pre_bias = mul_results[0] + attrs={"use_mkldnn": use_mkldnn, + "bias_attr": bias_attr}) + return helper.append_activation(tmp) else: - pre_bias = helper.create_tmp_variable(dtype) - helper.append_op( - type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) - # add bias - pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims) - # add activation - return helper.append_activation(pre_activation) + for input_var, param_attr in helper.iter_inputs_and_params(): + input_shape = input_var.shape + param_shape = [ + reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) + ] + [size] + + w = helper.create_parameter( + attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False) + tmp = helper.create_tmp_variable(dtype) + helper.append_op( + type="mul", + inputs={"X": input_var, + "Y": w}, + outputs={"Out": tmp}, + attrs={ + "x_num_col_dims": num_flatten_dims, + "y_num_col_dims": 1, + }) + mul_results.append(tmp) + + if len(mul_results) == 1: + pre_bias = mul_results[0] + 
else: + pre_bias = helper.create_tmp_variable(dtype) + helper.append_op( + type="sum", + inputs={"X": mul_results}, + outputs={"Out": pre_bias}) + # add bias + pre_activation = helper.append_bias_op( + pre_bias, dim_start=num_flatten_dims) + # add activation + return helper.append_activation(pre_activation) def embedding(input, diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py new file mode 100644 index 0000000000..3f547f3c48 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fc_op.py @@ -0,0 +1,99 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest + + +def fully_connected_naive(input, weights, bias_data=None): + in_n, in_c, in_h, in_w = input.shape + w_h, w_c = weights.shape + + x_data = np.reshape(input, [in_n, in_c * in_h * in_w]) + w_data = np.transpose(np.reshape(weights, (w_c, in_c * in_h * in_w))) + result = None + + if bias_data is None: # an ndarray bias has no unambiguous truth value + result = np.dot(x_data, w_data) + else: + result = np.dot(x_data, w_data) + bias_data + + return result + + +class MatrixGenerate: + def __init__(self, mb, ic, oc, h, w): + self.input = np.random.random((mb, ic, h, w)).astype("float32") + self.weights = np.random.random((ic * h * w, oc)).astype("float32") + + +class TestFCMKLDNNOp(OpTest): + def setUp(self): + self.op_type = "fc" + self.use_mkldnn = True + self.with_bias = True + self.matrix = MatrixGenerate(1, 10, 15, 3, 3) + + self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights} + + self.attrs = { + 'use_mkldnn': self.use_mkldnn, + 'with_bias': self.with_bias + } + + self.outputs = { + 'Out': fully_connected_naive(self.matrix.input, self.matrix.weights) + } + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(set(['Input', 'W']), 'Out', max_relative_error=0.9) + + def test_check_grad_no_weight(self): + self.check_grad( + ['Input'], 'Out', max_relative_error=0.5, no_grad_set=set('W')) + + +class TestFCMKLDNNOp1(TestFCMKLDNNOp): + def init_op_type(self): + self.matrix = MatrixGenerate(2, 15, 48, 2, 2) + + +class TestFCMKLDNNOp2(TestFCMKLDNNOp): + def init_op_type(self): + self.matrix = MatrixGenerate(2, 32, 40, 1, 1) + + +class TestFCMKLDNNOp3(TestFCMKLDNNOp): + def init_op_type(self): + self.matrix = MatrixGenerate(2, 2, 4, 1, 1) + + +class TestFCMKLDNNOp4(TestFCMKLDNNOp): + def init_op_type(self): + self.with_bias = False + self.matrix = MatrixGenerate(2, 32, 48, 2, 2) + + +class TestFCMKLDNNOp5(TestFCMKLDNNOp): + def init_op_type(self): + self.with_bias = False + self.matrix = 
MatrixGenerate(2, 32, 1000, 6, 6) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 186b91c226..460eb3b349 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -471,6 +471,7 @@ class Input(Cfg): maxout=None, spp=None, pad=None, + upsample=None, format=None, nnz=None, is_static=None, @@ -983,6 +984,13 @@ class Pad(Cfg): self.add_keys(locals()) +@config_class +class Upsample(Cfg): + def __init__(self, scale, scale_y, pad_out_x, pad_out_y, upsample_size, + upsample_size_y): + self.add_keys(locals()) + + @config_class class Norm(Cfg): def __init__(self, @@ -2380,6 +2388,46 @@ class SpatialPyramidPoolLayer(LayerBase): self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels) +@config_layer('upsample') +class UpsampleLayer(LayerBase): + def __init__(self, name, inputs, **xargs): + super(UpsampleLayer, self).__init__( + name, 'upsample', 0, inputs=inputs, **xargs) + + input_layer = self.get_input_layer(0) + image_conf = self.config.inputs[0].upsample_conf.image_conf + image_conf.img_size = input_layer.width + image_conf.img_size_y = input_layer.height + image_conf.channels = input_layer.size / (input_layer.width * + input_layer.height) + + upsample = self.inputs[0].upsample + output_x = 0 + output_y = 0 + output_size = 0 + + if upsample.scale: + self.config.inputs[0].upsample_conf.scale = upsample.scale + self.config.inputs[0].upsample_conf.scale_y = upsample.scale_y + output_x = input_layer.width * upsample.scale + output_y = input_layer.height * upsample.scale_y + self.config.inputs[0].upsample_conf.pad_out_x = upsample.pad_out_x + self.config.inputs[0].upsample_conf.pad_out_y = upsample.pad_out_y + if upsample.upsample_size: + self.config.inputs[ + 0].upsample_conf.upsample_size = upsample.upsample_size + self.config.inputs[ + 0].upsample_conf.upsample_size_y = upsample.upsample_size_y + output_x = 
upsample.upsample_size + output_y = upsample.upsample_size_y + + output_size = image_conf.channels * output_x * output_y + + self.set_layer_height_width(output_y, output_x) + self.set_layer_depth(input_layer.depth) + self.set_layer_size(output_size) + + @config_layer('pad') class PadLayer(LayerBase): def __init__(self, name, inputs, **xargs): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 3684d1e8f7..ebc31b23e0 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -148,6 +148,7 @@ __all__ = [ 'resize_layer', 'sub_seq_layer', 'scale_sub_region_layer', + 'upsample_layer', 'factorization_machine', ] @@ -166,6 +167,7 @@ class LayerType(object): SEQUENCE_RESHAPE = 'seqreshape' POOLING_MAX = 'max' POOLING_AVG = 'average' + UPSAMPLE_LAYER = 'upsample' FC_LAYER = 'fc' COST = 'cost' COSINE_SIM_VEC = 'cos_vm' @@ -3014,6 +3016,83 @@ def img_pool3d_layer(input, size=l.config.size) +@wrap_name_default("upsample") +@layer_support() +def upsample_layer(input, + name=None, + scale=None, + scale_y=None, + upsample_size=None, + upsample_size_y=None, + pad_out_x=False, + pad_out_y=False, + layer_attr=None): + """ + The DePooling process. + Inputs should be a list of length 2. The first input is a layer, + and the second input should be the MaxWithMaskPoolingLayer + + The example usage is: + + .. code-block:: python + pool1 = paddle.v2.layer.img_pool(input=input, pool_size=2, stride=2, + pool_type=paddle.pooling.MaxWithMask()) + upsample = paddle.v2.layer.upsample(input=[layer1, pool1]) + + :param name: The name of this layer. It is optional. + :type name: basestring + :param input: contains an input layer and a MaxWithMaskPoolingLayer + :type input: list | tuple | collections.Sequence + :param scale: outputSize = scale * inputSize + :type scale: int | list | tuple | . 
+ :param scale_y: scale_y will be equal to scale if its value is None. + :type scale_y: int | None. + :param upsample_size: specify the outputSize. + :type upsample_size: int | list | tuple. + :param upsample_size_y: specify the y dimension outputSize. + :type upsample_size_y: int. + :param pad_out_x: specify exact x dimension size. This parameter only works when scale is 2 + :type pad_out_x: bool. + :param pad_out_y: specify exact y dimension size. This parameter only works when scale is 2 + :type pad_out_y: bool. + :param layer_attr: Extra Layer Attribute. + :type layer_attr: ExtraLayerAttribute + :return: LayerOutput object. + :rtype: LayerOutput + """ + + assert (scale is not None) or (upsample_size is not None), \ + 'scale or upsample_size, there must be one to be designated' + + assert len(input) == 2, 'layer input size must be 2' + + assert input[1].layer_type == LayerType.POOL_LAYER, \ + 'the second input should be the MaxPoolWithMaskLayer' + + scale_y = scale \ + if scale_y is None else scale_y + upsample_size_y = upsample_size \ + if upsample_size_y is None else upsample_size_y + + layer_type = LayerType.UPSAMPLE_LAYER + + layer = Layer( + name=name, + type=layer_type, + inputs=[ + Input( + input[0].name, + upsample=Upsample(scale, scale_y, pad_out_x, pad_out_y, + upsample_size, upsample_size_y)), + Input(input[1].name) + ], + **ExtraLayerAttribute.to_kwargs(layer_attr)) + + sz = layer.config.size + + return LayerOutput(name, layer_type=layer_type, parents=input, size=sz) + + +@wrap_name_default("spp") +@layer_support() +def spp_layer(input,