From a24691a2a9299e3ee3055aa309dc3d3749572aaa Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 31 Oct 2018 20:49:56 +0800 Subject: [PATCH] add nearest neighbor interpolation operator cpu kernel --- .../operators/nearest_neighbor_interp_op.cc | 115 ++++++++++ .../operators/nearest_neighbor_interp_op.cu | 210 ++++++++++++++++++ .../operators/nearest_neighbor_interp_op.h | 130 +++++++++++ 3 files changed, 455 insertions(+) create mode 100644 paddle/fluid/operators/nearest_neighbor_interp_op.cc create mode 100644 paddle/fluid/operators/nearest_neighbor_interp_op.cu create mode 100644 paddle/fluid/operators/nearest_neighbor_interp_op.h diff --git a/paddle/fluid/operators/nearest_neighbor_interp_op.cc b/paddle/fluid/operators/nearest_neighbor_interp_op.cc new file mode 100644 index 0000000000..4e29fe5ac3 --- /dev/null +++ b/paddle/fluid/operators/nearest_neighbor_interp_op.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/nearest_neighbor_interp_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class NearestNeighborInterpOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of BilinearInterOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of BilinearInterOp should not be null."); + + auto dim_x = ctx->GetInputDim("X"); // NCHW format + int out_h = ctx->Attrs().Get("out_h"); + int out_w = ctx->Attrs().Get("out_w"); + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4"); + + if (ctx->HasInput("OutSize")) { + auto out_size_dim = ctx->GetInputDim("OutSize"); + PADDLE_ENFORCE_EQ(out_size_dim.size(), 1, + "OutSize's dimension size must be 1"); + PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2"); + } + std::vector dim_out({dim_x[0], dim_x[1], out_h, out_w}); + ctx->SetOutputDim("Out", framework::make_ddim(dim_out)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + } +}; + +class NearestNeighborInterpOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of nearest neighbor interpolation, " + "This is a 4-D tensor with shape of (N x C x h x w)"); + AddInput("OutSize", + "This is a 1-D tensor with two number. " + "The first number is height and the second number is width.") + .AsDispensable(); + AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)"); + + AddAttr("out_h", "output height of bilinear interpolation op."); + AddAttr("out_w", "output width of bilinear interpolation op."); + AddComment(R"DOC( + Nearest neighbor interpolation is to perform nearest neighbor interpolation + in bot the 3rd dimention(in height direction) and the 4th dimention(in width + direction) on input tensor. + + For details, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation + )DOC"); + } +}; + +class NearestNeighborInterpOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto dim_x = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(nearest_neighbor_interp, ops::NearestNeighborInterpOp, + ops::NearestNeighborInterpOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(nearest_neighbor_interp_grad, + ops::NearestNeighborInterpOpGrad); +REGISTER_OP_CPU_KERNEL(nearest_neighbor_interp, + ops::NearestNeighborInterpKernel, + ops::NearestNeighborInterpKernel); +REGISTER_OP_CPU_KERNEL(nearest_neighbor_interp_grad, + ops::NearestNeighborInterpGradKernel); diff --git a/paddle/fluid/operators/nearest_neighbor_interp_op.cu b/paddle/fluid/operators/nearest_neighbor_interp_op.cu new file mode 100644 index 0000000000..11002d2e1e --- /dev/null +++ b/paddle/fluid/operators/nearest_neighbor_interp_op.cu @@ -0,0 +1,210 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/nearest_neighbor_interp_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +template +using EigenTensor = framework::EigenTensor; +using framework::Tensor; + +template +__global__ void KeBilinearInterpFw( + const T* in, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const T ratio_h, const T ratioW) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < nthreads) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + int channel_id = out_id_w / out_img_size; + + int out_img_idy = (out_id_w % out_img_size) / out_img_w; + int in_img_idy = ratio_h * out_img_idy; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T h1lambda = ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int out_img_idx = tid % out_img_w; + int in_img_idx = ratioW * out_img_idx; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T w1lambda = ratioW * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + + // bilinear interpolation + out[out_id_h * output_w + out_id_w] = + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w] + + w1lambda * in_pos[h_id * in_img_w + w_id]); + } +} + +template +__global__ void KeBilinearInterpBw( + T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, + const size_t input_w, const T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const T ratio_h, const T ratioW) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < nthreads) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + int channel_id = out_id_w / out_img_size; + + int out_img_idy = (out_id_w % out_img_size) / out_img_w; + int in_img_idy = ratio_h * out_img_idy; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T h1lambda = ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int out_img_idx = tid % out_img_w; + int in_img_idx = ratioW * out_img_idx; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T w1lambda = ratioW * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + atomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); + atomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); + atomicAdd(&in_pos[h_id * in_img_w], h1lambda * w2lambda * out_pos[0]); + atomicAdd(&in_pos[h_id * in_img_w + w_id], + h1lambda * w1lambda * out_pos[0]); + } +} + +template +class NearestNeighborInterpOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto* input_t = ctx.Input("X"); // float tensor + auto* output_t = ctx.Output("Out"); // float tensor + auto* input = input_t->data(); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + auto out_dims = output_t->dims(); + auto out_size_t = ctx.Input("OutSize"); + if (out_size_t != nullptr) { + Tensor sizes; + framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_h = size_data[0]; + out_w = size_data[1]; + } + auto* output = output_t->mutable_data( + {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace()); + + int batch_size = input_t->dims()[0]; + int channels = input_t->dims()[1]; + int in_h = input_t->dims()[2]; + int in_w = input_t->dims()[3]; + + int in_hw = in_h * in_w; + int out_hw = out_h * out_w; + int in_chw = channels * in_hw; + int out_chw = channels * out_hw; + + T ratio_h = (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + T ratio_w = (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + + if (in_h == out_h && in_w == out_w) { + memcpy(output, input, input_t->numel() * sizeof(T)); + } else { + int threadNum = batch_size * out_chw; + int blocks = (threadNum + 1024 - 1) / 1024; + + KeBilinearInterpFw< + T><<>>( + input, in_h, in_w, batch_size, in_chw, output, out_h, out_w, + batch_size, out_chw, channels, ratio_h, ratio_w); + } + } +}; + +template +class NearestNeighborInterpGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_input_t = ctx.Output(framework::GradVarName("X")); + auto* d_output_t = ctx.Input(framework::GradVarName("Out")); + auto* d_output = d_output_t->data(); + auto* d_input = d_input_t->mutable_data(ctx.GetPlace()); + + auto& device_ctx = + ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, d_input_t, static_cast(0.0)); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + + auto out_size_t = ctx.Input("OutSize"); + if (out_size_t != nullptr) { + Tensor sizes; + framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_h = size_data[0]; + out_w = size_data[1]; + } + + int batch_size = d_input_t->dims()[0]; + int channels = d_input_t->dims()[1]; + int in_h = d_input_t->dims()[2]; + int in_w = d_input_t->dims()[3]; + + int in_hw = in_h * in_w; + int out_hw = out_h * out_w; + int in_chw = channels * in_hw; + int out_chw = channels * out_hw; + + T ratio_h = (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + T ratio_w = (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + + if (in_h == out_h && in_w == out_w) { + memcpy(d_input, d_output, d_input_t->numel() * sizeof(T)); + } else { + int threadNum = batch_size * out_chw; + int blocks = (threadNum + 1024 - 1) / 1024; + + KeBilinearInterpBw< + T><<>>( + d_input, in_h, in_w, batch_size, in_chw, d_output, out_h, out_w, + batch_size, out_chw, channels, ratio_h, ratio_w); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(nearest_neighbor_interp, + ops::NearestNeighborInterpOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(nearest_neighborinterp_grad, + ops::NearestNeighborInterpGradOpCUDAKernel); diff --git a/paddle/fluid/operators/nearest_neighbor_interp_op.h b/paddle/fluid/operators/nearest_neighbor_interp_op.h new file mode 100644 index 0000000000..5ba12eaa7c --- /dev/null +++ b/paddle/fluid/operators/nearest_neighbor_interp_op.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +using EigenTensor = framework::EigenTensor; +using Tensor = framework::Tensor; + +template +class NearestNeighborInterpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + auto out_size_data = out_size->data(); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + + const int in_n = input->dims()[0]; + const int in_c = input->dims()[1]; + const int in_h = input->dims()[2]; + const int in_w = input->dims()[3]; + + output->mutable_data({in_n, in_c, out_h, out_w}, ctx.GetPlace()); + auto& device_ctx = + ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, output, static_cast(0.0)); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*input, ctx.GetPlace(), output); + return; + } + + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + + auto input_t = EigenTensor::From(*input); + auto output_t = EigenTensor::From(*output); + for (int k = 0; k < out_h; k++) { // loop for images + for (int l = 0; l < out_w; l++) { + int in_k = static_cast(round(ratio_h * k)); + int in_l = static_cast(round(ratio_w * l)); + for (int i = 0; i < in_n; i++) { // loop for batches + for (int j = 0; j < in_c; j++) { // loop for channels + output_t(i, j, k, l) = input_t(i, j, in_k, in_l); + } + } + } + } + } +}; + +template +class NearestNeighborInterpGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + auto out_size_data = out_size->data(); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + + const int in_n = input_grad->dims()[0]; + const int in_c = input_grad->dims()[1]; + const int in_h = input_grad->dims()[2]; + const int in_w = input_grad->dims()[3]; + + auto& device_ctx = + ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, input_grad, static_cast(0.0)); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); + return; + } + + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(*output_grad); + for (int k = 0; k < out_h; k++) { // loop for images + for (int l = 0; l < out_w; l++) { + int in_k = static_cast(round(ratio_h * k)); + int in_l = static_cast(round(ratio_w * l)); + for (int i = 0; i < in_n; i++) { // loop for batches + for (int j = 0; j < in_c; j++) { // loop for channels + input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l); + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle