@@ -15,7 +15,7 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
 #include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -24,7 +24,7 @@ namespace operators {
 namespace math {
 
 template <typename T>
-__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
+__global__ void ConcatKernel(T** inputs, const int* input_cols, int col_size,
                              const int output_rows, const int output_cols,
                              T* output) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -50,7 +50,7 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
 }
 
 template <typename T>
-__global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
+__global__ void ConcatKernel(T** inputs_data, const int fixed_in_col,
                              const int out_rows, const int out_cols,
                              T* output_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -67,7 +67,7 @@ __global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
 }
 
 template <typename T>
-__global__ void KernelConcatGrad(const T* input_data, const int in_row,
-                                 const int in_col, const int* out_cols,
-                                 int out_cols_size, T** outputs_data) {
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int* out_cols,
+                            int out_cols_size, T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -94,7 +94,7 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
 }
 
 template <typename T>
-__global__ void KernelConcatGrad(const T* input_data, const int in_row,
-                                 const int in_col, const int fixed_out_col,
-                                 T** outputs_data) {
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int fixed_out_col,
+                            T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -170,11 +170,11 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
     if (sameShape) {
-      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
+      ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
          dev_ins_data, in_col, out_row, out_col, output->data<T>());
     } else {
       const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
-      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
+      ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
          dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
          out_row, out_col, output->data<T>());
     }
@@ -189,7 +189,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
  * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
-class ConcatGradFunctor<platform::CUDADeviceContext, T> {
+class SplitFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
@@ -248,11 +248,11 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
     if (sameShape) {
-      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
+      SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
          input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
     } else {
       const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
-      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
+      SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
          input.data<T>(), in_row, in_col, dev_outs_col_data,
          static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
     }
@@ -264,7 +264,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 
 #define DEFINE_FUNCTOR(type) \
   template class ConcatFunctor<platform::CUDADeviceContext, type>; \
-  template class ConcatGradFunctor<platform::CUDADeviceContext, type>
+  template class SplitFunctor<platform::CUDADeviceContext, type>
 
 FOR_ALL_TYPES(DEFINE_FUNCTOR);
 
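
Note for reviewers: both concat kernels flatten every tensor to a 2-D view (rows before the concat axis, columns from the axis onward) and copy elements through a 2-D grid-stride loop. The following standalone sketch of that indexing is not code from this diff; it is a simplified illustration with a hypothetical name (ConcatKernelSketch), float elements only, and same-shape inputs as in the fixed-column ConcatKernel variant above.

#include <cuda_runtime.h>

// Sketch: with num_inputs same-shape inputs, output column x belongs to
// input x / in_col at local column x % in_col. Each thread strides over
// columns (x) and rows (y), so any launch configuration covers the output.
__global__ void ConcatKernelSketch(const float* const* inputs, int num_inputs,
                                   int rows, int in_col, float* output) {
  int out_col = num_inputs * in_col;
  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < out_col;
       x += blockDim.x * gridDim.x) {
    const float* src = inputs[x / in_col];  // input that owns column x
    int local = x % in_col;                 // column inside that input
    for (int y = blockIdx.y * blockDim.y + threadIdx.y; y < rows;
         y += blockDim.y * gridDim.y) {
      output[y * out_col + x] = src[y * in_col + local];
    }
  }
}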
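The split path (the former concat-gradient path) is the exact inverse mapping: each thread reads one input element and writes it into whichever output owns that column. A matching sketch of the fixed-column SplitKernel case, again a hypothetical simplification rather than the code in this diff; the nullptr guard is an assumption about callers that skip outputs whose gradient is not needed.

#include <cuda_runtime.h>

// Inverse of the concat sketch: input column x goes to output
// x / fixed_out_col at local column x % fixed_out_col.
__global__ void SplitKernelSketch(const float* input, int in_row, int in_col,
                                  int fixed_out_col, float* const* outputs) {
  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < in_col;
       x += blockDim.x * gridDim.x) {
    float* dst = outputs[x / fixed_out_col];  // output that owns column x
    int local = x % fixed_out_col;
    if (dst != nullptr) {  // assumed: null means this output is skipped
      for (int y = blockIdx.y * blockDim.y + threadIdx.y; y < in_row;
           y += blockDim.y * gridDim.y) {
        dst[y * fixed_out_col + local] = input[y * in_col + x];
      }
    }
  }
}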