remove conflict

7 years ago · 5ccab2dc65
parent bbff442eee 1dceb99e86
commit 5ccab2dc65
54 changed files with 934 additions and 610 deletions
--- a/AUTHORS.md
+++ b/AUTHORS.md
@ -2,7 +2,7 @@
 |---|---|
 | backyes | Yan-Fei Wang |
 | beckett1124 | Bin Qi |
-| Canpio | Jia-Yi Feng |
+| JiayiFeng | Jia-Yi Feng |
 | chengxiaohua1105 | Xiao-Hua Cheng |
 | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
 | cxysteven | Xing-Yi Cheng |
--- a/doc/design/kernel_selection.md
+++ b/doc/design/kernel_selection.md
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@ -82,7 +82,7 @@ language = 'zh_CN'

 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build', '**/*_en*', '*_en*']
+exclude_patterns = ['_build', '**/*_en*', '*_en*', 'api/*']

 # The reST default role (used for this markup: `text`) to use for all
 # documents.
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@ -82,7 +82,7 @@ language = None

 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build', '**/*_cn*', '*_cn*']
+exclude_patterns = ['_build', '**/*_cn*', '*_cn*', 'api/*']

 # The reST default role (used for this markup: `text`) to use for all
 # documents.
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@ -11,7 +11,6 @@ if(MOBILE_INFERENCE)
 else()
  add_subdirectory(pserver)
  add_subdirectory(trainer)
-  add_subdirectory(string)
  add_subdirectory(scripts)

  if(WITH_C_API)
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@ -4,3 +4,4 @@ add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(pybind)
 add_subdirectory(inference)
+add_subdirectory(string)
--- a/paddle/fluid/framework/ddim.cc
+++ b/paddle/fluid/framework/ddim.cc
@ -314,5 +314,15 @@ DDim stride(const DDim& ddim) {
  }
  return framework::make_ddim(strides);
 }
+
+DDim stride_numel(const framework::DDim& ddim) {
+  // numel[d] = product of dims d..rank-1, accumulated from the last axis.
+  const int rank = static_cast<int>(ddim.size());
+  std::vector<int64_t> numel(rank);
+  numel[rank - 1] = ddim[rank - 1];
+  for (int d = rank - 1; d > 0; --d) numel[d - 1] = numel[d] * ddim[d - 1];
+  return framework::make_ddim(numel);
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
@ -125,6 +125,8 @@ DDim flatten_to_2d(const DDim& src, int num_col_dims);
 DDim flatten_to_1d(const DDim& src);

 DDim stride(const DDim& ddim);
+
+DDim stride_numel(const DDim& ddim);
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/string/piece.h"
+#include "paddle/fluid/string/piece.h"

 namespace paddle {
 namespace framework {
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@ -37,9 +37,8 @@ class Vector {

  // Fill vector with value. The vector size is `count`.
  explicit Vector(size_t count, const T& value = T()) {
-    if (count == 0) {
-      InitEmpty();
-    } else {
+    InitEmpty();
+    if (count != 0) {
      resize(count);
      T* ptr = begin();
      for (size_t i = 0; i < count; ++i) {
@ -122,6 +121,10 @@ class Vector {
  const T* begin() const { return &this->operator[](0); }
  const T* end() const { return &this->operator[](size()); }

+  // Const-iterator accessors; they forward to the const begin()/end().
+  const T* cbegin() const { return begin(); }
+  const T* cend() const { return end(); }
+
  const T& back() const {
    auto it = end();
    --it;
@ -244,7 +247,9 @@ class Vector {

  bool operator==(const Vector<T>& other) const {
    if (size() != other.size()) return false;
-    for (auto it1 = begin(), it2 = other.begin(); it1 < end(); ++it1, ++it2) {
+    auto it1 = cbegin();
+    auto it2 = other.cbegin();
+    for (; it1 < cend(); ++it1, ++it2) {
      if (*it1 != *it2) {
        return false;
      }
--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
@ -26,10 +26,10 @@ TEST(mixed_vector, CPU_VECTOR) {
  for (int i = 0; i < 10; ++i) {
    tmp.push_back(i);
  }
-  ASSERT_EQ(tmp.size(), 10);
+  ASSERT_EQ(tmp.size(), 10UL);
  vec<int> tmp2;
  tmp2 = tmp;
-  ASSERT_EQ(tmp2.size(), 10);
+  ASSERT_EQ(tmp2.size(), 10UL);
  for (int i = 0; i < 10; ++i) {
    ASSERT_EQ(tmp2[i], i);
    ASSERT_EQ(tmp2[i], tmp[i]);
@ -58,7 +58,7 @@ TEST(mixed_vector, GPU_VECTOR) {
  for (int i = 0; i < 10; ++i) {
    tmp.push_back(i);
  }
-  ASSERT_EQ(tmp.size(), 10);
+  ASSERT_EQ(tmp.size(), 10UL);
  paddle::platform::CUDAPlace gpu(0);

  multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu));
@ -79,7 +79,7 @@ TEST(mixed_vector, MultiGPU) {
  for (int i = 0; i < 10; ++i) {
    tmp.push_back(i);
  }
-  ASSERT_EQ(tmp.size(), 10);
+  ASSERT_EQ(tmp.size(), 10UL);
  paddle::platform::CUDAPlace gpu0(0);
  paddle::platform::SetDeviceId(0);
  multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
@ -91,3 +91,10 @@ TEST(mixed_vector, MultiGPU) {
    ASSERT_EQ(tmp[i], i * 100);
  }
 }
+
+TEST(mixed_vector, InitWithCount) {  // Vector(count, value) fills every slot.
+  paddle::framework::Vector<int> filled(10, 10);
+  for (int idx = 0; idx != 10; ++idx) {
+    ASSERT_EQ(filled[idx], 10);
+  }
+}
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@ -18,7 +18,7 @@ limitations under the License. */
 #include <mutex>   // for call_once
 #include "glog/logging.h"
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/string/printf.h"
+#include "paddle/fluid/string/printf.h"

 DEFINE_bool(benchmark, false,
            "Doing memory benchmark. It will make deleting scope synchronized, "
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@ -28,17 +28,18 @@ class ConcatKernel : public framework::OpKernel<T> {
    auto ins = ctx.MultiInput<framework::Tensor>("X");
    auto* out = ctx.Output<framework::Tensor>("Out");
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
-    const size_t n = ins.size();
+    auto place = ctx.GetPlace();
+    out->mutable_data<T>(place);
+
+    auto out_stride = framework::stride_numel(out->dims());
+
    size_t output_offset = 0;
-    out->mutable_data<T>(ctx.GetPlace());
-    auto out_stride = framework::stride(out->dims());
-    for (size_t i = 0; i < n; i++) {
-      auto& in = ins[i];
-      auto axis_dim = in->dims()[axis];
-      auto in_stride = framework::stride(in->dims());
-      StridedMemcpy<T>(ctx.device_context(), in->data<T>(), in_stride,
-                       in->dims(), out_stride, out->data<T>() + output_offset);
-      output_offset += axis_dim * in_stride[axis];
+    for (auto* in : ins) {
+      auto in_stride = framework::stride_numel(in->dims());
+      StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
+                                  out->data<T>() + output_offset, out_stride,
+                                  in->data<T>(), in_stride);
+      output_offset += in_stride[axis];
    }
  }
 };
@ -50,17 +51,16 @@ class ConcatGradKernel : public framework::OpKernel<T> {
    auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
-    const size_t n = outs.size();
    size_t input_offset = 0;
-    auto in_stride = framework::stride(in->dims());
-    for (size_t i = 0; i < n; i++) {
-      auto& out = outs[i];
+    auto in_stride = framework::stride_numel(in->dims());
+
+    for (auto& out : outs) {
      out->mutable_data<T>(ctx.GetPlace());
-      size_t axis_dim = out->dims()[axis];
-      auto out_stride = framework::stride(out->dims());
-      StridedMemcpy<T>(ctx.device_context(), in->data<T>() + input_offset,
-                       in_stride, out->dims(), out_stride, out->data<T>());
-      input_offset += axis_dim * in_stride[axis];
+      auto out_stride = framework::stride_numel(out->dims());
+      StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
+                                  out_stride, in->data<T>() + input_offset,
+                                  in_stride);
+      input_offset += out_stride[axis];
    }
  }
 };
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@ -27,7 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/detail/grpc_server.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
 #include "paddle/fluid/operators/detail/simple_block_queue.h"
-#include "paddle/string/printf.h"
+#include "paddle/fluid/string/printf.h"

 namespace paddle {
 namespace operators {
@ -101,11 +101,15 @@ class ListenAndServOp : public framework::OperatorBase {

    // TODO(typhoonzero): change this to a while_op for every cluster-batch.
    bool exit_flag = false;
+    // Record received sparse variables, so that
+    // we could reset those after execute optimize program
+    std::vector<framework::Variable *> sparse_vars;
    while (!exit_flag) {
      // Get from multiple trainers, we don't care about the order in which
      // the gradients arrives, just add suffix 0~n and merge the gradient.
      rpc_service_->SetCond(0);
      size_t recv_var_cnt = 0;
+      size_t update_param_cnt = 0;
      int batch_barrier = 0;
      while (batch_barrier != fan_in) {
        const detail::MessageWithName &v = rpc_service_->Get();
@ -126,13 +130,14 @@ class ListenAndServOp : public framework::OperatorBase {
          std::string param_var_name;
          if (it != grad_list.end()) {
            param_var_name = param_list[it - grad_list.begin()];
+            update_param_cnt++;
+            VLOG(3) << "received grad: " << grad_var_name
+                    << " updating param: " << param_var_name;
          } else {
-            LOG(ERROR) << "grad has no paired param:" << grad_var_name;
+            VLOG(3) << "received variable: " << grad_var_name
+                    << " no need to update param";
          }
-          VLOG(3) << "received grad: " << grad_var_name
-                  << " updating param: " << param_var_name;
-
-          if (fan_in > 1) {
+          if (fan_in > 1 && !param_var_name.empty()) {
            grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
          }
          auto *var = recv_scope.FindVar(grad_var_name);
@ -141,23 +146,35 @@ class ListenAndServOp : public framework::OperatorBase {
            PADDLE_THROW("Can not find server side var");
          }
          detail::DeserializeFromMessage(v.second, dev_ctx, var);
+          if (var->IsType<framework::SelectedRows>()) {
+            sparse_vars.push_back(var);
+          }
        }
      }
      VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
-      // TODO(Yancey1989): merge SelectedRows variables here
      if (exit_flag) {
        rpc_service_->ShutDown();
      }
-
+      VLOG(3) << "run optimize graph...";
      try {
        executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
                     false /*create_local_scope*/, false /*create_vars*/);
      } catch (std::exception &e) {
        LOG(ERROR) << "run sub program error " << e.what();
      }
+
+      // Reset the received sparse variables so that the sum operator skips
+      // any sparse input whose rows are empty in the next mini-batch.
+      //
+      // TODO(Yancey1989): move the reset action into an operator; we should
+      // not have any hidden logic in the operator.
+      for (auto &var : sparse_vars) {
+        var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
+      }
      rpc_service_->SetCond(1);
-      rpc_service_->WaitClientGet(recv_var_cnt);
+      rpc_service_->WaitClientGet(update_param_cnt);
      grads_counter_.clear();
+      sparse_vars.clear();
    }  // while(true)
  }

--- a/paddle/fluid/operators/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/multiclass_nms_op.cc
@ -38,22 +38,22 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
    auto box_dims = ctx->GetInputDim("BBoxes");
    auto score_dims = ctx->GetInputDim("Scores");

-    PADDLE_ENFORCE_EQ(box_dims.size(), 2,
-                      "The rank of Input(BBoxes) must be 2.");
+    PADDLE_ENFORCE_EQ(box_dims.size(), 3,
+                      "The rank of Input(BBoxes) must be 3.");
    PADDLE_ENFORCE_EQ(score_dims.size(), 3,
                      "The rank of Input(Scores) must be 3.");
-    PADDLE_ENFORCE_EQ(box_dims[1], 4,
+    PADDLE_ENFORCE_EQ(box_dims[2], 4,
                      "The 2nd dimension of Input(BBoxes) must be 4, "
                      "represents the layout of coordinate "
                      "[xmin, ymin, xmax, ymax]");
-    PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2],
+    PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2],
                      "The 1st dimensiong of Input(BBoxes) must be equal to "
                      "3rd dimension of Input(Scores), which represents the "
                      "predicted bboxes.");

    // Here the box_dims[0] is not the real dimension of output.
    // It will be rewritten in the computing kernel.
-    ctx->SetOutputDim("Out", {box_dims[0], 6});
+    ctx->SetOutputDim("Out", {box_dims[1], 6});
  }

 protected:
@ -260,15 +260,20 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
    int64_t batch_size = score_dims[0];
    int64_t class_num = score_dims[1];
    int64_t predict_dim = score_dims[2];
+    int64_t box_dim = boxes->dims()[2];

    std::vector<std::map<int, std::vector<int>>> all_indices;
    std::vector<size_t> batch_starts = {0};
    for (int64_t i = 0; i < batch_size; ++i) {
      Tensor ins_score = scores->Slice(i, i + 1);
      ins_score.Resize({class_num, predict_dim});
+
+      Tensor ins_boxes = boxes->Slice(i, i + 1);
+      ins_boxes.Resize({predict_dim, box_dim});
+
      std::map<int, std::vector<int>> indices;
      int num_nmsed_out = 0;
-      MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out);
+      MultiClassNMS(ctx, ins_score, ins_boxes, indices, num_nmsed_out);
      all_indices.push_back(indices);
      batch_starts.push_back(batch_starts.back() + num_nmsed_out);
    }
@ -282,11 +287,15 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
      for (int64_t i = 0; i < batch_size; ++i) {
        Tensor ins_score = scores->Slice(i, i + 1);
        ins_score.Resize({class_num, predict_dim});
+
+        Tensor ins_boxes = boxes->Slice(i, i + 1);
+        ins_boxes.Resize({predict_dim, box_dim});
+
        int64_t s = batch_starts[i];
        int64_t e = batch_starts[i + 1];
        if (e > s) {
          Tensor out = outs->Slice(s, e);
-          MultiClassOutput(ins_score, *boxes, all_indices[i], &out);
+          MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out);
        }
      }
    }
@ -303,9 +312,9 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
  MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("BBoxes",
-             "(Tensor) A 2-D Tensor with shape [M, 4] represents the "
-             "predicted locations of M bounding bboxes. Each bounding box "
-             "has four coordinate values and the layout is "
+             "(Tensor) A 3-D Tensor with shape [N, M, 4] represents the "
+             "predicted locations of M bounding bboxes, N is the batch size. "
+             "Each bounding box has four coordinate values and the layout is "
             "[xmin, ymin, xmax, ymax].");
    AddInput("Scores",
             "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@ -24,6 +24,22 @@ limitations under the License. */

 namespace paddle {
 namespace operators {
+// Tells whether the send-side variable `varname` has been initialized.
+static bool IsVariableInitialized(const framework::Scope& scope,
+                                  const std::string& varname) {
+  auto* ptr = scope.FindVar(varname);
+  PADDLE_ENFORCE_NOT_NULL(ptr, "Can not find variable '%s' in the send side.",
+                          varname);
+  if (ptr->IsType<framework::LoDTensor>()) {
+    return ptr->Get<framework::LoDTensor>().IsInitialized();
+  }
+  if (ptr->IsType<framework::SelectedRows>()) {
+    return ptr->Get<framework::SelectedRows>().value().IsInitialized();
+  }
+  PADDLE_THROW("Variable type in send side should be in "
+               "[LodTensor, SelectedRows]");
+  return false;
+}

 class SendOp : public framework::OperatorBase {
 public:
@ -51,8 +67,12 @@ class SendOp : public framework::OperatorBase {
    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();

    for (size_t i = 0; i < ins.size(); i++) {
-      VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
-      rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+      if (IsVariableInitialized(scope, ins[i])) {
+        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
+        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+      } else {
+        VLOG(3) << "don't send no-initialied variable: " << ins[i];
+      }
    }
    PADDLE_ENFORCE(rpc_client->Wait());

--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/string/printf.h"
+#include "paddle/fluid/string/printf.h"

 USE_NO_KERNEL_OP(send);
 USE_NO_KERNEL_OP(listen_and_serv);
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
@ -29,7 +29,9 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasOutput("Out"));
    PADDLE_ENFORCE(ctx->HasInput("Y"));
    framework::DDim out_dim;
-    out_dim = ctx->GetInputDim("Y");
+    auto y_dim = ctx->GetInputDim("Y");
+    out_dim = ctx->GetInputDim("X");
+    out_dim[0] = y_dim[0];
    ctx->ShareLoD("Y", "Out");
    ctx->SetOutputDim("Out", out_dim);
  }
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <chrono>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
@ -27,18 +28,18 @@ class SplitOpKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
-    auto in_stride = framework::stride(in->dims());
+    auto in_stride = framework::stride_numel(in->dims());
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
-    const size_t n = outs.size();
+    auto place = ctx.GetPlace();
+
    size_t input_offset = 0;
-    for (size_t i = 0; i < n; i++) {
-      auto& out = outs[i];
+    for (auto& out : outs) {
      out->mutable_data<T>(ctx.GetPlace());
-      size_t axis_dim = out->dims()[axis];
-      auto out_stride = framework::stride(out->dims());
-      StridedMemcpy<T>(ctx.device_context(), in->data<T>() + input_offset,
-                       in_stride, out->dims(), out_stride, out->data<T>());
-      input_offset += axis_dim * in_stride[axis];
+      auto out_stride = framework::stride_numel(out->dims());
+      StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
+                                  out_stride, in->data<T>() + input_offset,
+                                  in_stride);
+      input_offset += out_stride[axis];
    }
  }
 };
--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
@ -22,7 +22,7 @@ class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
  SplitSelectedRowsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input SelectedRows.");
-    AddOutput("Out", "The outputs of input SelectedRows.").AsDuplicable();
+    AddOutput("Out", "The outputs of the input SelectedRows.").AsDuplicable();
    AddAttr<std::vector<int>>("height_sections",
                              "Height for each output SelectedRows.")
        .SetDefault(std::vector<int>({}));
@ -56,27 +56,6 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasInput("X"), "SplitSelectedRowsOp must has input X.");
    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
                   "SplitSelectedRowsOp must has output Out.");
-
-    std::vector<int> height_sections =
-        ctx->Attrs().Get<std::vector<int>>("height_sections");
-    int64_t n = ctx->Outputs("Out").size();
-
-    std::vector<framework::DDim> outs_dims;
-    outs_dims.reserve(n);
-
-    // make output dims
-    for (int64_t i = 0; i < n; ++i) {
-      auto dims = ctx->GetInputDim("X");
-      if (height_sections.size()) {
-        PADDLE_ENFORCE_EQ(
-            height_sections.size(), static_cast<size_t>(n),
-            "The size of height section should be the same with height"
-            " section size.");
-        dims[0] = height_sections[i];
-      }
-      outs_dims.push_back(dims);
-    }
-    ctx->SetOutputsDim("Out", outs_dims);
  }
 };

--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
@ -55,6 +55,7 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {

    for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
      auto rows_idx = outs_rows_idx[i];
+      outs[i]->set_height(height_sections[i]);
      if (rows_idx.size() > 0) {
        auto dims = x->GetCompleteDims();
        dims[0] = rows_idx.size();
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@ -41,5 +41,62 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
  StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
  boost::apply_visitor(func, dst_dim);
 }
+
+// Strided numel memory copy from src to dst by the specified axis
+//
+// For example, for a tensor dims [4, 20, 100], the strided numel is
+// [8000, 2000, 100]
+//
+// NOTE: The src and dst tensor should have the same elements
+// except the specified axis.
+template <typename T>
+inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
+                                     int64_t axis, T* dst,
+                                     const framework::DDim& dst_stride_numel,
+                                     const T* src,
+                                     const framework::DDim& src_stride_numel) {
+  // Check the ranks first so that no stride is indexed before validation.
+  PADDLE_ENFORCE_EQ(src_stride_numel.size(), dst_stride_numel.size(),
+                    "src and dst tensor should have the same dims size.");
+  // `before` chunks are copied; each chunk holds `src_after` elements.
+  int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
+  int64_t src_after = src_stride_numel[axis];
+  int64_t dst_after = dst_stride_numel[axis];
+  auto place = ctx.GetPlace();
+
+  // Visit every dimension so that those after `axis` are validated too.
+  for (int64_t i = 0; i < src_stride_numel.size(); ++i) {
+    if (i < axis) {
+      PADDLE_ENFORCE_EQ(src_stride_numel[i] / src_stride_numel[axis],
+                        dst_stride_numel[i] / dst_stride_numel[axis],
+                        "src and dst should have the same elements "
+                        "except the specified axis.");
+    } else if (i > axis) {
+      PADDLE_ENFORCE_EQ(src_stride_numel[i], dst_stride_numel[i],
+                        "src and dst should have the same elements "
+                        "except the specified axis.");
+    }
+  }
+
+  for (int64_t i = 0; i < before; ++i) {
+    if (platform::is_cpu_place(place)) {
+      auto& cpu_place = boost::get<platform::CPUPlace>(place);
+      memory::Copy(cpu_place, dst + i * dst_after, cpu_place,
+                   src + i * src_after, sizeof(T) * src_after);
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
+      auto& cuda_ctx =
+          reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
+      memory::Copy(gpu_place, dst + i * dst_after, gpu_place,
+                   src + i * src_after, sizeof(T) * src_after,
+                   cuda_ctx.stream());
+#else
+      PADDLE_THROW("Paddle is not compiled with GPU");
+#endif
+    }
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@ -116,7 +116,9 @@ class SumKernel : public framework::OpKernel<T> {
      int64_t offset = 0;
      for (int i = 0; i < N; i++) {
        auto &sel_row = get_selected_row(i);
-
+        if (!sel_row.value().IsInitialized() || sel_row.rows().size() == 0) {
+          continue;
+        }
        PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
        functor(context.template device_context<DeviceContext>(), sel_row,
                offset, out);
--- a/paddle/fluid/operators/target_assign_op.cc
+++ b/paddle/fluid/operators/target_assign_op.cc
--- a/paddle/fluid/operators/target_assign_op.cu
+++ b/paddle/fluid/operators/target_assign_op.cu
@ -17,39 +17,41 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T>
+template <typename T, typename WT>
 __global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod,
-                                      const int num, const int num_prior_box,
-                                      const int background_label,
-                                      int* out_label, T* out_label_wt) {
+                                      const int N, const int M, const int K,
+                                      const int mismatch_value, T* out,
+                                      WT* out_wt) {
  int bidx = blockIdx.x;
  int st = lod[bidx];
  int ed = lod[bidx + 1];

-  int row_start = bidx * num_prior_box;
+  int row_start = bidx * M;
  for (int i = st + threadIdx.x; i < ed; i += blockDim.x) {
    int id = row_start + neg_indices[i];
-    out_label[id] = background_label;
-    out_label_wt[id] = 1.;
+    for (int k = 0; k < K; ++k) {
+      out[id * K + k] = T(mismatch_value);
+      out_wt[id * K + k] = WT(1.);
+    }
  }
 }

-template <typename T>
-struct NegTargetAssignFunctor<platform::CUDADeviceContext, T> {
+template <typename T, typename WT>
+struct NegTargetAssignFunctor<platform::CUDADeviceContext, T, WT> {
  void operator()(const platform::CUDADeviceContext& ctx,
-                  const int* neg_indices, const size_t* lod, const int num,
-                  const int num_prior_box, const int background_label,
-                  int* out_label, T* out_label_wt) {
+                  const int* neg_indices, const size_t* lod, const int N,
+                  const int M, const int K, const int mismatch_value, T* out,
+                  WT* out_wt) {
    const int block_size = 256;
-    const int grid_size = num;
-    NegTargetAssignKernel<T><<<grid_size, block_size, 0, ctx.stream()>>>(
-        neg_indices, lod, num, num_prior_box, background_label, out_label,
-        out_label_wt);
+    const int grid_size = N;
+    NegTargetAssignKernel<T, WT><<<grid_size, block_size, 0, ctx.stream()>>>(
+        neg_indices, lod, N, M, K, mismatch_value, out, out_wt);
  }
 };

-template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float>;
-template struct NegTargetAssignFunctor<platform::CUDADeviceContext, double>;
+template struct NegTargetAssignFunctor<platform::CUDADeviceContext, int, float>;
+template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float,
+                                       float>;

 }  // namespace operators
 }  // namespace paddle
@ -57,5 +59,5 @@ template struct NegTargetAssignFunctor<platform::CUDADeviceContext, double>;
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
    target_assign,
-    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, int, float>,
+    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float, float>);
--- a/Show More
+++ b/Show More