Fix sparse update memory error for distributed training (#8837)

Fix sparse update memory error for distributed training
7 years ago · 8468037918
parent 124b75012e
commit 8468037918
7 changed files with 72 additions and 27 deletions
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@ -24,15 +24,15 @@ limitations under the License. */

 namespace paddle {
 namespace operators {
-static bool IsVariableInitialized(const framework::Scope& scope,
-                                  const std::string& varname) {
+static bool NeedSend(const framework::Scope& scope,
+                     const std::string& varname) {
  auto* var = scope.FindVar(varname);
  PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
                          varname);
  if (var->IsType<framework::LoDTensor>()) {
    return var->Get<framework::LoDTensor>().IsInitialized();
  } else if (var->IsType<framework::SelectedRows>()) {
-    return var->Get<framework::SelectedRows>().value().IsInitialized();
+    return var->Get<framework::SelectedRows>().rows().size() > 0UL;
  } else {
    PADDLE_THROW(
        "Variable type in send side should be in "
@ -67,7 +67,7 @@ class SendOp : public framework::OperatorBase {
    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();

    for (size_t i = 0; i < ins.size(); i++) {
-      if (IsVariableInitialized(scope, ins[i])) {
+      if (NeedSend(scope, ins[i])) {
        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
      } else {
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
@ -39,6 +39,14 @@ class SGDOp : public framework::OperatorWithKernel {
    // and run time.
    ctx->SetOutputDim("ParamOut", param_dim);
  }
+
+ protected:
+  // Chooses the kernel for this op: the data type is taken from the tensor
+  // type of the "Param" input (read as a LoDTensor) and the place from the
+  // execution context.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Param")->type()),
+        ctx.GetPlace());
+  }
 };

 class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
@ -47,6 +47,12 @@ class SGDOpKernel : public framework::OpKernel<T> {
      PADDLE_ENFORCE_EQ(param, param_out);
      auto* grad = ctx.Input<framework::SelectedRows>("Grad");

+      // for distributed training, a sparse var may be empty,
+      // just skip updating.
+      if (grad->rows().size() == 0) {
+        return;
+      }
+
      auto in_height = grad->height();
      auto out_dims = param_out->dims();
      PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
@ -60,13 +66,15 @@ class SGDOpKernel : public framework::OpKernel<T> {
      auto* in_data = in_value.data<T>();
      auto* out_data = param_out->data<T>();
      auto* lr = learning_rate->data<T>();
-
      for (size_t i = 0; i < in_rows.size(); i++) {
+        PADDLE_ENFORCE(in_rows[i] < in_height,
+                       "Input rows index should less than height");
        for (int64_t j = 0; j < in_row_numel; j++) {
          out_data[in_rows[i] * in_row_numel + j] -=
              lr[0] * in_data[i * in_row_numel + j];
        }
      }
+
    } else {
      PADDLE_THROW("Unsupported Variable Type of Grad");
    }
--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
@ -21,15 +21,24 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-static int FindOutIdx(int row, const std::vector<int>& height_sections) {
-  int offset = 0;
-  for (size_t i = 0; i < height_sections.size(); ++i) {
-    if (row >= offset && row < (offset + height_sections[i])) {
-      return i;
+// Returns the index of the section that contains `row`.
+//
+// `abs_sections` holds the absolute start row of every section and is
+// expected to be in ascending order. Section i covers
+// [abs_sections[i], abs_sections[i + 1]); the last section is open-ended, so
+// any row at or beyond its start maps to it, and any row below
+// abs_sections[1] maps to section 0. Returns -1 when `abs_sections` is empty.
+static int FindOutIdx(int row, const std::vector<int>& abs_sections) {
+  for (size_t i = 1; i < abs_sections.size(); ++i) {
+    if (row < abs_sections[i]) {
+      // Explicit cast: avoid an implicit size_t -> int narrowing.
+      return static_cast<int>(i) - 1;
    }
-    offset += height_sections[i];
  }
-  return -1;
+  // Cast before subtracting so an empty vector yields -1 instead of wrapping
+  // around in unsigned arithmetic.
+  return static_cast<int>(abs_sections.size()) - 1;
+}
+
+// Converts per-section heights into absolute start offsets.
+//
+// Entry i of the result is the sum of height_sections[0..i), i.e. the first
+// row of section i; for example {5, 7, 3} becomes {0, 5, 12}. An empty input
+// yields an empty result.
+static std::vector<int> ToAbsoluteSection(
+    const std::vector<int>& height_sections) {
+  // Zero-fill up front: entry 0 is always 0, and for an empty input nothing
+  // is written at all (indexing element 0 of an empty vector is undefined).
+  std::vector<int> abs_sections(height_sections.size(), 0);
+  for (size_t i = 1; i < height_sections.size(); ++i) {
+    abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
+  }
+  return abs_sections;
 }

 template <typename DeviceContext, typename T>
@ -40,16 +49,23 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
    auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
    auto height_sections = ctx.Attr<std::vector<int>>("height_sections");

+    auto abs_sections = ToAbsoluteSection(height_sections);
+
    auto x_rows = x->rows();
    std::vector<std::vector<int>> outs_rows_idx;
+    std::vector<std::vector<int>> outs_dense_idx;
+
    outs_rows_idx.resize(outs.size());
+    outs_dense_idx.resize(outs.size());

    auto row_numel = x->value().numel() / x->value().dims()[0];
    auto src = x->value().data<T>();

+    // split rows index into output sparse vars
    for (size_t i = 0; i < x_rows.size(); ++i) {
-      int out_idx = FindOutIdx(x_rows[i], height_sections);
-      outs_rows_idx[out_idx].push_back(i);
+      int out_idx = FindOutIdx(x_rows[i], abs_sections);
+      outs_rows_idx[out_idx].push_back(x_rows[i]);
+      outs_dense_idx[out_idx].push_back(i);
    }
    auto place = ctx.GetPlace();

@ -61,19 +77,20 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
        dims[0] = rows_idx.size();
        outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
        for (auto idx : rows_idx) {
-          outs[i]->mutable_rows()->push_back(x_rows[idx]);
+          outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
        }
        auto dst = outs[i]->mutable_value()->mutable_data<T>(ctx.GetPlace());
        for (size_t j = 0; j < rows_idx.size(); j++) {
          if (platform::is_cpu_place(place)) {
-            memory::Copy(platform::CPUPlace(), dst + j * row_numel,
-                         platform::CPUPlace(), src + rows_idx[j] * row_numel,
-                         sizeof(T) * row_numel);
+            memory::Copy(
+                platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(),
+                src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel);
          } else {
 #ifdef PADDLE_WITH_CUDA
            auto stream = ctx.cuda_device_context().stream();
            memory::Copy(platform::CUDAPlace(), dst + j * row_numel,
-                         platform::CUDAPlace(), src + rows_idx[j] * row_numel,
+                         platform::CUDAPlace(),
+                         src + outs_dense_idx[i][j] * row_numel,
                         sizeof(T) * row_numel, stream);
 #else
            PADDLE_THROW("Paddle is not compiled with GPU");
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@ -76,10 +76,16 @@ class SumOp : public framework::OperatorWithKernel {
          static_cast<framework::proto::VarType::Type>(dtype),
          ctx.device_context());
    } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
-      return framework::OpKernelType(
-          framework::ToDataType(
-              x_vars[0]->Get<framework::SelectedRows>().value().type()),
-          ctx.device_context());
+      for (auto& var : x_vars) {
+        auto& value = var->Get<framework::SelectedRows>().value();
+        if (value.IsInitialized()) {
+          return framework::OpKernelType(framework::ToDataType(value.type()),
+                                         ctx.device_context());
+        }
+      }
+      // if input sparse vars are not initialized, use a default kernel type.
+      return framework::OpKernelType(framework::proto::VarType::FP32,
+                                     ctx.device_context());
    } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
      for (auto& x_var : x_vars) {
        auto& array = x_var->Get<framework::LoDTensorArray>();
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@ -109,6 +109,12 @@ class SumKernel : public framework::OpKernel<T> {
      in_dim[0] = static_cast<int64_t>(first_dim);

      out_value->Resize(framework::make_ddim(in_dim));
+
+      // if all the input sparse vars are empty, no need to
+      // merge these vars.
+      if (first_dim == 0UL) {
+        return;
+      }
      out_value->mutable_data<T>(context.GetPlace());

      math::SelectedRowsAddTo<DeviceContext, T> functor;
@ -116,7 +122,7 @@ class SumKernel : public framework::OpKernel<T> {
      int64_t offset = 0;
      for (int i = 0; i < N; i++) {
        auto &sel_row = get_selected_row(i);
-        if (!sel_row.value().IsInitialized() || sel_row.rows().size() == 0) {
+        if (sel_row.rows().size() == 0) {
          continue;
        }
        PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
--- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
@ -60,8 +60,8 @@ class TestSpliteSelectedRows(unittest.TestCase):

        # expected output selected rows
        expected_out0_rows = [0, 4]
-        expected_out1_rows = [5, 7]
-        expected_out4_rows = [20]
+        expected_out1_rows = [0, 2]
+        expected_out4_rows = [0]

        op = Operator(
            "split_selected_rows",
@ -101,7 +101,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
        out0_grad_tensor.set(np_array, place)

        out1_grad = scope.var("out1@GRAD").get_selected_rows()
-        rows1 = [7, 5]
+        rows1 = [2, 0]
        out1_grad.set_rows(rows1)
        out1_grad.set_height(height)
        out1_grad_tensor = out1_grad.get_tensor()