// Paddle/paddle/fluid/operators/distributed/parameter_prefetch.cc

//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <set>
#include <string>
#include <vector>

#include "paddle/fluid/operators/distributed/parameter_prefetch.h"

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"

#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"

namespace paddle {
namespace operators {
namespace distributed {

// Short aliases for the framework types referenced in this namespace.
// (The LoDTensor alias used to be declared twice; one declaration suffices.)
using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim;

// Maps `id` to the index of the section it falls into.
//
// `abs_sections` holds the starting offset of every section. The result is
// `k - 1` for the first boundary `abs_sections[k]` (k >= 1) that is strictly
// greater than `id`; when no such boundary exists the last section index is
// returned. Ids smaller than every boundary therefore land in section 0.
//
// For an empty `abs_sections` the result is the unsigned wrap-around of
// `0 - 1`, so callers are expected to pass at least one section.
static size_t GetSectionIndex(int64_t id,
                              const std::vector<int64_t>& abs_sections) {
  const size_t section_count = abs_sections.size();

  // Walk forward past every boundary that `id` has already reached.
  size_t next = 1;
  while (next < section_count && id >= abs_sections[next]) {
    ++next;
  }

  // Either we stopped at a boundary above `id`, or we ran off the end.
  return next < section_count ? next - 1 : section_count - 1;
}

// Converts per-section heights into absolute starting offsets.
//
// Element `i` of the result is the sum of `height_sections[0 .. i-1]`, so the
// first element is always 0 and the result has the same length as the input.
// The sum is accumulated in int64_t. An empty input yields an empty result
// (previously this wrote to element 0 of an empty vector).
static std::vector<int64_t> ToAbsoluteSection(
    const std::vector<int>& height_sections) {
  // Value-initialise to zero so the first offset needs no explicit store and
  // an empty input is handled without touching any element.
  std::vector<int64_t> abs_sections(height_sections.size(), 0);
  for (size_t i = 1; i < height_sections.size(); ++i) {
    abs_sections[i] = abs_sections[i - 1] + height_sections[i - 1];
  }
  return abs_sections;
}

static std::vector<std::vector<int64_t>> SplitIds(
    const std::vector<int64_t>& ids_vector,
    const std::vector<int>& height_section, framework::Scope* scope) {
  std::set<int64_t> all_ids;
  for (auto id : ids_vector) {
    all_ids.insert(id);
  }

  auto abs_sections = ToAbsoluteSection(height_section);
  std::vector<std::vector<int64_t>> splited_ids;
  splited_ids.resize(height_section.size() + 1);
  for (auto& id : all_ids) {
    auto section_index = GetSectionIndex(id, abs_sections);
    splited_ids[section_index].push_back(id - abs_sections[section_index]);
  }
  return splited_ids;
}

// Materialises each section's ids as a LoDTensor variable in `scope`.
//
// For every name in `in_var_names` a variable is created (or fetched) in
// `scope`. When the matching entry of `splited_ids` is non-empty, the tensor
// is allocated on the CPU with shape {number_of_ids, 1} and filled with those
// ids; when it is empty the variable is left without allocated data.
// Enforces that `in_var_names` and `height_section` have the same length.
static void SplitIdsIntoMultipleVarsBySection(
    const std::vector<std::string>& in_var_names,
    const std::vector<int>& height_section,
    const std::vector<std::vector<int64_t>>& splited_ids,
    framework::Scope* scope) {
  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), "");

  auto cpu_place = platform::CPUPlace();

  for (size_t section = 0; section < in_var_names.size(); ++section) {
    // The variable is created even if there is nothing to put into it.
    auto* section_tensor =
        scope->Var(in_var_names[section])->GetMutable<framework::LoDTensor>();

    const auto& section_ids = splited_ids[section];
    if (section_ids.empty()) {
      continue;
    }

    const int64_t id_count = static_cast<int64_t>(section_ids.size());
    auto* dst = section_tensor->mutable_data<int64_t>(
        framework::make_ddim({id_count, 1}), cpu_place);
    memcpy(dst, section_ids.data(), sizeof(int64_t) * section_ids.size());
  }
}

// Scatters the per-section prefetch results back into the single output
// tensor named `out_name`.
//
// `ids_vector` lists the original ids in output-row order; an id may occur
// several times. For each section, row `i` of that section's result tensor
// (looked up in `scope` under `out_var_names[section]`) belongs to the id
// `splited_ids[section][i] + section_start`, and is copied to every output
// row at which that id appears in `ids_vector`.
//
// The output tensor is allocated as float on the place of the `id_name`
// tensor. When that place is not the CPU, rows are copied from host memory to
// the CUDA place on the stream of `actual_ctx`; without CUDA support this
// path throws. `context` is not used by this function.
//
// NOTE(review): assumes the output tensor was resized by the caller to hold
// ids_vector.size() rows of dims[1] floats — confirm against callers.
static void MergeMultipleVarsIntoOneBySection(
    const std::string& id_name, const std::vector<int64_t>& ids_vector,
    const std::string& out_name, const std::vector<std::string>& out_var_names,
    const std::vector<int>& height_section,
    const std::vector<std::vector<int64_t>>& splited_ids,
    const framework::ExecutionContext& context, framework::Scope* scope,
    platform::DeviceContext* actual_ctx) {
  PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), "");

  auto cpu_place = platform::CPUPlace();
  auto section_starts = ToAbsoluteSection(height_section);

  // id -> every row index of the output at which this id was requested.
  std::unordered_map<int64_t, std::vector<size_t>> rows_of_id;
  for (size_t row = 0; row < ids_vector.size(); ++row) {
    rows_of_id[ids_vector[row]].push_back(row);
  }

  auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
  auto* out_tensor =
      scope->FindVar(out_name)->GetMutable<framework::LoDTensor>();

  PADDLE_ENFORCE_GT(
      out_tensor->numel(), 0,
      "When calling this method, the LoDTensor's numel must larger than zero. "
      "Please check LoDTensor::Resize has been called first.");

  auto* out_tensor_data = out_tensor->mutable_data<float>(id_tensor.place());

  const bool is_on_cpu_place = platform::is_cpu_place(id_tensor.place());

  for (size_t section = 0; section < out_var_names.size(); ++section) {
    auto& ids_in_this_section = splited_ids[section];
    if (ids_in_this_section.empty()) {
      VLOG(3) << "ids in this section is empty";
      continue;
    }

    auto& section_result =
        scope->Var(out_var_names[section])->Get<framework::LoDTensor>();
    const auto* section_data = section_result.data<float>();
    auto& dims = section_result.dims();

    PADDLE_ENFORCE_EQ(dims.size(), 2, "");
    PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);

    auto row_numel = dims[1];

    for (int64_t i = 0; i < dims[0]; ++i) {
      // Translate the section-relative id back to the original id.
      auto origin_id = ids_in_this_section[i] + section_starts[section];
      const auto* src_row = section_data + i * row_numel;

      for (auto& offset : rows_of_id[origin_id]) {
        auto* dst_row = out_tensor_data + offset * row_numel;
        if (is_on_cpu_place) {
          memory::Copy(cpu_place, dst_row, cpu_place, src_row,
                       sizeof(float) * row_numel);
        } else {
#ifndef PADDLE_WITH_CUDA
          PADDLE_THROW("paddle is not compiled with CUDA!");
#else
          auto stream =
              static_cast<platform::CUDADeviceContext*>(actual_ctx)->stream();
          memory::Copy(boost::get<platform::CUDAPlace>(id_tensor.place()),
                       dst_row, cpu_place, src_row, sizeof(float) * row_numel,
                       stream);
#endif
        }
      }
    }
  }
}

// Fetches the rows selected by the ids in variable `id_name` from remote
// endpoints and assembles them into variable `out_name`.
//
// Steps:
//   1. Read the ids from `scope` (copying them to the host first when the id
//      tensor does not live on the CPU; that path requires a CUDA build).
//   2. Split the distinct ids by `height_sections` and store each section as
//      a variable "<id_name>@<endpoint>" in a temporary child scope.
//   3. For every section that NeedSend() accepts, issue an asynchronous
//      prefetch RPC to `epmap[i]` for table `table_names[i]`, with the result
//      going to "<out_name>@<endpoint>"; then wait for all RPCs.
//   4. Merge the per-section results into `out_name` and delete the child
//      scope.
//
// `table_names` must contain at least one entry per endpoint in `epmap`; this
// is enforced up front because the entries are indexed in lock-step. The
// helper functions additionally enforce that `epmap` and `height_sections`
// have the same length. The RPC client is selected by the "trainer_id"
// attribute of `context`.
void prefetch(const std::string& id_name, const std::string& out_name,
              const std::vector<std::string>& table_names,
              const std::vector<std::string>& epmap,
              const std::vector<int>& height_sections,
              const framework::ExecutionContext& context,
              const framework::Scope& scope) {
  // table_names[i] is read for endpoint i below; fail loudly instead of
  // reading past the end of the vector. Checked before the child scope is
  // created so nothing needs to be cleaned up on failure.
  PADDLE_ENFORCE(table_names.size() >= epmap.size(),
                 "table_names must contain one entry per endpoint in epmap");

  auto& local_scope = scope.NewScope();

  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto& cpu_ctx = *pool.Get(platform::CPUPlace());
  auto& actual_ctx = *pool.Get(context.GetPlace());

  distributed::RPCClient* rpc_client =
      distributed::RPCClient::GetInstance<RPCCLIENT_T>(
          context.Attr<int>("trainer_id"));

  // One input/output variable name per endpoint.
  std::vector<std::string> in_var_names;
  std::vector<std::string> out_var_names;
  in_var_names.reserve(epmap.size());
  out_var_names.reserve(epmap.size());
  for (const auto& endpoint : epmap) {
    in_var_names.push_back(id_name + "@" + endpoint);
    out_var_names.push_back(out_name + "@" + endpoint);
  }

  // Collect the requested ids on the host.
  auto& id_tensor = scope.FindVar(id_name)->Get<framework::LoDTensor>();
  std::vector<int64_t> ids_vector;
  if (platform::is_cpu_place(id_tensor.place())) {
    auto* id_data = id_tensor.data<int64_t>();
    ids_vector.assign(id_data, id_data + id_tensor.numel());
  } else {
#ifndef PADDLE_WITH_CUDA
    PADDLE_THROW("paddle is not compiled with CUDA!");
#else
    auto cpu_place = platform::CPUPlace();
    framework::LoDTensor cpu_tensor;
    auto* cpu_tensor_data =
        cpu_tensor.mutable_data<int64_t>(id_tensor.dims(), cpu_place);
    auto stream =
        static_cast<platform::CUDADeviceContext*>(&actual_ctx)->stream();
    memory::Copy(cpu_place, cpu_tensor_data,
                 boost::get<platform::CUDAPlace>(id_tensor.place()),
                 id_tensor.data<int64_t>(), sizeof(int64_t) * id_tensor.numel(),
                 stream);
    ids_vector.assign(cpu_tensor_data, cpu_tensor_data + cpu_tensor.numel());
#endif
  }

  auto splited_ids = SplitIds(ids_vector, height_sections, &local_scope);
  SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids,
                                    &local_scope);

  // create output var in local scope
  for (auto& name : out_var_names) {
    local_scope.Var(name)->GetMutable<framework::LoDTensor>();
  }

  std::vector<distributed::VarHandlePtr> rets;
  for (size_t i = 0; i < in_var_names.size(); i++) {
    if (NeedSend(local_scope, in_var_names[i])) {
      VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i]
              << " to get " << out_var_names[i] << " back";
      rets.push_back(rpc_client->AsyncPrefetchVar(
          epmap[i], cpu_ctx, local_scope, in_var_names[i], out_var_names[i],
          table_names[i]));
    } else {
      VLOG(3) << "don't send no-initialied variable: " << out_var_names[i];
    }
  }

  // Block until every outstanding prefetch has completed.
  for (size_t i = 0; i < rets.size(); i++) {
    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
  }

  MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name,
                                    out_var_names, height_sections, splited_ids,
                                    context, &local_scope, &actual_ctx);
  scope.DeleteScope(&local_scope);
}

};  // namespace distributed
};  // namespace operators
};  // namespace paddle
add parameter prefetch 6 years ago			`// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`

			`#include <set>`
			`#include <string>`
			`#include <vector>`

			`#include "paddle/fluid/operators/distributed/parameter_prefetch.h"`

			`#include "paddle/fluid/framework/lod_tensor.h"`
			`#include "paddle/fluid/framework/scope.h"`
			`#include "paddle/fluid/framework/selected_rows.h"`
			`#include "paddle/fluid/framework/tensor.h"`

Refactor distributed RPC (#15075) * wip * wip * refactor no.1 dir structure test=develop * fix linking test=develop * fix includes test=develop * fix build test=develop * fix build test=develop 6 years ago			`#include "paddle/fluid/operators/distributed/distributed.h"`
add parameter prefetch 6 years ago			`#include "paddle/fluid/operators/distributed/rpc_client.h"`
			`#include "paddle/fluid/operators/distributed/variable_response.h"`
			`#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"`

			`namespace paddle {`
			`namespace operators {`
			`namespace distributed {`

test=develop, remove sparse bias and add prefetch and related tests 6 years ago			`using LoDTensor = framework::LoDTensor;`
add parameter prefetch 6 years ago			`using LoDTensor = framework::LoDTensor;`
			`using SelectedRows = framework::SelectedRows;`
			`using DDim = framework::DDim;`

clean code 6 years ago			`static size_t GetSectionIndex(int64_t id,`
add parameter prefetch 6 years ago			`const std::vector<int64_t>& abs_sections) {`
			`for (size_t i = 1; i < abs_sections.size(); ++i) {`
			`if (id < abs_sections[i]) {`
			`return i - 1;`
			`}`
			`}`
			`return abs_sections.size() - 1;`
			`}`

clean code 6 years ago			`static std::vector<int64_t> ToAbsoluteSection(`
unit test ready 6 years ago			`const std::vector<int>& height_sections) {`
add parameter prefetch 6 years ago			`std::vector<int64_t> abs_sections;`
			`abs_sections.resize(height_sections.size());`
			`abs_sections[0] = 0;`
			`for (size_t i = 1; i < height_sections.size(); ++i) {`
			`abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];`
			`}`
			`return abs_sections;`
			`}`

clean code 6 years ago			`static std::vector<std::vector<int64_t>> SplitIds(`
prefetch support gpu test=develop 6 years ago			`const std::vector<int64_t>& ids_vector,`
			`const std::vector<int>& height_section, framework::Scope* scope) {`
add parameter prefetch 6 years ago			`std::set<int64_t> all_ids;`
prefetch support gpu test=develop 6 years ago			`for (auto id : ids_vector) {`
			`all_ids.insert(id);`
add parameter prefetch 6 years ago			`}`
prefetch support gpu test=develop 6 years ago
add parameter prefetch 6 years ago			`auto abs_sections = ToAbsoluteSection(height_section);`
			`std::vector<std::vector<int64_t>> splited_ids;`
			`splited_ids.resize(height_section.size() + 1);`
			`for (auto& id : all_ids) {`
			`auto section_index = GetSectionIndex(id, abs_sections);`
			`splited_ids[section_index].push_back(id - abs_sections[section_index]);`
			`}`
			`return splited_ids;`
			`}`

clean code 6 years ago			`static void SplitIdsIntoMultipleVarsBySection(`
prefetch support gpu test=develop 6 years ago			`const std::vector<std::string>& in_var_names,`
unit test ready 6 years ago			`const std::vector<int>& height_section,`
add parameter prefetch 6 years ago			`const std::vector<std::vector<int64_t>>& splited_ids,`
			`framework::Scope* scope) {`
fix pserver and prefetch rpc 6 years ago			`PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), "");`
add parameter prefetch 6 years ago
			`auto place = platform::CPUPlace();`

			`for (size_t i = 0; i < in_var_names.size(); ++i) {`
			`auto* id_tensor =`
			`scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();`
			`auto& ids = splited_ids[i];`
			`if (!ids.empty()) {`
			`auto* id_tensor_data = id_tensor->mutable_data<int64_t>(`
			`framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);`
			`memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size());`
			`}`
			`}`
			`}`

clean code 6 years ago			`static void MergeMultipleVarsIntoOneBySection(`
prefetch support gpu test=develop 6 years ago			`const std::string& id_name, const std::vector<int64_t>& ids_vector,`
			`const std::string& out_name, const std::vector<std::string>& out_var_names,`
unit test ready 6 years ago			`const std::vector<int>& height_section,`
add parameter prefetch 6 years ago			`const std::vector<std::vector<int64_t>>& splited_ids,`
lookup_table gpu kernel support prefetch test=develop 6 years ago			`const framework::ExecutionContext& context, framework::Scope* scope,`
			`platform::DeviceContext* actual_ctx) {`
can run 6 years ago			`PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), "");`
add parameter prefetch 6 years ago
			`auto cpu_place = platform::CPUPlace();`

			`auto abs_sections = ToAbsoluteSection(height_section);`
			`std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;`
prefetch support gpu test=develop 6 years ago			`for (size_t i = 0; i < ids_vector.size(); ++i) {`
			`id_to_offset[ids_vector[i]].push_back(i);`
add parameter prefetch 6 years ago			`}`

prefetch support gpu test=develop 6 years ago			`auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();`
change Var to FindVar 6 years ago			`auto* out_tensor =`
			`scope->FindVar(out_name)->GetMutable<framework::LoDTensor>();`
fix numel nce and prefetch 6 years ago
			`PADDLE_ENFORCE_GT(`
			`out_tensor->numel(), 0,`
test=develop, remove sparse bias and add prefetch and related tests 6 years ago			`"When calling this method, the LoDTensor's numel must larger than zero. "`
			`"Please check LoDTensor::Resize has been called first.");`
fix numel nce and prefetch 6 years ago
prefetch support gpu test=develop 6 years ago			`auto* out_tensor_data = out_tensor->mutable_data<float>(id_tensor.place());`

			`bool is_on_cpu_place = true;`
			`if (!platform::is_cpu_place(id_tensor.place())) {`
			`is_on_cpu_place = false;`
			`}`
add parameter prefetch 6 years ago
			`for (size_t section_idx = 0; section_idx < out_var_names.size();`
			`++section_idx) {`
			`auto& ids_in_this_section = splited_ids[section_idx];`
fix some bugs 6 years ago			`if (!ids_in_this_section.empty()) {`
			`auto& prefetch_out_var =`
			`scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();`
			`const auto* out_var_data = prefetch_out_var.data<float>();`
			`auto& dims = prefetch_out_var.dims();`

			`PADDLE_ENFORCE_EQ(dims.size(), 2, "");`
			`PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);`

			`auto row_numel = dims[1];`

test=develop, remove sparse bias and add prefetch and related tests 6 years ago			`for (int64_t i = 0; i < dims[0]; ++i) {`
fix some bugs 6 years ago			`auto id = ids_in_this_section[i];`
			`auto origin_id = id + abs_sections[section_idx];`
			`auto& offsets = id_to_offset[origin_id];`
			`for (auto& offset : offsets) {`
			`// should support GPU tensor`
prefetch support gpu test=develop 6 years ago			`if (is_on_cpu_place) {`
			`memory::Copy(cpu_place, out_tensor_data + offset * row_numel,`
			`cpu_place, out_var_data + i * row_numel,`
			`sizeof(float) * row_numel);`
			`} else {`
			`#ifndef PADDLE_WITH_CUDA`
			`PADDLE_THROW("paddle is not compiled with CUDA!");`
			`#else`
lookup_table gpu kernel support prefetch test=develop 6 years ago			`auto stream =`
			`static_cast<platform::CUDADeviceContext*>(actual_ctx)->stream();`
prefetch support gpu test=develop 6 years ago			`memory::Copy(boost::get<platform::CUDAPlace>(id_tensor.place()),`
			`out_tensor_data + offset * row_numel, cpu_place,`
			`out_var_data + i * row_numel,`
lookup_table gpu kernel support prefetch test=develop 6 years ago			`sizeof(float) * row_numel, stream);`
prefetch support gpu test=develop 6 years ago			`#endif`
			`}`
fix some bugs 6 years ago			`}`
add parameter prefetch 6 years ago			`}`
fix some bugs 6 years ago			`} else {`
update log level in parameter prefetch test=develop 6 years ago			`VLOG(3) << "ids in this section is empty";`
add parameter prefetch 6 years ago			`}`
			`}`
			`}`

			`void prefetch(const std::string& id_name, const std::string& out_name,`
support none sliced variable 6 years ago			`const std::vector<std::string>& table_names,`
add parameter prefetch 6 years ago			`const std::vector<std::string>& epmap,`
unit test ready 6 years ago			`const std::vector<int>& height_sections,`
add scope in prefetch 6 years ago			`const framework::ExecutionContext& context,`
			`const framework::Scope& scope) {`
fix scope in nce and prefetch 6 years ago			`auto& local_scope = scope.NewScope();`
add parameter prefetch 6 years ago
			`platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();`
prefetch support gpu test=develop 6 years ago			`auto& cpu_ctx = *pool.Get(platform::CPUPlace());`
lookup_table gpu kernel support prefetch test=develop 6 years ago			`auto& actual_ctx = *pool.Get(context.GetPlace());`
add parameter prefetch 6 years ago
			`distributed::RPCClient* rpc_client =`
			`distributed::RPCClient::GetInstance<RPCCLIENT_T>(`
			`context.Attr<int>("trainer_id"));`

			`std::vector<std::string> in_var_names;`
			`std::vector<std::string> out_var_names;`
			`for (size_t i = 0; i < epmap.size(); ++i) {`
			`in_var_names.push_back(id_name + "@" + epmap[i]);`
			`out_var_names.push_back(out_name + "@" + epmap[i]);`
			`}`

add scope in prefetch 6 years ago			`auto& id_tensor = scope.FindVar(id_name)->Get<framework::LoDTensor>();`
prefetch support gpu test=develop 6 years ago			`std::vector<int64_t> ids_vector;`
			`if (platform::is_cpu_place(id_tensor.place())) {`
			`auto* id_data = id_tensor.data<int64_t>();`
test=develop, remove sparse bias and add prefetch and related tests 6 years ago			`for (int64_t i = 0; i < id_tensor.numel(); ++i) {`
prefetch support gpu test=develop 6 years ago			`ids_vector.push_back(id_data[i]);`
			`}`
			`} else {`
			`#ifndef PADDLE_WITH_CUDA`
			`PADDLE_THROW("paddle is not compiled with CUDA!");`
			`#else`
			`auto cpu_place = platform::CPUPlace();`
test=develop, remove sparse bias and add prefetch and related tests 6 years ago			`framework::LoDTensor cpu_tensor;`
prefetch support gpu test=develop 6 years ago			`auto* cpu_tensor_data =`
			`cpu_tensor.mutable_data<int64_t>(id_tensor.dims(), cpu_place);`
lookup_table gpu kernel support prefetch test=develop 6 years ago			`auto stream =`
			`static_cast<platform::CUDADeviceContext*>(&actual_ctx)->stream();`
prefetch support gpu test=develop 6 years ago			`memory::Copy(cpu_place, cpu_tensor_data,`
			`boost::get<platform::CUDAPlace>(id_tensor.place()),`
lookup_table gpu kernel support prefetch test=develop 6 years ago			`id_tensor.data<int64_t>(), sizeof(int64_t) * id_tensor.numel(),`
			`stream);`
			`for (size_t i = 0; i < cpu_tensor.numel(); ++i) {`
prefetch support gpu test=develop 6 years ago			`ids_vector.push_back(cpu_tensor_data[i]);`
			`}`
			`#endif`
			`}`

			`auto splited_ids = SplitIds(ids_vector, height_sections, &local_scope);`
			`SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids,`
			`&local_scope);`
add parameter prefetch 6 years ago
			`// create output var in local scope`
			`for (auto& name : out_var_names) {`
			`local_scope.Var(name)->GetMutable<framework::LoDTensor>();`
			`}`

			`std::vector<distributed::VarHandlePtr> rets;`
			`for (size_t i = 0; i < in_var_names.size(); i++) {`
			`if (NeedSend(local_scope, in_var_names[i])) {`
update log level in parameter prefetch test=develop 6 years ago			`VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i]`
fix code format test=develop 6 years ago			`<< " to get " << out_var_names[i] << " back";`
add parameter prefetch 6 years ago			`rets.push_back(rpc_client->AsyncPrefetchVar(`
prefetch support gpu test=develop 6 years ago			`epmap[i], cpu_ctx, local_scope, in_var_names[i], out_var_names[i],`
support none sliced variable 6 years ago			`table_names[i]));`
add parameter prefetch 6 years ago			`} else {`
update log level in parameter prefetch test=develop 6 years ago			`VLOG(3) << "don't send no-initialied variable: " << out_var_names[i];`
add parameter prefetch 6 years ago			`}`
			`}`
fix some bugs 6 years ago
add parameter prefetch 6 years ago			`for (size_t i = 0; i < rets.size(); i++) {`
			`PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");`
			`}`

prefetch support gpu test=develop 6 years ago			`MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name,`
			`out_var_names, height_sections, splited_ids,`
lookup_table gpu kernel support prefetch test=develop 6 years ago			`context, &local_scope, &actual_ctx);`
fix scope in nce and prefetch 6 years ago			`scope.DeleteScope(&local_scope);`
add parameter prefetch 6 years ago			`}`

			`}; // namespace distributed`
			`}; // namespace operators`
			`}; // namespace paddle`