@@ -131,219 +131,11 @@ void PSGPUTrainer::InitOtherEnv(const ProgramDesc& main_program) {
}

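// Run() first builds the GpuPs task for sparse table 0 with feature dim 8,
// then launches one DeviceWorker::TrainFiles thread per configured place.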
void PSGPUTrainer::Run() {
  BuildGPUPSTask(0, 8);
  for (size_t thidx = 0; thidx < places_.size(); ++thidx) {
    threads_.push_back(
        std::thread(&DeviceWorker::TrainFiles, workers_[thidx].get()));
  }
}

void PSGPUTrainer::BuildGPUPSTask(int table_id, int feadim) {
  VLOG(3) << "PSGPUTrainer::BuildGPUPSTask begin";
  platform::Timer timeline;
  timeline.Start();
  MultiSlotDataset* dataset = dynamic_cast<MultiSlotDataset*>(dataset_);
  auto fleet_ptr = FleetWrapper::GetInstance();
  std::shared_ptr<HeterContext> heter_context =
      std::make_shared<HeterContext>();
  auto& multi_output_channel = dataset->GetCurOutputChannel();
  auto& input_channel = dataset->GetInputChannelRef();
  int gen_shard_num = multi_output_channel.size();
  int device_num = places_.size();
  auto gpu_ps_wrapper = PSGPUWrapper::GetInstance();
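  // The HeterContext carries one vector of feature keys, one vector of
  // FeatureValue entries and one vector of pslib value pointers per device.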
  auto& local_keys = heter_context->feature_keys_;
  local_keys.resize(device_num);
  auto& local_values = heter_context->feature_values_;
  local_values.resize(device_num);
  auto& local_ptr = heter_context->value_ptr_;
  local_ptr.resize(device_num);
  for (auto& ks : local_keys) {
    ks.reserve(100000);
  }
  // read thread
  std::vector<std::thread> threads(gen_shard_num);
  std::vector<std::shared_ptr<ThreadPool>> consume_task_pool(device_num);
  for (size_t i = 0; i < consume_task_pool.size(); i++) {
    consume_task_pool[i].reset(new ::ThreadPool(1));
  }
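  // consume_func appends one shard's keys to that device's key list. Each
  // shard is served by its own single-thread pool, so appends for a given
  // device are serialized and need no extra locking.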
  auto consume_func = [&local_keys](int shard_id, int feadim,
                                    std::vector<uint64_t>& keys) {
    local_keys[shard_id].insert(local_keys[shard_id].end(), keys.begin(),
                                keys.end());
  };

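  // If the input channel has already been drained, the current pass lives in
  // the dataset's output channels and keys are collected channel by channel;
  // otherwise the raw input channel is split into gen_shard_num slices.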
  if (input_channel->Size() == 0) {
    // output_channel_ should hold one pass instances now
    uint64_t output_channels_data_size = 0;
    for (size_t i = 0; i < multi_output_channel.size(); i++) {
      int cur_channel_size = multi_output_channel[i]->Size();
      output_channels_data_size += cur_channel_size;
    }
    CHECK(output_channels_data_size > 0);
    for (auto& ks : local_keys) {
      ks.reserve(output_channels_data_size * 10);  // magic number
    }
    auto gen_func = [&dataset, &device_num, &feadim, &consume_task_pool,
                     &multi_output_channel, &consume_func](int i) {
      const std::deque<Record>& vec_data = multi_output_channel[i]->GetData();
      std::vector<std::vector<uint64_t>> task_keys(device_num);
      std::vector<std::future<void>> task_futures;
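      // Shard every uint64 feasign by `feasign % device_num` so that each
      // device receives a disjoint slice of the key space.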
      for (size_t j = 0; j < vec_data.size(); j++) {
        for (auto& feature : vec_data[j].uint64_feasigns_) {
          int shard = feature.sign().uint64_feasign_ % device_num;
          task_keys[shard].push_back(feature.sign().uint64_feasign_);
        }
      }

      for (int shard_id = 0; shard_id < device_num; shard_id++) {
        task_futures.emplace_back(consume_task_pool[shard_id]->enqueue(
            consume_func, shard_id, feadim, task_keys[shard_id]));
      }
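
      // Wait for the per-shard appends to finish, then release task_keys
      // eagerly via the swap-with-empty-vector idiom.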
      for (auto& tf : task_futures) {
        tf.wait();
      }
      for (auto& tk : task_keys) {
        tk.clear();
        std::vector<uint64_t>().swap(tk);
      }
      task_keys.clear();
      std::vector<std::vector<uint64_t>>().swap(task_keys);
    };
    for (size_t i = 0; i < threads.size(); i++) {
      threads[i] = std::thread(gen_func, i);
    }
    for (std::thread& t : threads) {
      t.join();
    }
  } else {
    int input_channel_size = input_channel->Size();
    CHECK(input_channel_size > 0);
    CHECK(gen_shard_num > 0);
    for (auto& ks : local_keys) {
      ks.reserve(input_channel_size * 10);  // magic number
    }
    const std::deque<Record>& vec_data = input_channel->GetData();
    auto gen_func = [&dataset, &vec_data, &device_num, &gen_shard_num,
                     &input_channel_size, &feadim, &consume_task_pool,
                     multi_output_channel, &consume_func](int i) {
      std::vector<std::vector<uint64_t>> task_keys(device_num);
      std::vector<std::future<void>> task_futures;
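      // Each generator thread scans a contiguous slice of roughly
      // per_shard_num records; the last slice is clamped to the data size.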
      size_t per_shard_num = input_channel_size / gen_shard_num + 1;
      size_t total_size = vec_data.size();
      size_t start_index = i * per_shard_num;
      size_t end_index =
          std::min(start_index + per_shard_num - 1, total_size - 1);
      for (size_t j = start_index; j <= end_index; j++) {
        for (auto& feature : vec_data[j].uint64_feasigns_) {
          int shard = feature.sign().uint64_feasign_ % device_num;
          task_keys[shard].push_back(feature.sign().uint64_feasign_);
        }
      }

      for (int shard_id = 0; shard_id < device_num; shard_id++) {
        task_futures.emplace_back(consume_task_pool[shard_id]->enqueue(
            consume_func, shard_id, feadim, task_keys[shard_id]));
      }

      for (auto& tf : task_futures) {
        tf.wait();
      }
      for (auto& tk : task_keys) {
        tk.clear();
        std::vector<uint64_t>().swap(tk);
      }
      task_keys.clear();
      std::vector<std::vector<uint64_t>>().swap(task_keys);
    };
    for (size_t i = 0; i < threads.size(); i++) {
      threads[i] = std::thread(gen_func, i);
    }
    for (std::thread& t : threads) {
      t.join();
    }
  }
  timeline.Pause();
  VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds.";
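  // Deduplicate each device's keys in place (sort + unique). Note that
  // unique_func and ptl_func below are launched once per entry of `threads`
  // (gen_shard_num of them) yet index per-device containers, which relies on
  // gen_shard_num matching device_num.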
  timeline.Start();
  auto unique_func = [&local_keys](int i) {
    auto& cur_keys = local_keys[i];
    std::sort(cur_keys.begin(), cur_keys.end());
    cur_keys.erase(std::unique(cur_keys.begin(), cur_keys.end()),
                   cur_keys.end());
  };
  for (size_t i = 0; i < threads.size(); i++) {
    threads[i] = std::thread(unique_func, i);
  }
  for (std::thread& t : threads) {
    t.join();
  }
  timeline.Pause();

  VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds.";

  timeline.Start();
  for (size_t i = 0; i < consume_task_pool.size(); i++) {
    consume_task_pool[i].reset();
  }
  consume_task_pool.clear();

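  // Size each device's value buffers to its deduplicated key count.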
  for (int i = 0; i < device_num; i++) {
    local_values[i].resize(local_keys[i].size());
    local_ptr[i].resize(local_keys[i].size());
  }

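  // ptl_func pulls the raw pslib value pointers for one device's keys from
  // the parameter server and converts each entry into the FeatureValue layout
  // consumed by PSGPUWrapper.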
  auto ptl_func = [this, &local_keys, &local_values, &local_ptr, &table_id,
                   &fleet_ptr](int i) {
    size_t key_size = local_keys[i].size();
    auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr(
        (char**)(local_ptr[i].data()), table_id, local_keys[i].data(),
        key_size);
    tt.wait();
    auto status = tt.get();
    // auto status = 0;
    if (status != 0) {
      LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]";
      sleep(300);
      exit(-1);
    } else {
      VLOG(3) << "FleetWrapper Pull sparse to local done with table size: "
              << local_keys[i].size();
    }
    for (size_t num = 0; num < local_ptr[i].size(); ++num) {
      float* ptr_val = local_ptr[i][num]->data();
      FeatureValue& val = local_values[i][num];
      size_t dim = local_ptr[i][num]->size();
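
      // Dense fields of the pulled value: indices 1..6 hold delta_score,
      // show, clk, lr, lr_g2sum and slot; indices 7 and above, when present,
      // hold the embedding copied into val.mf below.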
      val.delta_score = ptr_val[1];
      val.show = ptr_val[2];
      val.clk = ptr_val[3];
      val.slot = ptr_val[6];
      val.lr = ptr_val[4];
      val.lr_g2sum = ptr_val[5];

      if (dim > 7) {
        val.mf_size = MF_DIM + 1;
        for (int x = 0; x < val.mf_size; x++) {
          val.mf[x] = ptr_val[x + 7];
        }
      } else {
        val.mf_size = 0;
        for (int x = 0; x < MF_DIM + 1; x++) {
          val.mf[x] = 0;
        }
      }
    }
  };
  for (size_t i = 0; i < threads.size(); i++) {
    threads[i] = std::thread(ptl_func, i);
  }
  for (std::thread& t : threads) {
    t.join();
  }
  timeline.Pause();
  VLOG(0) << "GpuPs pull sparse cost " << timeline.ElapsedSec() << " seconds.";
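  // Hand the populated HeterContext over to PSGPUWrapper, which builds the
  // GPU-side tables for this sparse table.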
  gpu_ps_wrapper->BuildGPUPS(table_id, feadim, heter_context);
}

Scope* PSGPUTrainer::GetWorkerScope(int thread_id) { return nullptr; }