Paddle/paddle/fluid/framework/multi_trainer.cc

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <string>
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h"

#if defined PADDLE_WITH_PSCORE
#include "paddle/fluid/distributed/service/communicator.h"
#endif

namespace paddle {
namespace framework {

void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
                              Dataset* dataset) {
  thread_num_ = trainer_desc.thread_num();
  SetDataset(dataset);

  ParseDumpConfig(trainer_desc);
  mpi_rank_ = trainer_desc.mpi_rank();
  mpi_size_ = trainer_desc.mpi_size();
  dump_file_num_ = trainer_desc.dump_file_num();

  for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
       i++) {
    need_merge_var_names_.push_back(
        trainer_desc.downpour_param().stat_var_names(i));
  }
  // get filelist from trainer_desc here
  const std::vector<paddle::framework::DataFeed*> readers =
      dataset->GetReaders();
  VLOG(3) << "readers num: " << readers.size();
  // change thread num to readers num
  thread_num_ = readers.size();
  VLOG(3) << "worker thread num: " << thread_num_;
  workers_.resize(thread_num_);

#if defined PADDLE_WITH_PSCORE
  if (trainer_desc.thread_barrier()) {
    paddle::distributed::Communicator::GetInstance()->BarrierTriggerReset(
        thread_num_);
  }
#endif

  for (int i = 0; i < thread_num_; ++i) {
    workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
        trainer_desc.device_worker_name());
    workers_[i]->SetNeedDumpField(need_dump_field_);
    workers_[i]->SetNeedDumpParam(need_dump_param_);
    workers_[i]->SetDumpFieldVector(dump_fields_);
    workers_[i]->SetDumpParamVector(dump_param_);
    workers_[i]->InitRandomDumpConfig(trainer_desc);
    workers_[i]->Initialize(trainer_desc);
    workers_[i]->SetDeviceIndex(i);
    workers_[i]->SetDataFeed(readers[i]);
  }

  // set debug here
  SetDebug(trainer_desc.debug());
}

// Returns the dump file path for index `tid` under dump_fields_path_.
//
// The file is named "part-<middle>-<tid>", where <middle> is the user-defined
// dump file name when one is set, and the zero-padded MPI rank otherwise.
std::string MultiTrainer::GetDumpPath(int tid) {
  const char* dump_dir = dump_fields_path_.c_str();
  if (user_define_dump_filename_.empty()) {
    return string::format_string("%s/part-%03d-%05d", dump_dir, mpi_rank_,
                                 tid);
  }
  return string::format_string("%s/part-%s-%05d", dump_dir,
                               user_define_dump_filename_.c_str(), tid);
}

// Sets up the dump pipeline: creates the string channel, registers it as the
// channel writer of every worker, and starts the dump threads, each running
// TrainerBase::DumpWork with its own index.
//
// The number of dump threads on this rank is this rank's share of
// dump_file_num_ spread across mpi_size_ ranks: every rank gets
// dump_file_num_ / mpi_size_, and ranks whose mpi_rank_ is below the
// remainder get one more. If dump_file_num_ does not exceed mpi_size_, a
// single dump thread is used.
void MultiTrainer::InitDumpEnv() {
  queue_ = paddle::framework::MakeChannel<std::string>();
  for (int i = 0; i < thread_num_; ++i) {
    workers_[i]->SetChannelWriter(queue_.get());
  }

  dump_thread_num_ = 1;
  // Only split the files when mpi_size_ is positive. A zero size would make
  // the division and modulo below undefined behaviour, and a negative size
  // would produce a non-positive thread count, so no dump thread would be
  // started at all. In both cases fall back to the single-thread default.
  if (mpi_size_ > 0 && dump_file_num_ > mpi_size_) {
    dump_thread_num_ = dump_file_num_ / mpi_size_;
    if (dump_file_num_ % mpi_size_ > mpi_rank_) {
      dump_thread_num_ += 1;
    }
  }

  for (int i = 0; i < dump_thread_num_; i++) {
    dump_thread_.push_back(
        std::thread(std::bind(&TrainerBase::DumpWork, this, i)));
  }
}

// Prepares every worker to execute `main_program` on `place`.
// Must be called only after all resources of this trainer have been set
// (the workers and root_scope_ are used here).
void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program,
                                  const platform::Place& place) {
  for (int worker_idx = 0; worker_idx < thread_num_; ++worker_idx) {
    const auto& worker = workers_[worker_idx];
    // The same place is used for computation and for the reader.
    worker->SetPlace(place);
    worker->SetReaderPlace(place);
    worker->SetRootScope(root_scope_);
    // Build per-worker resources from the program, then bind the data feed
    // memory and cache the program, in this order.
    worker->CreateDeviceResource(main_program);
    worker->BindingDataFeedMemory();
    worker->CacheProgram(main_program);
  }
}

// Initializes the dump environment when field or parameter dumping is
// enabled. `main_program` is not used by this implementation.
void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) {
  const bool dump_enabled = need_dump_field_ || need_dump_param_;
  if (dump_enabled) {
    InitDumpEnv();
  }
  VLOG(3) << "init other env done.";
}

// Returns the thread scope of the worker at index `thread_id`.
// No bounds check is performed on `thread_id`.
Scope* MultiTrainer::GetWorkerScope(int thread_id) {
  const auto& worker = workers_[thread_id];
  return worker->GetThreadScope();
}

void MultiTrainer::Run() {
  VLOG(3) << "Going to run";
  for (int thidx = 0; thidx < thread_num_; ++thidx) {
    if (!debug_) {
      threads_.push_back(
          std::thread(&DeviceWorker::TrainFiles, workers_[thidx].get()));
    } else {
      threads_.push_back(std::thread(&DeviceWorker::TrainFilesWithProfiler,
                                     workers_[thidx].get()));
    }
  }
  for (auto& th : threads_) {
    th.join();
  }
}

// Tears down the trainer: finalizes the dump environment if field or
// parameter dumping was enabled, then drops the child scopes of root_scope_.
void MultiTrainer::Finalize() {
  const bool dump_enabled = need_dump_field_ || need_dump_param_;
  if (dump_enabled) {
    FinalizeDumpEnv();
  }
  root_scope_->DropKids();
}

}  // end namespace framework
}  // end namespace paddle
add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.`

			`Licensed under the Apache License, Version 2.0 (the "License");`
			`you may not use this file except in compliance with the License.`
			`You may obtain a copy of the License at`

			`http://www.apache.org/licenses/LICENSE-2.0`

			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License. */`

			`#include <string>`
			`#include "paddle/fluid/framework/device_worker_factory.h"`
			`#include "paddle/fluid/framework/trainer.h"`
[Feature] one ps (3/4) (#29604) * oneps (3/4) Co-authored-by: MrChengmo <cmchengmo@163.com> Co-authored-by: malin10 <malin10@baidu.com> Co-authored-by: chengmo <chengmo@baidu.com> 4 years ago
Fix/distributed proto (#29981) * rename sendrecv.proto to namespace paddle.distributed * split ps with distributed 4 years ago			`#if defined PADDLE_WITH_PSCORE`
[Feature] one ps (3/4) (#29604) * oneps (3/4) Co-authored-by: MrChengmo <cmchengmo@163.com> Co-authored-by: malin10 <malin10@baidu.com> Co-authored-by: chengmo <chengmo@baidu.com> 4 years ago			`#include "paddle/fluid/distributed/service/communicator.h"`
			`#endif`
add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago
			`namespace paddle {`
			`namespace framework {`

add run from dataset in executor. 6 years ago			`void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,`
modify c++ and python dataset related code & fix bug 6 years ago			`Dataset* dataset) {`
add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`thread_num_ = trainer_desc.thread_num();`
support dumping params/grads in transpiler mode (#22490) 5 years ago			`SetDataset(dataset);`

Random Dump (#24477) * Refactor code for dump_field & dump_param: abstracting the common function in base class. * Support dump randomly & random with lineid * Support specify the random interval, which avoids printing too much logs. 5 years ago			`ParseDumpConfig(trainer_desc);`
support dumping params/grads in transpiler mode (#22490) 5 years ago			`mpi_rank_ = trainer_desc.mpi_rank();`
			`mpi_size_ = trainer_desc.mpi_size();`
			`dump_file_num_ = trainer_desc.dump_file_num();`

add thread scope stat accurate metrics test=develop (#19480) * add thread scope stat accurate metrics test=develop * fix style * fix style * fix style * fix style test=develop * fix style test=develop * fix style test=develop * fix style test=develop * fix style test=develop * fix style test=develop * fix style test=develop * fix conflict * fix style * fix style test=develop * fix error test=develop * fix error test=develop 6 years ago			`for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();`
			`i++) {`
			`need_merge_var_names_.push_back(`
			`trainer_desc.downpour_param().stat_var_names(i));`
			`}`
add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`// get filelist from trainer_desc here`
dataset (#17973) (1) use channel instead of vector/BlockingQueue in Dataset，to keep same with existing implementation, and make code more readable and flexible (dataset single output channel or multi output channel). one previous memory out of limit problem is cause by not release memory after training. (2) add Record because MultiSlotType costs too much memory (80B)，fix memory out of limit problem. (3) add Channel, Archive in paddle/fluid/framework (4) change dataset from shared_ptr to unique_ptr in pybind (5) move create/destroy readers from trainer to dataset (6) move shuffle from datafeed to dataset. dataset holds memory, datafeed is only for load data and feed data to network. (7) fix thread num bug of Dataset when filelist size < thread num (8) support set_queue_num in InMemoryDataset 6 years ago			`const std::vector<paddle::framework::DataFeed*> readers =`
make Dataset* as an argument 6 years ago			`dataset->GetReaders();`
fix data reading bugs in api, add VLOG(3) log for setup 6 years ago			`VLOG(3) << "readers num: " << readers.size();`
support multi dataset && add init model && fix bug 6 years ago			`// change thread num to readers num`
			`thread_num_ = readers.size();`
			`VLOG(3) << "worker thread num: " << thread_num_;`
			`workers_.resize(thread_num_);`
integrated HALF_ASYNC to communicator (#21869) * add half_async in the communicator * fix DistributedStrategy 5 years ago
Fix/distributed proto (#29981) * rename sendrecv.proto to namespace paddle.distributed * split ps with distributed 4 years ago			`#if defined PADDLE_WITH_PSCORE`
integrated HALF_ASYNC to communicator (#21869) * add half_async in the communicator * fix DistributedStrategy 5 years ago			`if (trainer_desc.thread_barrier()) {`
[Feature] one ps (3/4) (#29604) * oneps (3/4) Co-authored-by: MrChengmo <cmchengmo@163.com> Co-authored-by: malin10 <malin10@baidu.com> Co-authored-by: chengmo <chengmo@baidu.com> 4 years ago			`paddle::distributed::Communicator::GetInstance()->BarrierTriggerReset(`
integrated HALF_ASYNC to communicator (#21869) * add half_async in the communicator * fix DistributedStrategy 5 years ago			`thread_num_);`
			`}`
			`#endif`

add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`for (int i = 0; i < thread_num_; ++i) {`
			`workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(`
			`trainer_desc.device_worker_name());`
Random Dump (#24477) * Refactor code for dump_field & dump_param: abstracting the common function in base class. * Support dump randomly & random with lineid * Support specify the random interval, which avoids printing too much logs. 5 years ago			`workers_[i]->SetNeedDumpField(need_dump_field_);`
			`workers_[i]->SetNeedDumpParam(need_dump_param_);`
			`workers_[i]->SetDumpFieldVector(dump_fields_);`
			`workers_[i]->SetDumpParamVector(dump_param_);`
			`workers_[i]->InitRandomDumpConfig(trainer_desc);`
refine print fetch list 6 years ago			`workers_[i]->Initialize(trainer_desc);`
add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`workers_[i]->SetDeviceIndex(i);`
make Dataset* as an argument 6 years ago			`workers_[i]->SetDataFeed(readers[i]);`
add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`}`
make Dataset* as an argument 6 years ago
			`// set debug here`
add comment for MPI Symetric role maker test=develop 6 years ago			`SetDebug(trainer_desc.debug());`
add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`}`

Random Dump (#24477) * Refactor code for dump_field & dump_param: abstracting the common function in base class. * Support dump randomly & random with lineid * Support specify the random interval, which avoids printing too much logs. 5 years ago			`std::string MultiTrainer::GetDumpPath(int tid) {`
add user_define_dump (#28596) 4 years ago			`if (user_define_dump_filename_ != "") {`
			`return string::format_string("%s/part-%s-%05d", dump_fields_path_.c_str(),`
			`user_define_dump_filename_.c_str(), tid);`
			`}`
Random Dump (#24477) * Refactor code for dump_field & dump_param: abstracting the common function in base class. * Support dump randomly & random with lineid * Support specify the random interval, which avoids printing too much logs. 5 years ago			`return string::format_string("%s/part-%03d-%05d", dump_fields_path_.c_str(),`
			`mpi_rank_, tid);`
support dumping params/grads in transpiler mode (#22490) 5 years ago			`}`

			`void MultiTrainer::InitDumpEnv() {`
			`queue_ = paddle::framework::MakeChannel<std::string>();`
			`for (int i = 0; i < thread_num_; ++i) {`
			`workers_[i]->SetChannelWriter(queue_.get());`
			`}`
			`dump_thread_num_ = 1;`
			`if (dump_file_num_ > mpi_size_) {`
			`dump_thread_num_ = dump_file_num_ / mpi_size_;`
			`if (dump_file_num_ % mpi_size_ > mpi_rank_) {`
			`dump_thread_num_ += 1;`
			`}`
			`}`
			`for (int i = 0; i < dump_thread_num_; i++) {`
			`dump_thread_.push_back(`
Random Dump (#24477) * Refactor code for dump_field & dump_param: abstracting the common function in base class. * Support dump randomly & random with lineid * Support specify the random interval, which avoids printing too much logs. 5 years ago			`std::thread(std::bind(&TrainerBase::DumpWork, this, i)));`
support dumping params/grads in transpiler mode (#22490) 5 years ago			`}`
			`}`

add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`// call only after all resources are set in current trainer`
			`void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program,`
			`const platform::Place& place) {`
			`for (int i = 0; i < thread_num_; ++i) {`
			`workers_[i]->SetPlace(place);`
Datafeed support reading to cuda place directly. (#19071) * add a place field in DataFeed to denote which place it will feed data to. * abstract the copy process in CopyToFeedTensor function * add UT for float32 type and for CUDAPlace 6 years ago			`workers_[i]->SetReaderPlace(place);`
add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`workers_[i]->SetRootScope(root_scope_);`
			`workers_[i]->CreateDeviceResource(main_program); // Program`
			`workers_[i]->BindingDataFeedMemory();`
add heter ps mode (#25682) * add heter ps mode * code style test=develop * add with_pslib test=develop * unitest test=develop * code style test=develop * code style test=develop * code style test=develop * code style test=develop * code style test=develop * code style test=develop * code style test=develop * code style test=develop * test monitor test=develop * prepare trainer test=develop * code style test=develop 5 years ago			`workers_[i]->CacheProgram(main_program);`
add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`}`
			`}`

support dumping params/grads in transpiler mode (#22490) 5 years ago			`void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) {`
fix dump, fix cvm check (#25400) * fix dump, fix cvm check test=develop * fix test=develop * fix test=develop * fix test=develop 5 years ago			`if (need_dump_field_ || need_dump_param_) {`
support dumping params/grads in transpiler mode (#22490) 5 years ago			`InitDumpEnv();`
			`}`
			`VLOG(3) << "init other env done.";`
			`}`

trainer from dataset fetch targets (#19760) add executor.FetchHandler for train/infer from the dataset 5 years ago			`Scope* MultiTrainer::GetWorkerScope(int thread_id) {`
			`return workers_[thread_id]->GetThreadScope();`
			`}`

add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`void MultiTrainer::Run() {`
fix data reading bugs in api, add VLOG(3) log for setup 6 years ago			`VLOG(3) << "Going to run";`
add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`for (int thidx = 0; thidx < thread_num_; ++thidx) {`
add comment for MPI Symetric role maker test=develop 6 years ago			`if (!debug_) {`
			`threads_.push_back(`
			`std::thread(&DeviceWorker::TrainFiles, workers_[thidx].get()));`
			`} else {`
			`threads_.push_back(std::thread(&DeviceWorker::TrainFilesWithProfiler,`
			`workers_[thidx].get()));`
			`}`
add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`}`
			`for (auto& th : threads_) {`
			`th.join();`
			`}`
			`}`

support dumping params/grads in transpiler mode (#22490) 5 years ago			`void MultiTrainer::Finalize() {`
fix dump, fix cvm check (#25400) * fix dump, fix cvm check test=develop * fix test=develop * fix test=develop * fix test=develop 5 years ago			`if (need_dump_field_ || need_dump_param_) {`
support dumping params/grads in transpiler mode (#22490) 5 years ago			`FinalizeDumpEnv();`
			`}`
			`root_scope_->DropKids();`
			`}`
Refactor fetch handler (#21264) * fix fetch handler problem and refactor when a user define FetchHandler class, he or she should initialize a handler with variable dict. the key of a variable dict is a user defined name, the value of a variable dict is a Varaible generated from python API. For each fetching, a user should implement handler function in which fetched_result_dict will be available and the user can access the fetched value with user defined keys. 5 years ago
add dist_multi_trainer for distributed training, add trainer_factory and device_worker_factory so that we can easily extend new training mode, add pull dense worker which is a singleton for parameter fetching 6 years ago			`} // end namespace framework`
			`} // end namespace paddle`