async_thread_trainer & libmct & pslib.cmake

6 years ago · a77fa67bbd
parent 52a0be7bb4
commit a77fa67bbd
8 changed files with 745 additions and 23 deletions
--- a/cmake/external/libmct.cmake
+++ b/cmake/external/libmct.cmake
@ -40,9 +40,6 @@ SET(LIBMCT_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
 SET(LIBMCT_INSTALL_DIR   ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR})
 SET(LIBMCT_ROOT          ${LIBMCT_INSTALL_DIR})
 SET(LIBMCT_INC_DIR       ${LIBMCT_ROOT}/include)
 SET(LIBMCT_LIB_DIR       ${LIBMCT_ROOT}/lib)
 SET(LIBMCT_LIB           ${LIBMCT_LIB_DIR}/libps.so)
 SET(LIBMCT_IOMP_LIB      ${LIBMCT_LIB_DIR}/libiomp5.so) #todo what is this
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib")
 INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR})
@ -66,11 +63,15 @@ ExternalProject_Add(
    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT}
 )
-ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL)
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
-SET_PROPERTY(TARGET libmct PROPERTY IMPORTED_LOCATION ${LIBMCT_LIB})
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
    add_library(libmct STATIC ${dummyfile})
 else()
    add_library(libmct INTERFACE)
 endif()
 #ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL)
 ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT})
 LIST(APPEND external_project_dependencies libmct)
 IF(WITH_C_API)
  INSTALL(FILES ${LIBMCT_LIB} ${LIBMCT_IOMP_LIB} DESTINATION lib)
 ENDIF()
--- a/cmake/external/pslib.cmake
+++ b/cmake/external/pslib.cmake
@ -66,7 +66,7 @@ ExternalProject_Add(
    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT}
 )
-ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
+ADD_LIBRARY(pslib STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
 ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
 LIST(APPEND external_project_dependencies pslib)
--- a/paddle/fluid/framework/async_executor.cc
+++ b/paddle/fluid/framework/async_executor.cc
@ -48,6 +48,10 @@ void AsyncExecutor::CreateThreads(
  worker->SetDataFeed(reader);
  worker->SetFetchVarNames(fetch_var_names);
  worker->BindingDataFeedMemory();
  worker->SetPSlibPtr(_pslib_ptr);
  worker->SetPullDenseThread(_pull_dense_thread);
  worker->BindingSlotVariableMemory();
  worker->SetParamConfig(&_param_config);
 }
 void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers,  // NOLINT
@ -61,6 +65,77 @@ void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers,  // NOLINT
  readers[0]->SetFileList(filelist);
 }
 void AsyncExecutor::ConfigPslib(const std::string& dist_desc, std::vector<uint64_t>& host_sign_list, int node_num, int index) {
    _pslib_ptr = std::shared_ptr<paddle::distributed::PSlib>(new paddle::distributed::PSlib());
    _pslib_ptr->init_and_config(dist_desc, host_sign_list, node_num, index);//TODO
 }
 void AsyncExecutor::StartServer() {
    _pslib_ptr->run_server();
 }
 void AsyncExecutor::InitModel() {
    //TODO only rank = 0 do this
    std::vector<int> all_dense_table_id; //TODO
    all_dense_table_id.push_back(0);
    for (auto table_id: all_dense_table_id) {
        std::vector<paddle::ps::Region> regions;
        std::vector<std::string> variables;  //TODO
        for (auto& t : variables) {
            Variable* var = root_scope_->FindVar(t);
            CHECK(var != nullptr) << "var[" << t << "] not found";
            LoDTensor* tensor = var->GetMutable<LoDTensor>();
            float* g = tensor->data<float>();
            CHECK(g != nullptr) << "var[" << t << "] value not initialized";
            float init_range = 0.2;
            int rown = tensor->dims()[0];
            init_range /= sqrt(rown);
            std::normal_distribution<float> ndistr(0.0, 1.0);
            for (auto i = 0u; i < tensor->numel(); ++i) {
                g[i] = ndistr(local_random_engine()) * init_range;
            }
            paddle::ps::Region reg(g, tensor->numel());
            regions.emplace_back(std::move(reg));
        }
        auto push_status = _pslib_ptr->_worker_ptr->push_dense_param(regions.data(), regions.size(), table_id);
        push_status.wait();
        auto status = push_status.get();
        if (status != 0) {
            LOG(FATAL) << "push dense param failed, status[" << status << "]";
            exit(-1);
        } 
    }
 }
 void AsyncExecutor::SaveModel(const std::string& path) {
    auto ret = _pslib_ptr->_worker_ptr->flush();
    ret.wait();
    ret = _pslib_ptr->_worker_ptr->save(path, 0);
    ret.wait();
    int32_t feasign_cnt = ret.get();
    if (feasign_cnt == -1) { // TODO should be feasign_cnt < 0, because server bug
        LOG(FATAL) << "save model failed";
        exit(-1);
    }
 }
 void AsyncExecutor::PrepareDenseThread() {
    DensePullThreadParam param;
    param.ps_client = _pslib_ptr->_worker_ptr;;
    param.threshold = 1;//GlobalConfig::instance().pull_dense_per_batch; //TODO
    param.training_thread_num = actual_thread_num;
    param.root_scope = root_scope_;
    //param.dense_params = &GlobalConfig::instance().dense_variable_name; //TODO
    _pull_dense_thread = std::shared_ptr<DensePullThread>(new DensePullThread(param));
 }
 void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
                                const std::string& data_feed_desc_str,
                                const std::vector<std::string>& filelist,
@ -83,7 +158,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
  google::protobuf::TextFormat::ParseFromString(data_feed_desc_str,
                                                &data_feed_desc);
-  int actual_thread_num = thread_num;
+  actual_thread_num = thread_num;
  int file_cnt = filelist.size();
  PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty");
@ -107,11 +182,11 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
  // todo: should be factory method for creating datafeed
  std::vector<std::shared_ptr<DataFeed>> readers;
  PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist);
-
+  PrepareDenseThread();
  std::vector<std::shared_ptr<ExecutorThreadWorker>> workers;
  workers.resize(actual_thread_num);
  for (auto& worker : workers) {
-    worker.reset(new ExecutorThreadWorker);
+    worker.reset(new AsyncExecutorThreadWorker);
  }
  // prepare thread resource here
@ -129,7 +204,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
  for (auto& th : threads) {
    th.join();
  }
-
+  _pull_dense_thread->stop();
  root_scope_->DropKids();
  return;
--- a/paddle/fluid/framework/async_executor.h
+++ b/paddle/fluid/framework/async_executor.h
@ -22,6 +22,8 @@ limitations under the License. */
 #include <thread>  // NOLINT
 #include <typeinfo>
 #include <vector>
 #include <random> //local_random_engine
 #include <time.h> //local_random_engine
 #include "paddle/fluid/framework/data_feed.pb.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/executor_thread_worker.h"
@ -30,6 +32,26 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 // Returns the current CLOCK_REALTIME reading as fractional seconds
 // (whole seconds plus nanoseconds scaled by 1e-9).
 inline double current_realtime() {
    struct timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    const double fraction = now.tv_nsec * 1e-9;
    return now.tv_sec + fraction;
 }
 // Returns a random engine private to the calling thread.
 // The engine is created and seeded on the thread's first call. The seed
 // sequence mixes three successive values of a process-wide atomic counter
 // with the current wall-clock time in milliseconds.
 inline std::default_random_engine& local_random_engine() {
    struct SeededEngine {
        std::default_random_engine engine;
        SeededEngine() {
            static std::atomic<unsigned long> counter(0);
            // Draw the counter values in the same left-to-right order a
            // braced initializer list would evaluate them.
            const unsigned long first = counter++;
            const unsigned long second = counter++;
            const unsigned long third = counter++;
            const unsigned long now_ms = (unsigned long)(current_realtime() * 1000);
            std::seed_seq seeds = {first, second, third, now_ms};
            engine.seed(seeds);
        }
    };
    thread_local SeededEngine per_thread;
    return per_thread.engine;
 }
 class AsyncExecutor {
 public:
  AsyncExecutor(Scope* scope, const platform::Place& place);
@ -40,9 +62,12 @@ class AsyncExecutor {
                   const int thread_num,
                   const std::vector<std::string>& fetch_names,
                   const bool debug = false);
-  void ConfigServer() {}
+  //void ConfigPslib(const char* dist_desc, uint64_t* host_sign_list, int node_num, int index);
-  void ConfigWorker() {}
+  void ConfigPslib(const std::string& dist_desc, std::vector<uint64_t>& host_sign_list, int node_num, int index);
-  void StartServer() {}
+  //void ConfigWorker() {}
  void StartServer();
  void InitModel();
  void SaveModel(const std::string& path);
 private:
  void CreateThreads(ExecutorThreadWorker* worker,
@ -51,11 +76,19 @@ class AsyncExecutor {
                     const std::vector<std::string>& fetch_var_names,
                     Scope* root_scope, const int thread_index,
                     const bool debug);
-
+  void PrepareDenseThread();
 public:
  std::shared_ptr<paddle::distributed::PSlib>  _pslib_ptr;
  std::shared_ptr<DensePullThread>  _pull_dense_thread;
  Scope* root_scope_;
  platform::Place place_;
  AsyncWorkerParamConfig _param_config;
 private:
  int actual_thread_num;
 };
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/executor_thread_worker.cc
+++ b/paddle/fluid/framework/executor_thread_worker.cc
--- a/paddle/fluid/framework/executor_thread_worker.h
+++ b/paddle/fluid/framework/executor_thread_worker.h
@ -25,16 +25,107 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "pslib.h"
 namespace paddle {
 namespace framework {
 const static uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100;
 void CreateTensor(Variable* var, proto::VarType::Type var_type);
 // Parameter bundle for async training workers. AsyncExecutor owns one
 // instance (_param_config) and hands its address to every worker through
 // SetParamConfig(). No member has a default value, so the owner must assign
 // every field before workers read it.
 struct AsyncWorkerParamConfig {
    // NOTE(review): presumably the embedding width per slot -- confirm with
    // the worker implementation.
    int slot_dim;
    // NOTE(review): presumably the width of one feature value -- confirm.
    int fea_dim; 
    // NOTE(review): the "tmp_" prefix suggests these two wait counters are
    // provisional; their exact meaning is not visible in this header.
    int32_t tmp_push_dense_wait_times;
    int32_t tmp_push_sparse_wait_times;
    // Slot input names, e.g. "6048slot", "6050slot".
    std::vector<std::string> slot_input_vec; // slot names
    // Gradient variable names, e.g. "6048slot_embed".
    std::vector<std::string> gradient_var;   // gradient variable names
 };
 // Construction parameters for DensePullThread. Every member has a default so
 // that a partially filled instance never carries indeterminate pointers.
 struct DensePullThreadParam {
    // Client used to talk to the parameter server.
    std::shared_ptr<paddle::ps::PSClient> ps_client;
    int threshold = 0;
    // Number of training threads; sizes the per-table version vectors.
    int training_thread_num = 0;
    Scope* root_scope = nullptr;
    // Optional map from table id to dense variable names. May be null, in
    // which case the thread starts with no dense tables registered.
    std::map<uint64_t, std::vector<std::string>>* dense_params = nullptr;
    int sleep_time_ms = 2;
 };

 // Background helper that pulls dense parameters through a PSClient.
 // Only the constructor and stop() are defined in this header; the remaining
 // members are implemented elsewhere.
 class DensePullThread {
 public:
    // Copies the settings out of `param`. The contents of
    // `param.dense_params` are copied during construction, so the map does
    // not need to outlive this call. A null `dense_params` is treated as an
    // empty map.
    DensePullThread(DensePullThreadParam& param) :
        _running(false) {
        _ps_client = param.ps_client;
        _threshold = param.threshold;
        _thread_num = param.training_thread_num;
        _root_scope = param.root_scope;
        _sleep_time_ms = param.sleep_time_ms;

        if (param.dense_params == nullptr) {
            return;  // nothing to register
        }
        for (auto& t : *param.dense_params) {
            _dense_variable_name[t.first].insert(
                    _dense_variable_name[t.first].end(),
                    t.second.begin(), t.second.end());
            // One version counter per training thread, all starting at 0.
            _training_versions[t.first].resize(_thread_num, 0);
            _last_versions[t.first] = 0;
            _current_version[t.first] = 0;
        }
    }

    int start();

    // Clears the running flag and joins the worker thread. Does nothing when
    // the thread is not marked as running.
    // NOTE(review): _running is a plain bool; if run() polls it from the
    // worker thread this is a data race -- consider std::atomic<bool>.
    void stop() {
        if (_running) {
            _running = false;
            _t.join();
        }
    }

    void increase_thread_version(int thread_id, uint64_t table_id);
    void reset_thread_version(uint64_t table_id);
    std::future<int32_t> pull_dense(uint64_t table_id);
    void pull_dense2(uint64_t table_id);
    void wait_all();

 private:
    void run();
    bool check_update_param(uint64_t table_id);

 private:
    std::shared_ptr<paddle::ps::PSClient> _ps_client;
    int _thread_num;
    int _threshold;
    int _sleep_time_ms;
    Scope* _root_scope;
    bool _running;

    // Per-table version bookkeeping, keyed by table id.
    std::map<uint64_t, uint64_t> _last_versions;
    std::map<uint64_t, uint64_t> _current_version;
    std::mutex  _mutex_for_version;
    std::map<uint64_t, std::vector<uint64_t>> _training_versions;
    std::map<uint64_t, std::vector<std::string>> _dense_variable_name;

    std::thread _t;

    std::vector<::std::future<int32_t>> _pull_dense_status;

    std::map<uint64_t, std::vector<paddle::ps::Region>> _regions;
    uint32_t    _pull_dense_fail_times = 0;

    std::vector<float>  _base_norm_param;
    std::vector<float>  _mean;
    std::vector<float>  _scale;
    float _squared_sum_epsilon = 1e-4;
    std::mutex _mutex_for_mean_scale;

    float _total_batch_num = 0;
 };
 class ExecutorThreadWorker {
 public:
  ExecutorThreadWorker()
      : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
-  ~ExecutorThreadWorker() {}
+  virtual ~ExecutorThreadWorker() {}
  void CreateThreadResource(const framework::ProgramDesc& program,
                            const paddle::platform::Place& place);
@ -51,10 +142,13 @@ class ExecutorThreadWorker {
  // set data feed declared in executor
  void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
  // A multi-thread training function
-  void TrainFiles();
+  virtual void TrainFiles();
  // set fetch variable names from python interface assigned by users
  void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
-
+  virtual void SetPSlibPtr(std::shared_ptr<paddle::distributed::PSlib> pslib_ptr);
  virtual void SetPullDenseThread(std::shared_ptr<DensePullThread>  dpt) {};
  virtual void BindingSlotVariableMemory() {};
  virtual void SetParamConfig(AsyncWorkerParamConfig* pc) {};
 private:
  void CreateThreadScope(const framework::ProgramDesc& program);
  void CreateThreadOperators(const framework::ProgramDesc& program);
@ -77,12 +171,58 @@ class ExecutorThreadWorker {
  Scope* root_scope_;
  // a thread scope, father scope is global score which is shared
  Scope* thread_scope_;
-
+  //private:
 private:
  std::vector<std::string> fetch_var_names_;
  std::vector<std::vector<float>> fetch_values_;
  bool debug_;
 };
 class AsyncExecutorThreadWorker: public ExecutorThreadWorker {
 public:
    AsyncExecutorThreadWorker(){};
    virtual ~AsyncExecutorThreadWorker() {}
    void SetPSlibPtr(std::shared_ptr<paddle::distributed::PSlib> pslib_ptr);
    void SetPullDenseThread(std::shared_ptr<DensePullThread> dpt);
    void BindingSlotVariableMemory();
    void SetParamConfig(AsyncWorkerParamConfig* pc);
    void TrainFiles();  
    void TrainOneNetwork();
    void PrepareParams();
    void UpdateParams(); 
    void PullSparse(int table_id);
    void FillSparse(int table_id);
    void PushSparse(int table_id);
    void PushDense(int table_id);
    void check_pull_push_memory(std::vector<uint64_t>& features, std::vector<float*>& push_g, int dim);
    void check_pull_push_memory(std::vector<uint64_t>& features, std::vector<std::vector<float>>& push_g, int dim);
    void collect_feasign_info(int table_id);
 private:
    struct FeasignInfo {
        uint32_t slot;
        uint32_t ins;
        int64_t label;
    };
    std::map<uint64_t, std::vector<uint64_t>>       _features;
    std::map<uint64_t, std::vector<FeasignInfo>>    _fea_info;
    std::map<uint64_t, std::vector<std::vector<float>>> _feature_value;
    std::map<uint64_t, std::vector<std::vector<float>>> _feature_push_value;
    std::unordered_map<std::string, uint64_t>       _slot_alias_to_table; //TODO
    std::shared_ptr<paddle::distributed::PSlib>     _pslib_ptr;
    std::shared_ptr<DensePullThread>                _pull_dense_thread;
    std::vector<::std::future<int32_t>>             _pull_sparse_status;
    std::vector<::std::future<int32_t>>             _pull_dense_status;
    std::vector<::std::future<int32_t>>             _push_sparse_status;
    std::vector<::std::future<int32_t>>             _push_dense_status;
    AsyncWorkerParamConfig*                         _param_config;
 };
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/pybind/async_executor_py.cc
+++ b/paddle/fluid/pybind/async_executor_py.cc
@ -47,7 +47,11 @@ void BindAsyncExecutor(py::module* m) {
        return std::unique_ptr<framework::AsyncExecutor>(
            new framework::AsyncExecutor(scope, place));
      }))
-      .def("run_from_files", &framework::AsyncExecutor::RunFromFile);
+      .def("run_from_files", &framework::AsyncExecutor::RunFromFile)
      .def("config_pslib",   &framework::AsyncExecutor::ConfigPslib)
      .def("start_server",  &framework::AsyncExecutor::StartServer)
      .def("init_model",  &framework::AsyncExecutor::InitModel)
      .def("save_model",  &framework::AsyncExecutor::SaveModel);
 }  // end BindAsyncExecutor
 }  // end namespace pybind
 }  // end namespace paddle
--- a/python/paddle/fluid/async_executor.py
+++ b/python/paddle/fluid/async_executor.py
@ -149,3 +149,16 @@ class AsyncExecutor(object):
        self.executor.run_from_files(program_desc,
                                     data_feed.desc(), filelist, thread_num,
                                     fetch_var_names, debug)
    def config_ps(self, dist_desc, host_sign_list, node_num, index):
        """Configure PSlib on the underlying C++ executor.

        Forwards all arguments unchanged to ``self.executor.config_pslib``.

        Args:
            dist_desc: description string passed to PSlib.
            host_sign_list: list of unsigned 64-bit integers passed to PSlib.
            node_num: integer passed to PSlib (presumably the number of
                nodes -- confirm against PSlib).
            index: integer passed to PSlib (presumably this node's
                index -- confirm against PSlib).
        """
        self.executor.config_pslib(dist_desc, host_sign_list, node_num, index)
    def start_server(self):
        """Start the PSlib server by calling ``self.executor.start_server``."""
        self.executor.start_server()
    def init_model(self):
        """Initialize the model by calling ``self.executor.init_model``."""
        self.executor.init_model()
    def save_model(self, save_path):
        """Save the model by calling ``self.executor.save_model``.

        Args:
            save_path: destination path, passed through unchanged.
        """
        self.executor.save_model(save_path)