commit
af5ac2c474
@ -0,0 +1,110 @@
|
|||||||
|
# Design Doc: Save Model
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The model is the output of the training process. There are two
|
||||||
|
ways in which a user can obtain a model:
|
||||||
|
|
||||||
|
- Save model triggered by user code: user code asks PaddlePaddle to
|
||||||
|
save a model.
|
||||||
|
- Convert model from the checkpoint: model being converted from
|
||||||
|
pservers' periodic checkpoint. In this way, the user can cancel a
|
||||||
|
job at any time, and still have a relatively fresh model (we
|
||||||
|
checkpoint around every 5 minutes).
|
||||||
|
|
||||||
|
### Trainer Saving Model vs. Pservers Saving Model
|
||||||
|
|
||||||
|
Both trainers and pservers have access to the model. So the model can
|
||||||
|
be saved from a trainer or pservers. We need to decide where the model
|
||||||
|
is saved from.
|
||||||
|
|
||||||
|
#### Dense Update vs. Sparse Update
|
||||||
|
|
||||||
|
There are two types of model update methods: dense update and sparse
|
||||||
|
update (when the model parameter is configured to be sparse).
|
||||||
|
|
||||||
|
- Dense update
|
||||||
|
|
||||||
|
Every trainer has its own full copy of the model. Every model
|
||||||
|
update will update the entire model.
|
||||||
|
|
||||||
|
- Sparse update
|
||||||
|
|
||||||
|
The training input is sparse, and the trainer does not have the
|
||||||
|
entire model. It will only download the sub-model that is related
|
||||||
|
to the input. When updating the model, only the sub-model related to
|
||||||
|
the training input is updated.
|
||||||
|
|
||||||
|
|
||||||
|
#### Pservers Saving Model
|
||||||
|
|
||||||
|
The benefit of letting pservers save model is they have the entire
|
||||||
|
model all the time. However, since pservers are on different nodes, it
|
||||||
|
requires a merging process to merge model shards into the same
|
||||||
|
model. This requires the pservers to write models to a distributed
|
||||||
|
filesystem, making the checkpoint shards visible to the merge program.
|
||||||
|
|
||||||
|
#### Trainer Saving Model
|
||||||
|
|
||||||
|
The benefit of letting one trainer save the model is that it does not
|
||||||
|
require a distributed filesystem. And it's reusing the same save model
|
||||||
|
logic when training locally - except when doing sparse update, the
|
||||||
|
trainer needs to download the entire model during the saving process.
|
||||||
|
|
||||||
|
#### Conclusion
|
||||||
|
|
||||||
|
Given trainer saving model does not require a distributed filesystem,
|
||||||
|
and is an intuitive extension to trainer saving model when training
|
||||||
|
locally, we decide to let the trainer save the model when doing
|
||||||
|
distributed training.
|
||||||
|
|
||||||
|
|
||||||
|
### Convert Model from Checkpoint
|
||||||
|
|
||||||
|
TODO
|
||||||
|
|
||||||
|
|
||||||
|
## Timeline
|
||||||
|
|
||||||
|
We will first implement the trainer saving the model. Converting the latest
|
||||||
|
snapshot to a model will be a TODO for the future.
|
||||||
|
|
||||||
|
|
||||||
|
## Trainer Save Model
|
||||||
|
|
||||||
|
### Trainer Election
|
||||||
|
|
||||||
|
One trainer will be elected as the one to save the model. When using
|
||||||
|
etcd, the trainer ID is a randomly generated UUID, and we will utilize etcd to
|
||||||
|
elect one trainer. When not using etcd, unique trainer IDs will be
|
||||||
|
given by the administrator, and the trainer whose ID is "0" is elected to
|
||||||
|
save the model.
|
||||||
|
|
||||||
|
### Model Save Path
|
||||||
|
|
||||||
|
Each trainer will be given the directory to save the model. The
|
||||||
|
elected trainer will save the model to
|
||||||
|
`given-directory/trainerID`. Since the trainer ID is unique, this
|
||||||
|
would prevent concurrent save to the same file when multiple trainers
|
||||||
|
are elected to save the model when a split-brain problem happens.
|
||||||
|
|
||||||
|
### What Happens When Model Is Saving
|
||||||
|
|
||||||
|
Saving the model takes some time, so we need to define what will happen
|
||||||
|
while the model is being saved.
|
||||||
|
|
||||||
|
When doing dense update, the trainer uses the local model. Pservers
|
||||||
|
do not need to pause model update.
|
||||||
|
|
||||||
|
When doing sparse update, the trainer needs to download the entire
|
||||||
|
model while saving. To get the most accurate model, the model update
|
||||||
|
needs to be paused before the download starts and resumed after the
|
||||||
|
download finishes. Otherwise, the trainer gets a model that is
|
||||||
|
"polluted": some part of the model is old, some part of the model is
|
||||||
|
new.
|
||||||
|
|
||||||
|
It's unclear whether the "polluted" model will be inferior due to the
|
||||||
|
stochastic nature of deep learning, and pausing the model update will
|
||||||
|
add more complexity to the system. Since supporting sparse update is a
|
||||||
|
TODO item, we defer the evaluation of whether to pause the model update
|
||||||
|
during saving model to the future.
|
@ -1,21 +1,3 @@
|
|||||||
cmake_minimum_required(VERSION 3.0)
|
cmake_minimum_required(VERSION 3.0)
|
||||||
|
|
||||||
get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
|
go_library(paddle_master SHARED)
|
||||||
get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
|
|
||||||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
|
|
||||||
|
|
||||||
project(cxx_go C Go)
|
|
||||||
|
|
||||||
#include(golang)
|
|
||||||
include(flags)
|
|
||||||
|
|
||||||
set(MASTER_LIB_NAME "paddle_master")
|
|
||||||
go_library(${MASTER_LIB_NAME} SHARED)
|
|
||||||
|
|
||||||
if(PROJ_ROOT)
|
|
||||||
add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/master/lib${MASTER_LIB_NAME}.so
|
|
||||||
COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/lib${MASTER_LIB_NAME}.h
|
|
||||||
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/lib${MASTER_LIB_NAME}.so ${PROJ_ROOT}/python/paddle/v2/master/
|
|
||||||
DEPENDS ${MASTER_LIB_NAME})
|
|
||||||
add_custom_target(paddle_master_shared ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/master/lib${MASTER_LIB_NAME}.so)
|
|
||||||
endif(PROJ_ROOT)
|
|
||||||
|
@ -1,3 +1,3 @@
|
|||||||
|
|
||||||
cc_library(main SRCS main.c DEPS paddle_pserver_cclient)
|
cc_binary(main SRCS main.c DEPS paddle_pserver_cclient)
|
||||||
cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient)
|
cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient)
|
||||||
|
@ -0,0 +1,181 @@
|
|||||||
|
package pserver
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
|
||||||
|
"github.com/coreos/etcd/clientv3"
|
||||||
|
"github.com/coreos/etcd/clientv3/concurrency"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
|
// EtcdClient is the etcd client that the pserver uses for fault
|
||||||
|
// tolerance, service registry and coordination.
|
||||||
|
type EtcdClient struct {
|
||||||
|
numPservers int
|
||||||
|
etcdEndpoints string
|
||||||
|
etcdClient *clientv3.Client
|
||||||
|
// etcdTimeout is also used as retry intervals.
|
||||||
|
etcdTimeout time.Duration
|
||||||
|
// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
|
||||||
|
externalIP string
|
||||||
|
// desired number of pservers in the job.
|
||||||
|
// assume desired will not change during one training job.
|
||||||
|
desired int
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewEtcdClient creates an EtcdClient
|
||||||
|
func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *EtcdClient {
|
||||||
|
return &EtcdClient{
|
||||||
|
etcdTimeout: timeout,
|
||||||
|
numPservers: numPservers,
|
||||||
|
etcdEndpoints: endpoints,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Register registers the pserver on etcd
|
||||||
|
//
|
||||||
|
// Register returns the index of the current pserver.
|
||||||
|
func (e *EtcdClient) Register() (int, error) {
|
||||||
|
|
||||||
|
var err error
|
||||||
|
e.externalIP, err = networkhelper.GetExternalIP()
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize connection to etcd.
|
||||||
|
ep := strings.Split(e.etcdEndpoints, ",")
|
||||||
|
for {
|
||||||
|
cli, err := clientv3.New(clientv3.Config{
|
||||||
|
Endpoints: ep,
|
||||||
|
DialTimeout: e.etcdTimeout,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("connect to etcd error: %v", err)
|
||||||
|
time.Sleep(e.etcdTimeout)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
e.etcdClient = cli
|
||||||
|
log.Debugf("inited client to %s", e.etcdEndpoints)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// init /ps_desired using transaction, for multiple pservers may want to write
|
||||||
|
// it at the same time.
|
||||||
|
for {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
||||||
|
_, err := e.initDesiredPsercers(ctx, e.numPservers)
|
||||||
|
cancel()
|
||||||
|
if err != nil {
|
||||||
|
log.Warn(err)
|
||||||
|
time.Sleep(e.etcdTimeout)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// TODO: when implementing extending or reducing pservers, /ps_desired is
|
||||||
|
// changed, then we need to watch /ps_desired node for events. For now, just
|
||||||
|
// write once when init and read from it.
|
||||||
|
// wait and set s.desired init value
|
||||||
|
for {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
||||||
|
resp, err := e.etcdClient.Get(ctx, PsDesired)
|
||||||
|
cancel()
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("getting %s error: %v", PsDesired, err)
|
||||||
|
time.Sleep(e.etcdTimeout)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if len(resp.Kvs) != 0 {
|
||||||
|
e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value))
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("value of %s invalid %v\n", PsDesired, err)
|
||||||
|
time.Sleep(e.etcdTimeout)
|
||||||
|
// NOTE: wait util ps_desired value change
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var pserverIdx int
|
||||||
|
// try register pserver node on etcd
|
||||||
|
for {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
||||||
|
var err error
|
||||||
|
pserverIdx, err = e.registerPserverEtcd(ctx)
|
||||||
|
cancel()
|
||||||
|
if err != nil {
|
||||||
|
log.Warn(err)
|
||||||
|
time.Sleep(e.etcdTimeout)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
return pserverIdx, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *EtcdClient) initDesiredPsercers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
|
||||||
|
return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
|
||||||
|
dsStr := c.Get(PsDesired)
|
||||||
|
if dsStr == "" {
|
||||||
|
c.Put(PsDesired, strconv.Itoa(numPservers))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
|
||||||
|
}
|
||||||
|
|
||||||
|
// registerPserverEtcd registers pserver node on etcd using transaction.
|
||||||
|
func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
|
||||||
|
var idx int
|
||||||
|
_, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
|
||||||
|
registered := false
|
||||||
|
for i := 0; i < e.desired; i++ {
|
||||||
|
psKey := "/ps/" + strconv.Itoa(i)
|
||||||
|
log.Debugf("checking %s", psKey)
|
||||||
|
ps := c.Get(psKey)
|
||||||
|
log.Debugf("got value (%s) for key: %s", ps, psKey)
|
||||||
|
|
||||||
|
if ps == "" {
|
||||||
|
resp, err := e.etcdClient.Grant(context.TODO(), 5)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
// find the first id and write info
|
||||||
|
c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID))
|
||||||
|
log.Debugf("set pserver node %s with value %s", psKey, e.externalIP)
|
||||||
|
ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID)
|
||||||
|
if kaerr != nil {
|
||||||
|
log.Errorf("keepalive etcd node error: %v", kaerr)
|
||||||
|
return kaerr
|
||||||
|
}
|
||||||
|
|
||||||
|
// Eat the keep alive message so etcd
|
||||||
|
// will not expire the lease.
|
||||||
|
go func(ch <-chan *clientv3.LeaseKeepAliveResponse) {
|
||||||
|
ka := <-ch
|
||||||
|
log.Debugf("keepalive: %d\n", ka.TTL)
|
||||||
|
}(ch)
|
||||||
|
log.Debug("register finished")
|
||||||
|
idx = i
|
||||||
|
registered = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if registered == true {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return errors.New("not registerd, may due to already have enough pservers")
|
||||||
|
}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return idx, nil
|
||||||
|
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue