|
|
|
@ -16,53 +16,60 @@ package client
|
|
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"context"
|
|
|
|
|
|
|
|
"errors"
|
|
|
|
|
|
|
|
"fmt"
|
|
|
|
"strconv"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
"strings"
|
|
|
|
"time"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
|
|
|
|
"github.com/PaddlePaddle/Paddle/go/pserver"
|
|
|
|
"github.com/PaddlePaddle/Paddle/go/pserver"
|
|
|
|
"github.com/coreos/etcd/clientv3"
|
|
|
|
"github.com/coreos/etcd/clientv3"
|
|
|
|
|
|
|
|
"github.com/coreos/etcd/clientv3/concurrency"
|
|
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
const (
|
|
|
|
const (
|
|
|
|
defaultEtcdTimeout time.Duration = 5 * time.Second
|
|
|
|
defaultEtcdTimeout time.Duration = 5 * time.Second
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
initLockPath = "/init_ps/lock"
|
|
|
|
|
|
|
|
initDonePath = "/init_ps/done"
|
|
|
|
|
|
|
|
initDoneVal = "1"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
// EtcdClient is used by pserver client that is a part of trainer process.
|
|
|
|
// Etcd is used by pserver client that is a part of trainer process.
|
|
|
|
// TODO:
|
|
|
|
// TODO:
|
|
|
|
// 1. add watcher to watch the change state of pservers)
|
|
|
|
// 1. add watcher to watch the change state of pservers.
|
|
|
|
// 1. add etcd lock)
|
|
|
|
type Etcd struct {
|
|
|
|
type EtcdClient struct {
|
|
|
|
|
|
|
|
client *clientv3.Client
|
|
|
|
client *clientv3.Client
|
|
|
|
timeout time.Duration
|
|
|
|
timeout time.Duration
|
|
|
|
endpoints []string
|
|
|
|
endpoints []string
|
|
|
|
|
|
|
|
lock *concurrency.Mutex
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Desired reads the desired number of pservers from etcd.
|
|
|
|
// Desired reads the desired number of pservers from etcd.
|
|
|
|
func (p *EtcdClient) Desired() int {
|
|
|
|
func (e *Etcd) Desired() int {
|
|
|
|
var psDesired int
|
|
|
|
var psDesired int
|
|
|
|
for {
|
|
|
|
for {
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
|
|
|
|
resp, err := p.client.Get(ctx, pserver.PsDesired)
|
|
|
|
resp, err := e.client.Get(ctx, pserver.PsDesired)
|
|
|
|
cancel()
|
|
|
|
cancel()
|
|
|
|
if err != nil {
|
|
|
|
if err != nil {
|
|
|
|
log.Errorf("Get ps dresire number failed! recnnectiong..., %v", err)
|
|
|
|
log.Errorf("Get ps dresire number failed! recnnectiong..., %v", err)
|
|
|
|
time.Sleep(p.timeout)
|
|
|
|
time.Sleep(e.timeout)
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
kvs := resp.Kvs
|
|
|
|
kvs := resp.Kvs
|
|
|
|
if len(kvs) == 0 {
|
|
|
|
if len(kvs) == 0 {
|
|
|
|
log.Infoln("Waiting for ps desired registered ...")
|
|
|
|
log.Infoln("Waiting for ps desired registered ...")
|
|
|
|
time.Sleep(p.timeout)
|
|
|
|
time.Sleep(e.timeout)
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
|
|
|
|
psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
|
|
|
|
if err != nil {
|
|
|
|
if err != nil {
|
|
|
|
log.Errorf("psDesired %d invalid %v", psDesired, err)
|
|
|
|
log.Errorf("psDesired %d invalid %v", psDesired, err)
|
|
|
|
time.Sleep(p.timeout)
|
|
|
|
time.Sleep(e.timeout)
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@ -73,26 +80,26 @@ func (p *EtcdClient) Desired() int {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// List returns the pserver list read from etcd.
|
|
|
|
// List returns the pserver list read from etcd.
|
|
|
|
func (p *EtcdClient) List() []Server {
|
|
|
|
func (e *Etcd) List() []Server {
|
|
|
|
psDesired := p.Desired()
|
|
|
|
psDesired := e.Desired()
|
|
|
|
|
|
|
|
|
|
|
|
servers := make([]Server, psDesired)
|
|
|
|
servers := make([]Server, psDesired)
|
|
|
|
for {
|
|
|
|
for {
|
|
|
|
for i := 0; i < psDesired; i++ {
|
|
|
|
for i := 0; i < psDesired; i++ {
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
|
|
|
|
psKey := pserver.PsPath + strconv.Itoa(i)
|
|
|
|
psKey := pserver.PsPath + strconv.Itoa(i)
|
|
|
|
log.Debugf("checking %s", psKey)
|
|
|
|
log.Debugf("checking %s", psKey)
|
|
|
|
resp, err := p.client.Get(ctx, psKey)
|
|
|
|
resp, err := e.client.Get(ctx, psKey)
|
|
|
|
cancel()
|
|
|
|
cancel()
|
|
|
|
if err != nil {
|
|
|
|
if err != nil {
|
|
|
|
log.Infof("Get psKey= %s error, %v", psKey, err)
|
|
|
|
log.Infof("Get psKey= %s error, %v", psKey, err)
|
|
|
|
time.Sleep(p.timeout)
|
|
|
|
time.Sleep(e.timeout)
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
kvs := resp.Kvs
|
|
|
|
kvs := resp.Kvs
|
|
|
|
if len(kvs) == 0 {
|
|
|
|
if len(kvs) == 0 {
|
|
|
|
log.Infof("Waiting for ps addr registered ...")
|
|
|
|
log.Infof("Waiting for ps addr registered ...")
|
|
|
|
time.Sleep(p.timeout)
|
|
|
|
time.Sleep(e.timeout)
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@ -100,7 +107,7 @@ func (p *EtcdClient) List() []Server {
|
|
|
|
// TODO(Longfei) check the ps address
|
|
|
|
// TODO(Longfei) check the ps address
|
|
|
|
if psAddr == "" {
|
|
|
|
if psAddr == "" {
|
|
|
|
log.Infof("Get psKey = %s, psAddr is empty", psKey)
|
|
|
|
log.Infof("Get psKey = %s, psAddr is empty", psKey)
|
|
|
|
time.Sleep(p.timeout)
|
|
|
|
time.Sleep(e.timeout)
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
log.Debugf("got value (%s) for key: %s", psAddr, psKey)
|
|
|
|
log.Debugf("got value (%s) for key: %s", psAddr, psKey)
|
|
|
|
@ -113,7 +120,7 @@ func (p *EtcdClient) List() []Server {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// NewEtcd creates an etcd client to return the state of pserver on etcd.
|
|
|
|
// NewEtcd creates an etcd client to return the state of pserver on etcd.
|
|
|
|
func NewEtcd(endpoints string) *EtcdClient {
|
|
|
|
func NewEtcd(endpoints string) *Etcd {
|
|
|
|
ep := strings.Split(endpoints, ",")
|
|
|
|
ep := strings.Split(endpoints, ",")
|
|
|
|
var cli *clientv3.Client
|
|
|
|
var cli *clientv3.Client
|
|
|
|
var err error
|
|
|
|
var err error
|
|
|
|
@ -130,10 +137,118 @@ func NewEtcd(endpoints string) *EtcdClient {
|
|
|
|
break
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
log.Infof("Connected to etcd: %s\n", endpoints)
|
|
|
|
log.Infof("Connected to etcd: %s\n", endpoints)
|
|
|
|
client := &EtcdClient{
|
|
|
|
client := &Etcd{
|
|
|
|
client: cli,
|
|
|
|
client: cli,
|
|
|
|
timeout: defaultEtcdTimeout,
|
|
|
|
timeout: defaultEtcdTimeout,
|
|
|
|
endpoints: ep,
|
|
|
|
endpoints: ep,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return client
|
|
|
|
return client
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Select indicates if the current trainer is selected to initialize
|
|
|
|
|
|
|
|
// the pserver parameters.
|
|
|
|
|
|
|
|
func (e *Etcd) Select() (bool, error) {
|
|
|
|
|
|
|
|
sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(5))
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
|
|
return false, err
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
lock := concurrency.NewMutex(sess, initLockPath)
|
|
|
|
|
|
|
|
log.Infof("Trying to acquire lock at %s.", initLockPath)
|
|
|
|
|
|
|
|
// Do not use timeout context here, since we don't know how
|
|
|
|
|
|
|
|
// long does it take for other trainers to initialize the
|
|
|
|
|
|
|
|
// parameters.
|
|
|
|
|
|
|
|
err = lock.Lock(context.Background())
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
|
|
return false, err
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
log.Infof("Successfully acquired lock at %s.", initLockPath)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
get := clientv3.OpGet(initDonePath)
|
|
|
|
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
|
|
|
|
|
|
|
|
tresp, err := e.client.Txn(ctx).If(lock.IsOwner()).Then(get).Commit()
|
|
|
|
|
|
|
|
cancel()
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
|
|
return false, err
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if !tresp.Succeeded {
|
|
|
|
|
|
|
|
return false, errors.New("no longer the owner of the lock")
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
resp := tresp.Responses[0].GetResponseRange()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if len(resp.Kvs) == 0 {
|
|
|
|
|
|
|
|
// Key value not set, select current trainer.
|
|
|
|
|
|
|
|
e.lock = lock
|
|
|
|
|
|
|
|
log.Infoln("Trainer selected.")
|
|
|
|
|
|
|
|
return true, nil
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if string(resp.Kvs[0].Value) == initDoneVal {
|
|
|
|
|
|
|
|
log.Infoln("Initialization is already done.")
|
|
|
|
|
|
|
|
ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
|
|
|
|
|
|
|
|
err = lock.Unlock(ctx)
|
|
|
|
|
|
|
|
cancel()
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
|
|
log.Errorln(err)
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return false, nil
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return false, fmt.Errorf("key %s have unexpected value: %v", initDonePath, resp.Kvs[0].Value)
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Done indicates the parameter initialization process is done.
|
|
|
|
|
|
|
|
func (e *Etcd) Done() error {
|
|
|
|
|
|
|
|
if e.lock == nil {
|
|
|
|
|
|
|
|
return errors.New("lock is nil, Done called unexpectedly")
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
put := clientv3.OpPut(initDonePath, initDoneVal)
|
|
|
|
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
|
|
|
|
|
|
|
|
tresp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
|
|
|
|
|
|
|
|
cancel()
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
|
|
return err
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if !tresp.Succeeded {
|
|
|
|
|
|
|
|
return errors.New("no longer the owner of the lock")
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
|
|
|
|
|
|
|
|
err = e.lock.Unlock(ctx)
|
|
|
|
|
|
|
|
cancel()
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
|
|
log.Errorln(err)
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
e.lock = nil
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return nil
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Close closes the etcd client.
|
|
|
|
|
|
|
|
func (e *Etcd) Close() error {
|
|
|
|
|
|
|
|
var err error
|
|
|
|
|
|
|
|
if e.lock != nil {
|
|
|
|
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
|
|
|
|
|
|
|
|
err = e.lock.Unlock(ctx)
|
|
|
|
|
|
|
|
cancel()
|
|
|
|
|
|
|
|
if err == nil {
|
|
|
|
|
|
|
|
e.lock = nil
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cErr := e.client.Close()
|
|
|
|
|
|
|
|
if cErr != nil {
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
|
|
log.Errorln(cErr)
|
|
|
|
|
|
|
|
return err
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return cErr
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return err
|
|
|
|
|
|
|
|
}
|
|
|
|
|