|
|
|
@ -34,16 +34,19 @@ const (
|
|
|
|
|
PsPath = "/ps/"
|
|
|
|
|
// PsCheckpoint is the etcd path for store checkpoints information
|
|
|
|
|
PsCheckpoint = "/checkpoints/"
|
|
|
|
|
|
|
|
|
|
retryTimeout = 5 * time.Second
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// EtcdClient is the etcd client that the pserver uses for fault
|
|
|
|
|
// tolerance, service registry and coordination.
|
|
|
|
|
type EtcdClient struct {
|
|
|
|
|
numPservers int
|
|
|
|
|
etcdEndpoints string
|
|
|
|
|
etcdClient *clientv3.Client
|
|
|
|
|
// etcdTimeout is also used as retry intervals.
|
|
|
|
|
etcdTimeout time.Duration
|
|
|
|
|
numPservers int
|
|
|
|
|
endpoints string
|
|
|
|
|
client *clientv3.Client
|
|
|
|
|
sess *concurrency.Session
|
|
|
|
|
dialTimeout time.Duration
|
|
|
|
|
ttlSec int
|
|
|
|
|
// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
|
|
|
|
|
externalIP string
|
|
|
|
|
// desired number of pservers in the job.
|
|
|
|
@ -52,11 +55,12 @@ type EtcdClient struct {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// NewEtcdClient creates an EtcdClient
|
|
|
|
|
func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *EtcdClient {
|
|
|
|
|
func NewEtcdClient(endpoints string, numPservers int, dialtimeout time.Duration, ttlSec int) *EtcdClient {
|
|
|
|
|
return &EtcdClient{
|
|
|
|
|
etcdTimeout: timeout,
|
|
|
|
|
numPservers: numPservers,
|
|
|
|
|
etcdEndpoints: endpoints,
|
|
|
|
|
dialTimeout: dialtimeout,
|
|
|
|
|
ttlSec: ttlSec,
|
|
|
|
|
numPservers: numPservers,
|
|
|
|
|
endpoints: endpoints,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -64,7 +68,6 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et
|
|
|
|
|
//
|
|
|
|
|
// Register returns the index of the current pserver.
|
|
|
|
|
func (e *EtcdClient) Register(port int) (int, error) {
|
|
|
|
|
|
|
|
|
|
var err error
|
|
|
|
|
e.externalIP, err = networkhelper.GetExternalIP()
|
|
|
|
|
if err != nil {
|
|
|
|
@ -72,19 +75,26 @@ func (e *EtcdClient) Register(port int) (int, error) {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// initialize connection to etcd.
|
|
|
|
|
ep := strings.Split(e.etcdEndpoints, ",")
|
|
|
|
|
ep := strings.Split(e.endpoints, ",")
|
|
|
|
|
for {
|
|
|
|
|
cli, err := clientv3.New(clientv3.Config{
|
|
|
|
|
Endpoints: ep,
|
|
|
|
|
DialTimeout: e.etcdTimeout,
|
|
|
|
|
DialTimeout: e.dialTimeout,
|
|
|
|
|
})
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Errorf("connect to etcd error: %v", err)
|
|
|
|
|
time.Sleep(e.etcdTimeout)
|
|
|
|
|
time.Sleep(retryTimeout)
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
e.client = cli
|
|
|
|
|
sess, err := concurrency.NewSession(cli, concurrency.WithTTL(e.ttlSec))
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Errorf("create etcd session error: %v", err)
|
|
|
|
|
time.Sleep(retryTimeout)
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
e.etcdClient = cli
|
|
|
|
|
log.Debugf("inited client to %s", e.etcdEndpoints)
|
|
|
|
|
e.sess = sess
|
|
|
|
|
log.Debugf("inited client to %s", e.endpoints)
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
// init /ps_desired using transaction, for multiple pservers may want to write
|
|
|
|
@ -95,7 +105,7 @@ func (e *EtcdClient) Register(port int) (int, error) {
|
|
|
|
|
cancel()
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Warn(err)
|
|
|
|
|
time.Sleep(e.etcdTimeout)
|
|
|
|
|
time.Sleep(retryTimeout)
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
break
|
|
|
|
@ -106,18 +116,18 @@ func (e *EtcdClient) Register(port int) (int, error) {
|
|
|
|
|
// wait and set e.desired init value
|
|
|
|
|
for {
|
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
|
|
|
|
resp, err := e.etcdClient.Get(ctx, PsDesired)
|
|
|
|
|
resp, err := e.client.Get(ctx, PsDesired)
|
|
|
|
|
cancel()
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Errorf("getting %s error: %v", PsDesired, err)
|
|
|
|
|
time.Sleep(e.etcdTimeout)
|
|
|
|
|
time.Sleep(retryTimeout)
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if len(resp.Kvs) != 0 {
|
|
|
|
|
e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value))
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Errorf("value of %s invalid %v\n", PsDesired, err)
|
|
|
|
|
time.Sleep(e.etcdTimeout)
|
|
|
|
|
time.Sleep(retryTimeout)
|
|
|
|
|
// NOTE: wait until the ps_desired value changes
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
@ -134,7 +144,7 @@ func (e *EtcdClient) Register(port int) (int, error) {
|
|
|
|
|
cancel()
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Warn(err)
|
|
|
|
|
time.Sleep(e.etcdTimeout)
|
|
|
|
|
time.Sleep(retryTimeout)
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
break
|
|
|
|
@ -144,10 +154,10 @@ func (e *EtcdClient) Register(port int) (int, error) {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
|
|
|
|
|
return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
|
|
|
|
|
return concurrency.NewSTM(e.client, func(c concurrency.STM) error {
|
|
|
|
|
dsStr := c.Get(PsDesired)
|
|
|
|
|
if dsStr == "" {
|
|
|
|
|
c.Put(PsDesired, strconv.Itoa(numPservers))
|
|
|
|
|
c.Put(PsDesired, strconv.Itoa(numPservers), clientv3.WithLease(e.sess.Lease()))
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
|
|
|
|
@ -156,7 +166,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (
|
|
|
|
|
// registerPserverEtcd registers pserver node on etcd using transaction.
|
|
|
|
|
func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) {
|
|
|
|
|
var idx int
|
|
|
|
|
_, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
|
|
|
|
|
_, err := concurrency.NewSTM(e.client, func(c concurrency.STM) error {
|
|
|
|
|
registered := false
|
|
|
|
|
for i := 0; i < e.desired; i++ {
|
|
|
|
|
psKey := PsPath + strconv.Itoa(i)
|
|
|
|
@ -165,26 +175,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er
|
|
|
|
|
log.Debugf("got value (%s) for key: %s", ps, psKey)
|
|
|
|
|
|
|
|
|
|
if ps == "" {
|
|
|
|
|
resp, err := e.etcdClient.Grant(context.TODO(), 5)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
// find the first id and write info
|
|
|
|
|
pserverAddr := e.externalIP + ":" + strconv.Itoa(port)
|
|
|
|
|
c.Put(psKey, pserverAddr, clientv3.WithLease(resp.ID))
|
|
|
|
|
c.Put(psKey, pserverAddr, clientv3.WithLease(e.sess.Lease()))
|
|
|
|
|
log.Debugf("set pserver node %s with value %s", psKey, pserverAddr)
|
|
|
|
|
ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID)
|
|
|
|
|
if kaerr != nil {
|
|
|
|
|
log.Errorf("keepalive etcd node error: %v", kaerr)
|
|
|
|
|
return kaerr
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Eat the keep alive message so etcd
|
|
|
|
|
// will not expire the lease.
|
|
|
|
|
go func(ch <-chan *clientv3.LeaseKeepAliveResponse) {
|
|
|
|
|
ka := <-ch
|
|
|
|
|
log.Debugf("keepalive: %d\n", ka.TTL)
|
|
|
|
|
}(ch)
|
|
|
|
|
log.Debug("register finished")
|
|
|
|
|
idx = i
|
|
|
|
|
registered = true
|
|
|
|
@ -207,7 +201,7 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er
|
|
|
|
|
// GetKey gets the value by the specified key
|
|
|
|
|
func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
|
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
|
|
|
|
resp, err := e.etcdClient.Get(ctx, key)
|
|
|
|
|
resp, err := e.client.Get(ctx, key)
|
|
|
|
|
cancel()
|
|
|
|
|
if err != nil {
|
|
|
|
|
return []byte{}, err
|
|
|
|
@ -223,7 +217,27 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
|
|
|
|
|
// PutKey put into etcd with value by key specified
|
|
|
|
|
func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error {
|
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
|
|
|
|
_, err := e.etcdClient.Put(ctx, key, string(value))
|
|
|
|
|
_, err := e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease()))
|
|
|
|
|
cancel()
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Shutdown shuts down the etcd client gracefully.
|
|
|
|
|
func (e *EtcdClient) Shutdown() error {
|
|
|
|
|
var err error
|
|
|
|
|
if e.sess != nil {
|
|
|
|
|
err = e.sess.Close()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if e.client != nil {
|
|
|
|
|
newErr := e.client.Close()
|
|
|
|
|
if newErr != nil {
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Errorln(newErr)
|
|
|
|
|
} else {
|
|
|
|
|
err = newErr
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|