|
|
|
@ -2,7 +2,7 @@ package master
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"context"
|
|
|
|
|
"sync"
|
|
|
|
|
"time"
|
|
|
|
|
|
|
|
|
|
"github.com/coreos/etcd/clientv3"
|
|
|
|
|
"github.com/coreos/etcd/clientv3/concurrency"
|
|
|
|
@ -14,22 +14,22 @@ const (
|
|
|
|
|
DefaultLockPath = "/master/lock"
|
|
|
|
|
// DefaultStatePath is the default etcd key for master state.
|
|
|
|
|
DefaultStatePath = "/master/state"
|
|
|
|
|
// DefaultAddrPath is the default etcd key for master address.
|
|
|
|
|
DefaultAddrPath = "/master/addr"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// Etcd is the etcd abstraction that master uses for fault tolerance
|
|
|
|
|
// EtcdClient is the etcd client that master uses for fault tolerance
|
|
|
|
|
// and service registry.
|
|
|
|
|
type Etcd struct {
|
|
|
|
|
type EtcdClient struct {
|
|
|
|
|
lockPath string
|
|
|
|
|
statePath string
|
|
|
|
|
ttlSec int
|
|
|
|
|
client *clientv3.Client
|
|
|
|
|
|
|
|
|
|
mu sync.Mutex
|
|
|
|
|
lock *concurrency.Mutex
|
|
|
|
|
lock *concurrency.Mutex
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// NewEtcd creates a new Etcd.
|
|
|
|
|
func NewEtcd(endpoints []string, lockPath, statePath string, ttlSec int) (*Etcd, error) {
|
|
|
|
|
// NewEtcdClient creates a new EtcdClient.
|
|
|
|
|
func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
|
|
|
|
|
log.Debugf("Connecting to etcd at %v", endpoints)
|
|
|
|
|
// TODO(helin): gracefully shutdown etcd store. Because etcd
|
|
|
|
|
// store holds an etcd lock, even though the lock will expire
|
|
|
|
|
// when the lease times out, we need to implement graceful
|
|
|
|
@ -53,27 +53,35 @@ func NewEtcd(endpoints []string, lockPath, statePath string, ttlSec int) (*Etcd,
|
|
|
|
|
// one master running, but split-brain problem may cause
|
|
|
|
|
// multiple master servers running), and the cluster management
|
|
|
|
|
// software will kill one of them.
|
|
|
|
|
log.Infof("Trying to acquire lock at %s.", lockPath)
|
|
|
|
|
log.Debugf("Trying to acquire lock at %s.", lockPath)
|
|
|
|
|
err = lock.Lock(context.TODO())
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
log.Infof("Successfully acquired lock at %s.", lockPath)
|
|
|
|
|
|
|
|
|
|
e := &Etcd{}
|
|
|
|
|
e.client = cli
|
|
|
|
|
e.lock = lock
|
|
|
|
|
e.lockPath = lockPath
|
|
|
|
|
e.statePath = statePath
|
|
|
|
|
e.ttlSec = ttlSec
|
|
|
|
|
log.Debugf("Successfully acquired lock at %s.", lockPath)
|
|
|
|
|
|
|
|
|
|
put := clientv3.OpPut(addrPath, string(addr))
|
|
|
|
|
resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if !resp.Succeeded {
|
|
|
|
|
log.Fatal("No longer owns the master lock. Exiting.")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
e := &EtcdClient{
|
|
|
|
|
lockPath: lockPath,
|
|
|
|
|
statePath: statePath,
|
|
|
|
|
client: cli,
|
|
|
|
|
lock: lock,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return e, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Save saves the state into the etcd.
|
|
|
|
|
func (e *Etcd) Save(state []byte) error {
|
|
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
func (e *EtcdClient) Save(state []byte) error {
|
|
|
|
|
ctx := context.TODO()
|
|
|
|
|
put := clientv3.OpPut(e.statePath, string(state))
|
|
|
|
|
resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
|
|
|
|
@ -82,17 +90,21 @@ func (e *Etcd) Save(state []byte) error {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if !resp.Succeeded {
|
|
|
|
|
log.Errorln("No longer owns the lock, trying to lock and save again.")
|
|
|
|
|
sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(e.ttlSec))
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
e.lock = concurrency.NewMutex(sess, e.lockPath)
|
|
|
|
|
log.Infof("Try to acquire lock at %s.", e.lockPath)
|
|
|
|
|
err = e.lock.Lock(context.TODO())
|
|
|
|
|
log.Errorln("No longer owns the lock, trying to lock again")
|
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
|
|
|
err := e.lock.Lock(ctx)
|
|
|
|
|
cancel()
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
// We lost the master lock and can not acquire
|
|
|
|
|
// it back, it means some other master is
|
|
|
|
|
// already started. We don't want cluster
|
|
|
|
|
// management system to kill the master server
|
|
|
|
|
// who is holding the lock and running
|
|
|
|
|
// correctly. So the most feasible solution is
|
|
|
|
|
// to kill current master server. The current
|
|
|
|
|
// state is not saved, but the trainer's RPC
|
|
|
|
|
// call will fail, so the trainer will retry.
|
|
|
|
|
log.Fatalf("Could not acquire the lock at %s: %v. Exiting.", e.lockPath, err)
|
|
|
|
|
}
|
|
|
|
|
log.Infof("Successfully acquired lock at %s.", e.lockPath)
|
|
|
|
|
return e.Save(state)
|
|
|
|
@ -102,8 +114,7 @@ func (e *Etcd) Save(state []byte) error {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Load loads the state from etcd.
|
|
|
|
|
func (e *Etcd) Load() ([]byte, error) {
|
|
|
|
|
e.mu.Lock()
|
|
|
|
|
func (e *EtcdClient) Load() ([]byte, error) {
|
|
|
|
|
ctx := context.TODO()
|
|
|
|
|
get := clientv3.OpGet(e.statePath)
|
|
|
|
|
|
|
|
|
@ -114,14 +125,7 @@ func (e *Etcd) Load() ([]byte, error) {
|
|
|
|
|
|
|
|
|
|
if !resp.Succeeded {
|
|
|
|
|
log.Errorln("No longer owns the lock, trying to lock and load again.")
|
|
|
|
|
sess, err := concurrency.NewSession(e.client)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
e.lock = concurrency.NewMutex(sess, e.lockPath)
|
|
|
|
|
err = e.lock.Lock(context.TODO())
|
|
|
|
|
e.mu.Unlock()
|
|
|
|
|
err = e.lock.Lock(context.Background())
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
@ -132,11 +136,9 @@ func (e *Etcd) Load() ([]byte, error) {
|
|
|
|
|
kvs := resp.Responses[0].GetResponseRange().Kvs
|
|
|
|
|
if len(kvs) == 0 {
|
|
|
|
|
// No state exists
|
|
|
|
|
e.mu.Unlock()
|
|
|
|
|
return nil, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
state := kvs[0].Value
|
|
|
|
|
e.mu.Unlock()
|
|
|
|
|
return state, nil
|
|
|
|
|
}
|