You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							267 lines
						
					
					
						
							6.4 KiB
						
					
					
				
			
		
		
	
	
							267 lines
						
					
					
						
							6.4 KiB
						
					
					
				| // Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 | |
| 
 | |
| // Licensed under the Apache License, Version 2.0 (the "License");
 | |
| // you may not use this file except in compliance with the License.
 | |
| // You may obtain a copy of the License at
 | |
| 
 | |
| // http://www.apache.org/licenses/LICENSE-2.0
 | |
| 
 | |
| // Unless required by applicable law or agreed to in writing, software
 | |
| // distributed under the License is distributed on an "AS IS" BASIS,
 | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| // See the License for the specific language governing permissions and
 | |
| // limitations under the License.
 | |
| 
 | |
| package client
 | |
| 
 | |
| import (
 | |
| 	"context"
 | |
| 	"errors"
 | |
| 	"fmt"
 | |
| 	"strconv"
 | |
| 	"strings"
 | |
| 	"time"
 | |
| 
 | |
| 	"github.com/PaddlePaddle/Paddle/go/pserver"
 | |
| 	"github.com/coreos/etcd/clientv3"
 | |
| 	"github.com/coreos/etcd/clientv3/concurrency"
 | |
| 	log "github.com/inconshreveable/log15"
 | |
| )
 | |
| 
 | |
// defaultEtcdTimeout is the dial timeout handed to the etcd client and the
// timeout stored on every Etcd value created by NewEtcd.
const defaultEtcdTimeout = 5 * time.Second

// Keys and values used to coordinate parameter initialization among trainers.
const (
	// initLockPath is the etcd prefix of the mutex taken in Select.
	initLockPath = "/init_ps/lock"
	// initDonePath is the key written by Done once initialization finished.
	initDonePath = "/init_ps/done"
	// initDoneVal is the value stored under initDonePath.
	initDoneVal = "1"
)
 | |
| 
 | |
// Etcd is used by the pserver client, which is a part of the trainer process.
// It wraps an etcd v3 client together with the state needed to elect the
// trainer that initializes the pserver parameters.
// TODO:
// 1. add watcher to watch the change state of pservers.
type Etcd struct {
	// client is the underlying etcd v3 client; it is closed by Close.
	client *clientv3.Client
	// timeout bounds each etcd request and is also the pause between retries.
	timeout time.Duration
	// endpoints holds the etcd endpoints the client was created with.
	endpoints []string
	// lock is set by Select when this trainer is selected, and reset to nil
	// by Done or Close once it has been unlocked successfully.
	lock *concurrency.Mutex
}
 | |
| 
 | |
| // Desired read ps desired number from etcd.
 | |
| func (e *Etcd) Desired() int {
 | |
| 	var psDesired int
 | |
| 	for {
 | |
| 		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
 | |
| 		resp, err := e.client.Get(ctx, pserver.PsDesired)
 | |
| 		cancel()
 | |
| 		if err != nil {
 | |
| 			log.Error(
 | |
| 				"Get ps dresire number failed! reconnecting...",
 | |
| 				log.Ctx{"error": err},
 | |
| 			)
 | |
| 			time.Sleep(e.timeout)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		kvs := resp.Kvs
 | |
| 		if len(kvs) == 0 {
 | |
| 			log.Info("Waiting for ps desired registered ...")
 | |
| 			time.Sleep(e.timeout)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
 | |
| 		if err != nil {
 | |
| 			log.Error("atoi failed", log.Ctx{"error": err})
 | |
| 			time.Sleep(e.timeout)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		log.Debug("Got psDesired", log.Ctx{"psDesired": psDesired})
 | |
| 		break
 | |
| 	}
 | |
| 	return psDesired
 | |
| }
 | |
| 
 | |
| // List return the pserver list read from etcd.
 | |
| func (e *Etcd) List() []Server {
 | |
| 	psDesired := e.Desired()
 | |
| 
 | |
| 	servers := make([]Server, psDesired)
 | |
| 	for {
 | |
| 		for i := 0; i < psDesired; i++ {
 | |
| 			ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
 | |
| 			psKey := pserver.PsPath + strconv.Itoa(i)
 | |
| 			log.Debug("looking for pserver", log.Ctx{"ps key": psKey})
 | |
| 			resp, err := e.client.Get(ctx, psKey)
 | |
| 			cancel()
 | |
| 			if err != nil {
 | |
| 				log.Info(
 | |
| 					"Get psKey error",
 | |
| 					log.Ctx{"ps key": psKey, "error": err},
 | |
| 				)
 | |
| 				time.Sleep(e.timeout)
 | |
| 				continue
 | |
| 			}
 | |
| 			kvs := resp.Kvs
 | |
| 			if len(kvs) == 0 {
 | |
| 				log.Info("Waiting for ps addr registered ...")
 | |
| 				time.Sleep(e.timeout)
 | |
| 				continue
 | |
| 			}
 | |
| 
 | |
| 			psAddr := string(resp.Kvs[0].Value)
 | |
| 			// TODO(Longfei) check the ps address
 | |
| 			if psAddr == "" {
 | |
| 				log.Info(
 | |
| 					"Value under psKey is empty",
 | |
| 					log.Ctx{"psKey": psKey},
 | |
| 				)
 | |
| 				time.Sleep(e.timeout)
 | |
| 				continue
 | |
| 			}
 | |
| 			log.Debug(
 | |
| 				"got psAddr given psKey",
 | |
| 				log.Ctx{"psAddr": psAddr, "psKey": psKey},
 | |
| 			)
 | |
| 			servers[i].Index = i
 | |
| 			servers[i].Addr = psAddr
 | |
| 		}
 | |
| 		break
 | |
| 	}
 | |
| 	return servers
 | |
| }
 | |
| 
 | |
| // NewEtcd create a etcd client to return the state of pserver on etcd.
 | |
| func NewEtcd(endpoints string) *Etcd {
 | |
| 	ep := strings.Split(endpoints, ",")
 | |
| 	var cli *clientv3.Client
 | |
| 	var err error
 | |
| 	for {
 | |
| 		cli, err = clientv3.New(clientv3.Config{
 | |
| 			Endpoints:   ep,
 | |
| 			DialTimeout: defaultEtcdTimeout,
 | |
| 		})
 | |
| 		if err != nil {
 | |
| 			log.Error("Init etcd connection failed", log.Ctx{"error": err})
 | |
| 			time.Sleep(defaultEtcdTimeout)
 | |
| 			continue
 | |
| 		}
 | |
| 		break
 | |
| 	}
 | |
| 	log.Info("Connected to etcd endpoint", log.Ctx{"endpoint": endpoints})
 | |
| 	client := &Etcd{
 | |
| 		client:    cli,
 | |
| 		timeout:   defaultEtcdTimeout,
 | |
| 		endpoints: ep,
 | |
| 	}
 | |
| 	return client
 | |
| }
 | |
| 
 | |
| // Select indicates if the current trainer is selected to initialize
 | |
| // the pserver parameters.
 | |
| func (e *Etcd) Select() (bool, error) {
 | |
| 	sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(5))
 | |
| 	if err != nil {
 | |
| 		return false, err
 | |
| 	}
 | |
| 
 | |
| 	lock := concurrency.NewMutex(sess, initLockPath)
 | |
| 	log.Info("Trying to acquire lock", log.Ctx{"lock path": initLockPath})
 | |
| 	// Do not use timeout context here, since we don't know how
 | |
| 	// long does it take for other trainers to initialize the
 | |
| 	// parameters.
 | |
| 	err = lock.Lock(context.Background())
 | |
| 	if err != nil {
 | |
| 		return false, err
 | |
| 	}
 | |
| 	log.Info("Successfully acquired lock", log.Ctx{"lock path": initLockPath})
 | |
| 
 | |
| 	get := clientv3.OpGet(initDonePath)
 | |
| 	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
 | |
| 	tresp, err := e.client.Txn(ctx).If(lock.IsOwner()).Then(get).Commit()
 | |
| 	cancel()
 | |
| 	if err != nil {
 | |
| 		return false, err
 | |
| 	}
 | |
| 
 | |
| 	if !tresp.Succeeded {
 | |
| 		return false, errors.New("no longer the owner of the lock")
 | |
| 	}
 | |
| 
 | |
| 	resp := tresp.Responses[0].GetResponseRange()
 | |
| 
 | |
| 	if len(resp.Kvs) == 0 {
 | |
| 		// Key value not set, select current trainer.
 | |
| 		e.lock = lock
 | |
| 		log.Info("Trainer selected.")
 | |
| 		return true, nil
 | |
| 	}
 | |
| 
 | |
| 	if string(resp.Kvs[0].Value) == initDoneVal {
 | |
| 		log.Info("Initialization is already done.")
 | |
| 		ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
 | |
| 		err = lock.Unlock(ctx)
 | |
| 		cancel()
 | |
| 		if err != nil {
 | |
| 			log.Error("error unlocking", log.Ctx{"error": err})
 | |
| 		}
 | |
| 		return false, nil
 | |
| 	}
 | |
| 
 | |
| 	return false, fmt.Errorf("key %s have unexpected value: %v", initDonePath, resp.Kvs[0].Value)
 | |
| }
 | |
| 
 | |
| // Done indicates the parameter initialization process is done.
 | |
| func (e *Etcd) Done() error {
 | |
| 	if e.lock == nil {
 | |
| 		return errors.New("lock is nil, Done called unexpectedly")
 | |
| 	}
 | |
| 
 | |
| 	put := clientv3.OpPut(initDonePath, initDoneVal)
 | |
| 	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
 | |
| 	tresp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
 | |
| 	cancel()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	if !tresp.Succeeded {
 | |
| 		return errors.New("no longer the owner of the lock")
 | |
| 	}
 | |
| 
 | |
| 	ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
 | |
| 	err = e.lock.Unlock(ctx)
 | |
| 	cancel()
 | |
| 	if err != nil {
 | |
| 		log.Error("error unlocking", log.Ctx{"error": err})
 | |
| 	} else {
 | |
| 		e.lock = nil
 | |
| 	}
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // Close closes the etcd client.
 | |
| func (e *Etcd) Close() error {
 | |
| 	var err error
 | |
| 	if e.lock != nil {
 | |
| 		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
 | |
| 		err = e.lock.Unlock(ctx)
 | |
| 		cancel()
 | |
| 		if err == nil {
 | |
| 			e.lock = nil
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	cErr := e.client.Close()
 | |
| 	if cErr != nil {
 | |
| 		if err != nil {
 | |
| 			log.Error("error closing etcd client", log.Ctx{"error": cErr})
 | |
| 			return err
 | |
| 		}
 | |
| 		return cErr
 | |
| 	}
 | |
| 
 | |
| 	return err
 | |
| }
 |