Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into taskfail
commit
7663a40c88
@ -1,28 +1,48 @@
|
|||||||
| Github account | name |
|
| Github account | name |
|
||||||
|---|---|
|
|---|---|
|
||||||
| reyoung | Yang Yu |
|
| backyes | Yan-Fei Wang |
|
||||||
|
| beckett1124 | Bin Qi |
|
||||||
|
| Canpio | Jia-Yi Feng |
|
||||||
|
| chengxiaohua1105 | Xiao-Hua Cheng |
|
||||||
|
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
|
||||||
|
| cxysteven | Xing-Yi Cheng |
|
||||||
|
| dzhwinter | Zhi-Hong Dong |
|
||||||
|
| emailweixu | Wei Xu |
|
||||||
| gangliao | Gang Liao |
|
| gangliao | Gang Liao |
|
||||||
| luotao01 | Tao Luo |
|
| gongweibao | Wei-Bao Gong |
|
||||||
| jacquesqiao | Long-Fei Qiao |
|
| Guo Sheng | Sheng Guo |
|
||||||
| qingqing01 | Qing-Qing Dang |
|
| Haichao-Zhang | Hai-Chao Zhang |
|
||||||
| hedaoyuan | Dao-Yuan He |
|
| hedaoyuan | Dao-Yuan He |
|
||||||
| wangyang59 | Yang Wang |
|
| helinwang | He-Lin Wang |
|
||||||
|
| jacquesqiao | Long-Fei Qiao |
|
||||||
|
| kuke | Yi-Bing Liu |
|
||||||
|
| lcy-seso | Ying Cao |
|
||||||
|
| lipeng-unisound | Peng Li |
|
||||||
|
| liuyuan | Yuan Liu |
|
||||||
|
| livc | Zhao Li |
|
||||||
|
| llxxxll | Yong-Feng Liu |
|
||||||
|
| luotao01 | Tao Luo |
|
||||||
|
| lzhao4ever | Liang Zhao |
|
||||||
|
| NHZlX | Zhao-Long Xing |
|
||||||
|
| pakchoi | Chuan-Jiang Song |
|
||||||
|
| pengli09 | Peng Li |
|
||||||
|
| pkuyym | Ya-Ming Yang |
|
||||||
| QiJune | Jun Qi |
|
| QiJune | Jun Qi |
|
||||||
|
| qingqing01 | Qing-Qing Dang |
|
||||||
|
| reyoung | Yang Yu |
|
||||||
|
| Superjom | Chun-Wei Yan |
|
||||||
| tianbingsz | Tian-Bing Xu |
|
| tianbingsz | Tian-Bing Xu |
|
||||||
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
|
|
||||||
| typhoonzero | Yi Wu |
|
| typhoonzero | Yi Wu |
|
||||||
| backyes | Yan-Fei Wang |
|
| wanghaoshuang | Hao-Shuang Wang |
|
||||||
| pengli09 | Peng Li |
|
| wangyang59 | Yang Wang |
|
||||||
| livc | Zhao Li |
|
| wangzhen-nlp | Zhen Wang |
|
||||||
|
| wen-bo-yang | Wen-Bo Yang |
|
||||||
|
| wwhu | Wei-Wei Hu |
|
||||||
|
| xinghai-sun | Xing-Hai Sun |
|
||||||
| Xreki | Yi-Qun Liu |
|
| Xreki | Yi-Qun Liu |
|
||||||
|
| xujun05 | Jun Xu |
|
||||||
|
| xushaoyong | Shao-Yong Xu |
|
||||||
| Yancey1989 | Xu Yan |
|
| Yancey1989 | Xu Yan |
|
||||||
| emailweixu | Wei Xu |
|
| zhaopu7 | Pu Zhao |
|
||||||
| wen-bo-yang | Wen-Bo Yang |
|
|
||||||
| helinwang | He-Lin Wang |
|
|
||||||
| lcy-seso | Ying Cao |
|
|
||||||
| Zrachel | Rui-Qing Zhang |
|
|
||||||
| Haichao-Zhang | Hai-Chao Zhang |
|
|
||||||
| gongweibao | Wei-Bao Gong |
|
|
||||||
| lzhao4ever | Liang Zhao |
|
|
||||||
| zhouxiao-coder | Xiao Zhou |
|
| zhouxiao-coder | Xiao Zhou |
|
||||||
| lipeng-unisound | Peng Li |
|
| Zrachel | Rui-Qing Zhang |
|
||||||
|
@ -1 +1 @@
|
|||||||
go_library(paddle_master SHARED)
|
go_library(paddle_master SHARED DEPS paddle_go_optimizer)
|
||||||
|
@ -1,5 +0,0 @@
|
|||||||
cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
|
|
||||||
go_library(paddle_pserver_cclient STATIC)
|
|
||||||
if(WITH_TESTING)
|
|
||||||
add_subdirectory(test)
|
|
||||||
endif()
|
|
@ -0,0 +1,6 @@
|
|||||||
|
cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
|
||||||
|
go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
|
||||||
|
if(WITH_TESTING)
|
||||||
|
# TODO: add unit test
|
||||||
|
#add_subdirectory(test)
|
||||||
|
endif()
|
@ -1,2 +1,2 @@
|
|||||||
cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient)
|
cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
|
||||||
add_style_check_target(test_cclient test_cclient.c)
|
add_style_check_target(test_cclient test_cclient.c)
|
@ -0,0 +1,125 @@
|
|||||||
|
package client
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/PaddlePaddle/Paddle/go/pserver"
|
||||||
|
"github.com/coreos/etcd/clientv3"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
|
// DefaultEtcdTimeout is the timeout applied to etcd dials and requests made
// by this package; 5*time.Second is already a time.Duration, so no explicit
// type is needed.
const DefaultEtcdTimeout = 5 * time.Second
|
||||||
|
|
||||||
|
// EtcdClient is used by pserver client that is a part of trainer process.
|
||||||
|
// TODO:
|
||||||
|
// 1. add watcher to watch the change state of pservers)
|
||||||
|
// 1. add etcd lock)
|
||||||
|
type EtcdClient struct {
|
||||||
|
client *clientv3.Client
|
||||||
|
timeout time.Duration
|
||||||
|
endpoints []string
|
||||||
|
}
|
||||||
|
|
||||||
|
// Desired read ps desired number from etcd.
|
||||||
|
func (p *EtcdClient) Desired() int {
|
||||||
|
var psDesired int
|
||||||
|
for {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
|
||||||
|
resp, err := p.client.Get(ctx, pserver.PsDesired)
|
||||||
|
cancel()
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Get ps dresire number failed! recnnectiong..., %v", err)
|
||||||
|
time.Sleep(p.timeout)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
kvs := resp.Kvs
|
||||||
|
if len(kvs) == 0 {
|
||||||
|
log.Infoln("Waiting for ps desired registered ...")
|
||||||
|
time.Sleep(p.timeout)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("psDesired %s invalid %v", psDesired, err)
|
||||||
|
time.Sleep(p.timeout)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Debugf("Get psDesired number: %d", psDesired)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
return psDesired
|
||||||
|
}
|
||||||
|
|
||||||
|
// List return the pserver list read from etcd.
|
||||||
|
func (p *EtcdClient) List() []Server {
|
||||||
|
psDesired := p.Desired()
|
||||||
|
|
||||||
|
servers := make([]Server, psDesired)
|
||||||
|
for {
|
||||||
|
for i := 0; i < psDesired; i++ {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
|
||||||
|
cancel()
|
||||||
|
psKey := pserver.PsPath + strconv.Itoa(i)
|
||||||
|
log.Debugf("checking %s", psKey)
|
||||||
|
resp, err := p.client.Get(ctx, psKey)
|
||||||
|
if err != nil {
|
||||||
|
log.Infof("Get psKey= %s error, %v", psKey, err)
|
||||||
|
time.Sleep(p.timeout)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
kvs := resp.Kvs
|
||||||
|
if len(kvs) == 0 {
|
||||||
|
log.Infof("Waiting for ps addr registered ...")
|
||||||
|
time.Sleep(p.timeout)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
psAddr := string(resp.Kvs[0].Value)
|
||||||
|
// TODO(Longfei) check the ps address
|
||||||
|
if psAddr == "" {
|
||||||
|
log.Infof("Get psKey = %s, psAddr is empty", psKey)
|
||||||
|
time.Sleep(p.timeout)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
log.Infof("got value (%s) for key: %s", psAddr, psKey)
|
||||||
|
servers[i].Index = i
|
||||||
|
servers[i].Addr = psAddr
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
return servers
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewEtcd create a etcd client to return the state of pserver on etcd.
|
||||||
|
func NewEtcd(endpoints string) *EtcdClient {
|
||||||
|
ep := strings.Split(endpoints, ",")
|
||||||
|
var cli *clientv3.Client
|
||||||
|
var err error
|
||||||
|
for {
|
||||||
|
cli, err = clientv3.New(clientv3.Config{
|
||||||
|
Endpoints: ep,
|
||||||
|
DialTimeout: DefaultEtcdTimeout,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Init etcd connection failed: %v", err)
|
||||||
|
time.Sleep(DefaultEtcdTimeout)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
log.Infof("Connected to etcd: %s\n", endpoints)
|
||||||
|
client := &EtcdClient{
|
||||||
|
client: cli,
|
||||||
|
timeout: DefaultEtcdTimeout,
|
||||||
|
endpoints: ep,
|
||||||
|
}
|
||||||
|
return client
|
||||||
|
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue