Merge branch 'develop' of https://github.com/paddlepaddle/paddle into memory_cpu_allocator
commit
67481ca871
@ -1,45 +1,69 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"net"
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/rpc"
|
"net/rpc"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/namsral/flag"
|
"github.com/namsral/flag"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
"github.com/PaddlePaddle/Paddle/go/master"
|
"github.com/PaddlePaddle/Paddle/go/master"
|
||||||
|
"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
port := flag.Int("port", 8080, "port of the master server.")
|
port := flag.Int("port", 8080, "port of the master server.")
|
||||||
|
ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
|
||||||
faultTolerance := flag.Bool("fault_tolerance", false, "enable fault tolerance (requires etcd).")
|
endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
|
||||||
taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
|
taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
|
||||||
taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
|
taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
|
||||||
chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
|
chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
|
|
||||||
if *faultTolerance {
|
if *endpoints == "" {
|
||||||
panic("fault tolernance not implemented.")
|
log.Warningln("-endpoints not set, fault tolerance not be enabled.")
|
||||||
|
}
|
||||||
|
|
||||||
|
var store master.Store
|
||||||
|
if *endpoints != "" {
|
||||||
|
eps := strings.Split(*endpoints, ",")
|
||||||
|
ip, err := networkhelper.GetExternalIP()
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
addr := fmt.Sprintf("%s:%d", ip, *port)
|
||||||
|
store, err = master.NewEtcdClient(eps, addr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, *ttlSec)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
store = &master.InMemStore{}
|
||||||
|
}
|
||||||
|
|
||||||
|
s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
s := master.NewService(*chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
|
err = rpc.Register(s)
|
||||||
err := rpc.Register(s)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
rpc.HandleHTTP()
|
rpc.HandleHTTP()
|
||||||
l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
|
l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = http.Serve(l, nil)
|
err = http.Serve(l, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,144 @@
|
|||||||
|
package master
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/coreos/etcd/clientv3"
|
||||||
|
"github.com/coreos/etcd/clientv3/concurrency"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// DefaultLockPath is the default etcd master lock path.
|
||||||
|
DefaultLockPath = "/master/lock"
|
||||||
|
// DefaultStatePath is the default etcd key for master state.
|
||||||
|
DefaultStatePath = "/master/state"
|
||||||
|
// DefaultAddrPath is the default etcd key for master address.
|
||||||
|
DefaultAddrPath = "/master/addr"
|
||||||
|
)
|
||||||
|
|
||||||
|
// EtcdClient is the etcd client that master uses for fault tolerance
|
||||||
|
// and service registry.
|
||||||
|
type EtcdClient struct {
|
||||||
|
lockPath string
|
||||||
|
statePath string
|
||||||
|
client *clientv3.Client
|
||||||
|
lock *concurrency.Mutex
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewEtcdClient creates a new EtcdClient.
|
||||||
|
func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
|
||||||
|
log.Debugf("Connecting to etcd at %v", endpoints)
|
||||||
|
// TODO(helin): gracefully shutdown etcd store. Because etcd
|
||||||
|
// store holds an etcd lock, even though the lock will expire
|
||||||
|
// when the lease timeout, we need to implement graceful
|
||||||
|
// shutdown to release the lock.
|
||||||
|
cli, err := clientv3.New(clientv3.Config{
|
||||||
|
Endpoints: endpoints,
|
||||||
|
DialTimeout: dialTimeout,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
sess, err := concurrency.NewSession(cli, concurrency.WithTTL(ttlSec))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
lock := concurrency.NewMutex(sess, lockPath)
|
||||||
|
// It's fine for the lock to get stuck, in this case we have
|
||||||
|
// multiple master servers running (only configured to have
|
||||||
|
// one master running, but split-brain problem may cause
|
||||||
|
// multiple master servers running), and the cluster management
|
||||||
|
// software will kill one of them.
|
||||||
|
log.Debugf("Trying to acquire lock at %s.", lockPath)
|
||||||
|
err = lock.Lock(context.TODO())
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
log.Debugf("Successfully acquired lock at %s.", lockPath)
|
||||||
|
|
||||||
|
put := clientv3.OpPut(addrPath, string(addr))
|
||||||
|
resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if !resp.Succeeded {
|
||||||
|
log.Fatal("No longer owns the master lock. Exiting.")
|
||||||
|
}
|
||||||
|
|
||||||
|
e := &EtcdClient{
|
||||||
|
lockPath: lockPath,
|
||||||
|
statePath: statePath,
|
||||||
|
client: cli,
|
||||||
|
lock: lock,
|
||||||
|
}
|
||||||
|
|
||||||
|
return e, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save saves the state into the etcd.
|
||||||
|
func (e *EtcdClient) Save(state []byte) error {
|
||||||
|
ctx := context.TODO()
|
||||||
|
put := clientv3.OpPut(e.statePath, string(state))
|
||||||
|
resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if !resp.Succeeded {
|
||||||
|
log.Errorln("No longer owns the lock, trying to lock again")
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
err := e.lock.Lock(ctx)
|
||||||
|
cancel()
|
||||||
|
if err != nil {
|
||||||
|
// We lost the master lock and can not acquire
|
||||||
|
// it back, it means some other master is
|
||||||
|
// already started. We don't want cluster
|
||||||
|
// management system to kill the master server
|
||||||
|
// who is holding the lock and running
|
||||||
|
// correctly. So the most feasible solution is
|
||||||
|
// to kill current master server. The current
|
||||||
|
// state is not saved, but the trainer's RPC
|
||||||
|
// call will fail, so the trainer will retry.
|
||||||
|
log.Fatalf("Could not acquire the lock at %s: %v. Exiting.", e.lockPath, err)
|
||||||
|
}
|
||||||
|
log.Infof("Successfully acquired lock at %s.", e.lockPath)
|
||||||
|
return e.Save(state)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load loads the state from etcd.
|
||||||
|
func (e *EtcdClient) Load() ([]byte, error) {
|
||||||
|
ctx := context.TODO()
|
||||||
|
get := clientv3.OpGet(e.statePath)
|
||||||
|
|
||||||
|
resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(get).Commit()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if !resp.Succeeded {
|
||||||
|
log.Errorln("No longer owns the lock, trying to lock and load again.")
|
||||||
|
err = e.lock.Lock(context.Background())
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return e.Load()
|
||||||
|
}
|
||||||
|
|
||||||
|
kvs := resp.Responses[0].GetResponseRange().Kvs
|
||||||
|
if len(kvs) == 0 {
|
||||||
|
// No state exists
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
state := kvs[0].Value
|
||||||
|
return state, nil
|
||||||
|
}
|
@ -0,0 +1,28 @@
|
|||||||
|
package master
|
||||||
|
|
||||||
|
import "sync"
|
||||||
|
|
||||||
|
// InMemStore is an in memory implementation of Store interface.
|
||||||
|
//
|
||||||
|
// It does not tolerate faults that cause the program to crash.
|
||||||
|
type InMemStore struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
buf []byte
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save saves the state into the in-memory store.
|
||||||
|
func (m *InMemStore) Save(state []byte) error {
|
||||||
|
m.mu.Lock()
|
||||||
|
defer m.mu.Unlock()
|
||||||
|
|
||||||
|
m.buf = state
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load loads the state from the in-memory store.
|
||||||
|
func (m *InMemStore) Load() ([]byte, error) {
|
||||||
|
m.mu.Lock()
|
||||||
|
defer m.mu.Unlock()
|
||||||
|
|
||||||
|
return m.buf, nil
|
||||||
|
}
|
@ -0,0 +1,45 @@
|
|||||||
|
package networkhelper
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"net"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GetExternalIP returns the IPv4 address of a local network interface, not the
|
||||||
|
// loopback device.
|
||||||
|
func GetExternalIP() (string, error) {
|
||||||
|
ifaces, err := net.Interfaces()
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
for _, iface := range ifaces {
|
||||||
|
if iface.Flags&net.FlagUp == 0 {
|
||||||
|
continue // interface down
|
||||||
|
}
|
||||||
|
if iface.Flags&net.FlagLoopback != 0 {
|
||||||
|
continue // loopback interface
|
||||||
|
}
|
||||||
|
addrs, err := iface.Addrs()
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
for _, addr := range addrs {
|
||||||
|
var ip net.IP
|
||||||
|
switch v := addr.(type) {
|
||||||
|
case *net.IPNet:
|
||||||
|
ip = v.IP
|
||||||
|
case *net.IPAddr:
|
||||||
|
ip = v.IP
|
||||||
|
}
|
||||||
|
if ip == nil || ip.IsLoopback() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ip = ip.To4()
|
||||||
|
if ip == nil {
|
||||||
|
continue // not an ipv4 address
|
||||||
|
}
|
||||||
|
return ip.String(), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "", errors.New("are you connected to the network?")
|
||||||
|
}
|
@ -0,0 +1,10 @@
|
|||||||
|
package networkhelper
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestGetIP(t *testing.T) {
|
||||||
|
_, err := GetExternalIP()
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("GetExternalIP returns error : %v\n", err)
|
||||||
|
}
|
||||||
|
}
|
File diff suppressed because it is too large
Load Diff
@ -1,12 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
source ./common.sh
|
|
||||||
|
|
||||||
NPROC=1
|
|
||||||
export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
|
|
||||||
export PYTHONHOME=/opt/python/2.7.12
|
|
||||||
export PATH=/opt/python/2.7.12/bin:${PATH}
|
|
||||||
cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DWITH_COVERAGE=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
|
|
||||||
NRPOC=`nproc`
|
|
||||||
make -j $NPROC
|
|
||||||
make coveralls
|
|
||||||
sudo make install
|
|
@ -1,15 +1,18 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# Create the build directory for CMake.
|
||||||
|
mkdir -p $TRAVIS_BUILD_DIR/build
|
||||||
|
cd $TRAVIS_BUILD_DIR/build
|
||||||
|
|
||||||
# Add set -e, cd to directory.
|
|
||||||
source ./common.sh
|
|
||||||
# Compile Documentation only.
|
# Compile Documentation only.
|
||||||
cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF ${EXTRA_CMAKE_OPTS}
|
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF
|
||||||
mkdir output
|
mkdir output
|
||||||
make -j `nproc`
|
make -j `nproc`
|
||||||
find .. -name '*whl' | xargs pip install # install all wheels.
|
find .. -name '*whl' | xargs pip install # install all wheels.
|
||||||
rm -rf *
|
rm -rf *
|
||||||
cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
|
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON
|
||||||
make paddle_docs paddle_docs_cn
|
make -j `nproc` paddle_docs paddle_docs_cn
|
||||||
|
|
||||||
# check websites for broken links
|
# check websites for broken links
|
||||||
linkchecker doc/en/html/index.html
|
linkchecker doc/en/html/index.html
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue