commit
e4c7d8cc2b
@ -1,45 +1,69 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/rpc"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/namsral/flag"
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"github.com/PaddlePaddle/Paddle/go/master"
|
||||
"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
|
||||
)
|
||||
|
||||
func main() {
|
||||
port := flag.Int("port", 8080, "port of the master server.")
|
||||
|
||||
faultTolerance := flag.Bool("fault_tolerance", false, "enable fault tolerance (requires etcd).")
|
||||
ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
|
||||
endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
|
||||
taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
|
||||
taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
|
||||
chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
|
||||
flag.Parse()
|
||||
|
||||
if *faultTolerance {
|
||||
panic("fault tolernance not implemented.")
|
||||
if *endpoints == "" {
|
||||
log.Warningln("-endpoints not set, fault tolerance not be enabled.")
|
||||
}
|
||||
|
||||
var store master.Store
|
||||
if *endpoints != "" {
|
||||
eps := strings.Split(*endpoints, ",")
|
||||
ip, err := networkhelper.GetExternalIP()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
addr := fmt.Sprintf("%s:%d", ip, *port)
|
||||
store, err = master.NewEtcdClient(eps, addr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, *ttlSec)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
} else {
|
||||
store = &master.InMemStore{}
|
||||
}
|
||||
|
||||
s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
s := master.NewService(*chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
|
||||
err := rpc.Register(s)
|
||||
err = rpc.Register(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
rpc.HandleHTTP()
|
||||
l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
err = http.Serve(l, nil)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,144 @@
|
||||
package master
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/coreos/etcd/clientv3"
|
||||
"github.com/coreos/etcd/clientv3/concurrency"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
const (
|
||||
// DefaultLockPath is the default etcd master lock path.
|
||||
DefaultLockPath = "/master/lock"
|
||||
// DefaultStatePath is the default etcd key for master state.
|
||||
DefaultStatePath = "/master/state"
|
||||
// DefaultAddrPath is the default etcd key for master address.
|
||||
DefaultAddrPath = "/master/addr"
|
||||
)
|
||||
|
||||
// EtcdClient is the etcd client that master uses for fault tolerance
|
||||
// and service registry.
|
||||
type EtcdClient struct {
|
||||
lockPath string
|
||||
statePath string
|
||||
client *clientv3.Client
|
||||
lock *concurrency.Mutex
|
||||
}
|
||||
|
||||
// NewEtcdClient creates a new EtcdClient.
|
||||
func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
|
||||
log.Debugf("Connecting to etcd at %v", endpoints)
|
||||
// TODO(helin): gracefully shutdown etcd store. Becuase etcd
|
||||
// store holds a etcd lock, even though the lock will expire
|
||||
// when the lease timeout, we need to implement graceful
|
||||
// shutdown to release the lock.
|
||||
cli, err := clientv3.New(clientv3.Config{
|
||||
Endpoints: endpoints,
|
||||
DialTimeout: dialTimeout,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
sess, err := concurrency.NewSession(cli, concurrency.WithTTL(ttlSec))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
lock := concurrency.NewMutex(sess, lockPath)
|
||||
// It's fine for the lock to get stuck, in this case we have
|
||||
// multiple master servers running (only configured to have
|
||||
// one master running, but split-brain problem may cuase
|
||||
// multiple master servers running), and the cluster management
|
||||
// software will kill one of them.
|
||||
log.Debugf("Trying to acquire lock at %s.", lockPath)
|
||||
err = lock.Lock(context.TODO())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
log.Debugf("Successfully acquired lock at %s.", lockPath)
|
||||
|
||||
put := clientv3.OpPut(addrPath, string(addr))
|
||||
resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if !resp.Succeeded {
|
||||
log.Fatal("No longer owns the master lock. Exiting.")
|
||||
}
|
||||
|
||||
e := &EtcdClient{
|
||||
lockPath: lockPath,
|
||||
statePath: statePath,
|
||||
client: cli,
|
||||
lock: lock,
|
||||
}
|
||||
|
||||
return e, nil
|
||||
}
|
||||
|
||||
// Save saves the state into the etcd.
|
||||
func (e *EtcdClient) Save(state []byte) error {
|
||||
ctx := context.TODO()
|
||||
put := clientv3.OpPut(e.statePath, string(state))
|
||||
resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if !resp.Succeeded {
|
||||
log.Errorln("No longer owns the lock, trying to lock again")
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
err := e.lock.Lock(ctx)
|
||||
cancel()
|
||||
if err != nil {
|
||||
// We lost the master lock and can not acquire
|
||||
// it back, it means some other master is
|
||||
// already started. We don't want cluster
|
||||
// managment system to kill the master server
|
||||
// who is holding the lock and running
|
||||
// correctly. So the most feasible solution is
|
||||
// to kill current master server. The current
|
||||
// state is not saved, but the trainer's RPC
|
||||
// call will fail, so the trainer will retry.
|
||||
log.Fatalf("Could not acquire the lock at %s: %v. Exiting.", e.lockPath, err)
|
||||
}
|
||||
log.Infof("Successfully acquired lock at %s.", e.lockPath)
|
||||
return e.Save(state)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Load loads the state from etcd.
|
||||
func (e *EtcdClient) Load() ([]byte, error) {
|
||||
ctx := context.TODO()
|
||||
get := clientv3.OpGet(e.statePath)
|
||||
|
||||
resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(get).Commit()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if !resp.Succeeded {
|
||||
log.Errorln("No longer owns the lock, trying to lock and load again.")
|
||||
err = e.lock.Lock(context.Background())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return e.Load()
|
||||
}
|
||||
|
||||
kvs := resp.Responses[0].GetResponseRange().Kvs
|
||||
if len(kvs) == 0 {
|
||||
// No state exists
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
state := kvs[0].Value
|
||||
return state, nil
|
||||
}
|
@ -0,0 +1,28 @@
|
||||
package master
|
||||
|
||||
import "sync"
|
||||
|
||||
// InMemStore is an in memory implementation of Store interface.
|
||||
//
|
||||
// It does not tolerate the fault that casues the program to crash.
|
||||
type InMemStore struct {
|
||||
mu sync.Mutex
|
||||
buf []byte
|
||||
}
|
||||
|
||||
// Save saves the state into the in-memory store.
|
||||
func (m *InMemStore) Save(state []byte) error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
m.buf = state
|
||||
return nil
|
||||
}
|
||||
|
||||
// Load loads the state from the in-memory store.
|
||||
func (m *InMemStore) Load() ([]byte, error) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
return m.buf, nil
|
||||
}
|
@ -0,0 +1,45 @@
|
||||
package networkhelper
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"net"
|
||||
)
|
||||
|
||||
// GetExternalIP returns the ip address of local network interface, not the
|
||||
// loopback device.
|
||||
func GetExternalIP() (string, error) {
|
||||
ifaces, err := net.Interfaces()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
for _, iface := range ifaces {
|
||||
if iface.Flags&net.FlagUp == 0 {
|
||||
continue // interface down
|
||||
}
|
||||
if iface.Flags&net.FlagLoopback != 0 {
|
||||
continue // loopback interface
|
||||
}
|
||||
addrs, err := iface.Addrs()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
for _, addr := range addrs {
|
||||
var ip net.IP
|
||||
switch v := addr.(type) {
|
||||
case *net.IPNet:
|
||||
ip = v.IP
|
||||
case *net.IPAddr:
|
||||
ip = v.IP
|
||||
}
|
||||
if ip == nil || ip.IsLoopback() {
|
||||
continue
|
||||
}
|
||||
ip = ip.To4()
|
||||
if ip == nil {
|
||||
continue // not an ipv4 address
|
||||
}
|
||||
return ip.String(), nil
|
||||
}
|
||||
}
|
||||
return "", errors.New("are you connected to the network?")
|
||||
}
|
@ -0,0 +1,10 @@
|
||||
package networkhelper
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestGetIP(t *testing.T) {
|
||||
_, err := GetExternalIP()
|
||||
if err != nil {
|
||||
t.Errorf("GetExternalIP returns error : %v\n", err)
|
||||
}
|
||||
}
|
@ -0,0 +1,68 @@
|
||||
/*
|
||||
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <typeindex>
|
||||
#include <typeinfo>
|
||||
|
||||
#include "paddle/platform/assert.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace framework {
|
||||
|
||||
class Variable {
|
||||
public:
|
||||
template <typename T>
|
||||
const T& Get() const {
|
||||
PADDLE_ASSERT(holder_ != nullptr);
|
||||
PADDLE_ASSERT(std::type_index(typeid(T)) ==
|
||||
std::type_index(holder_->Type()));
|
||||
return *static_cast<const T*>(holder_->Ptr());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T* GetMutable() {
|
||||
if (holder_ == nullptr ||
|
||||
std::type_index(typeid(T)) != std::type_index(holder_->Type())) {
|
||||
holder_.reset(new PlaceholderImpl<T>(new T()));
|
||||
}
|
||||
return static_cast<T*>(holder_->Ptr());
|
||||
}
|
||||
|
||||
private:
|
||||
struct Placeholder {
|
||||
virtual ~Placeholder() {}
|
||||
virtual const std::type_info& Type() const = 0;
|
||||
virtual void* Ptr() const = 0;
|
||||
};
|
||||
|
||||
// Placeholder hides type T, so it doesn't appear as a template
|
||||
// parameter of Variable.
|
||||
template <typename T>
|
||||
struct PlaceholderImpl : public Placeholder {
|
||||
PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {}
|
||||
|
||||
virtual const std::type_info& Type() const { return type_; }
|
||||
virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
|
||||
|
||||
std::unique_ptr<T> ptr_;
|
||||
const std::type_info& type_;
|
||||
};
|
||||
|
||||
std::unique_ptr<Placeholder>
|
||||
holder_; // pointers to a PlaceholderImpl object indeed.
|
||||
};
|
||||
|
||||
} // namespace framework
|
||||
} // namespace paddle
|
@ -0,0 +1,52 @@
|
||||
# Design Doc: Variable
|
||||
|
||||
|
||||
Variable is also known as *blob* in MxNet and Caffe2. It is the input and output type of operators, where a neural network is a graph of operators.
|
||||
|
||||
## Requirements: Lazy Memory Allocation
|
||||
|
||||
For the flexibility of a DL system, a variable should be able to contain any typed value -- a tensor in most cases, but could also be some integer IDs or a scope of other variables in the case of RNN.
|
||||
|
||||
To use the minimum amount of memory, we'd like that a variable to allocate memory when it has to, or, lazy memory allocation. Let's take the following example:
|
||||
|
||||
```cpp
|
||||
Variable vr, v1, v2;
|
||||
|
||||
Tensor* t1 = new Tensor();
|
||||
Tensor* t2 = new Tensor();
|
||||
|
||||
Randomize(
|
||||
/* malloc */ v1.GetMutable<Tensor>().mutable_data<float16>(DDim(100,200)),
|
||||
/* size */ t1.Size());
|
||||
|
||||
Randomize(
|
||||
/* malloc */ v2.GetMutable<Tensor>().mutable_data<float16>(DDim(200,300)),
|
||||
/* size */ t2.Size());
|
||||
|
||||
Mult(
|
||||
/*result*/ vr.GetMutable<Tensor>().mutable_data<v1.Type()>(SizeOfMult(v1, v2)),
|
||||
/*input1*/ v1.Get<Tensor>().data(),
|
||||
/*input2*/ v2.Get<Tensor>().data());
|
||||
```
|
||||
|
||||
We see that a variable holds nothing until `Variable::GetMutable<Tensor>()` allocates a tensor and puts it in the variable. Similarly, a tensor gets its memory until `Tensor::mutable_data()`.
|
||||
|
||||
This syntax for lazy memory allocation when we call `Randomize` and `Mult`, those functions that mutate the variable, so it saves us some line of C++ code.
|
||||
|
||||
|
||||
## Implementation: Type Hiding
|
||||
|
||||
To make memory allocation lazy, we cannot assume that we know the type held by a variable at definition time. In other words, `class Variable` cannot be a template `template <T> class Variable`.
|
||||
|
||||
Because we don't know the type `T`, we cannot save a `T*` as `Variable's` data member. Instead, we save an interface object `Placeholder`, who can return the pointer to the saved object via `Placeholder::Ptr()` as `void*`.
|
||||
|
||||
But anyway, Variable needs to know `T` so could it `delete<T>(ptr)` and so could `Variable::Get` checks the expected type and the saved object's type.
|
||||
|
||||
We save `T` in `PlaceholderImpl`, the implementation of `Placeholder`. Please be aware that `PlaceholderImpl` is a class template and `T` is passed in as a template parameter.
|
||||
|
||||
Because `PlaceholderImpl` knows `T`, it can save and return `typeid(T)` for the type comparison in `Variable::Get` and `Variable::GetMutable`.
|
||||
|
||||
|
||||
## Conclusion
|
||||
|
||||
The technique type hiding utilizes C++ class templates, interface and derivation, and C++ RTTI (typeid). This combination saves us from definition something like `caffe2::TypeMata`, which takes hundreds of lines of C++ code.
|
@ -0,0 +1,40 @@
|
||||
/*
|
||||
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "paddle/framework/variable.h"
|
||||
|
||||
TEST(Variable, GetMutable) {
|
||||
using paddle::framework::Variable;
|
||||
|
||||
struct Tensor {
|
||||
int content_;
|
||||
};
|
||||
|
||||
std::unique_ptr<Variable> v(new Variable());
|
||||
|
||||
Tensor* t = v->GetMutable<Tensor>();
|
||||
t->content_ = 1234;
|
||||
|
||||
const Tensor& tt = v->Get<Tensor>();
|
||||
EXPECT_EQ(1234, tt.content_);
|
||||
|
||||
std::string* s = v->GetMutable<std::string>();
|
||||
*s = "hello";
|
||||
|
||||
const std::string& ss = v->Get<std::string>();
|
||||
EXPECT_EQ("hello", ss);
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue