Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into scope-impl
commit
cfdfa89bdd
@ -0,0 +1,20 @@
|
||||
# Fetch the Eigen 3.3.4 sources as an external project named `eigen3`.
include(ExternalProject)

# Everything belonging to this external project lives under this prefix.
set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)

# Expose the unpacked sources to the compiler.
# NOTE(review): assumes ExternalProject's default source dir
# (<PREFIX>/src/<name>) -- confirm if SOURCE_DIR is ever overridden.
include_directories(${EIGEN_SOURCE_DIR}/src/eigen3)

# Only the download step does real work: the update, configure, build,
# install and test commands are all set to the empty string.
ExternalProject_Add(
    eigen3
    ${EXTERNAL_PROJECT_LOG_ARGS}
    URL               "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz"
    URL_MD5           "1a47e78efe365a97de0c022d127607c3"
    PREFIX            ${EIGEN_SOURCE_DIR}
    UPDATE_COMMAND    ""
    CONFIGURE_COMMAND ""
    BUILD_COMMAND     ""
    INSTALL_COMMAND   ""
    TEST_COMMAND      ""
)

# Record eigen3 in the list of external project dependencies.
list(APPEND external_project_dependencies eigen3)
|
@ -0,0 +1,181 @@
|
||||
package pserver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
|
||||
"github.com/coreos/etcd/clientv3"
|
||||
"github.com/coreos/etcd/clientv3/concurrency"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// EtcdClient is the etcd client that the pserver uses for fault
|
||||
// tolerance, service registry and coordination.
|
||||
type EtcdClient struct {
|
||||
numPservers int
|
||||
etcdEndpoints string
|
||||
etcdClient *clientv3.Client
|
||||
// etcdTimeout is also used as retry intervals.
|
||||
etcdTimeout time.Duration
|
||||
// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
|
||||
externalIP string
|
||||
// desired number of pservers in the job.
|
||||
// assume desired will not change during one training job.
|
||||
desired int
|
||||
}
|
||||
|
||||
// NewEtcdClient creates an EtcdClient
|
||||
func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *EtcdClient {
|
||||
return &EtcdClient{
|
||||
etcdTimeout: timeout,
|
||||
numPservers: numPservers,
|
||||
etcdEndpoints: endpoints,
|
||||
}
|
||||
}
|
||||
|
||||
// Register registers the pserver on etcd
|
||||
//
|
||||
// Register returns the index of the current pserver.
|
||||
func (e *EtcdClient) Register() (int, error) {
|
||||
|
||||
var err error
|
||||
e.externalIP, err = networkhelper.GetExternalIP()
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// initialize connection to etcd.
|
||||
ep := strings.Split(e.etcdEndpoints, ",")
|
||||
for {
|
||||
cli, err := clientv3.New(clientv3.Config{
|
||||
Endpoints: ep,
|
||||
DialTimeout: e.etcdTimeout,
|
||||
})
|
||||
if err != nil {
|
||||
log.Errorf("connect to etcd error: %v", err)
|
||||
time.Sleep(e.etcdTimeout)
|
||||
continue
|
||||
}
|
||||
e.etcdClient = cli
|
||||
log.Debugf("inited client to %s", e.etcdEndpoints)
|
||||
break
|
||||
}
|
||||
// init /ps_desired using transaction, for multiple pservers may want to write
|
||||
// it at the same time.
|
||||
for {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
||||
_, err := e.initDesiredPsercers(ctx, e.numPservers)
|
||||
cancel()
|
||||
if err != nil {
|
||||
log.Warn(err)
|
||||
time.Sleep(e.etcdTimeout)
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
// TODO: when implementing extending or reducing pservers, /ps_desired is
|
||||
// changed, then we need to watch /ps_desired node for events. For now, just
|
||||
// write once when init and read from it.
|
||||
// wait and set s.desired init value
|
||||
for {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
||||
resp, err := e.etcdClient.Get(ctx, PsDesired)
|
||||
cancel()
|
||||
if err != nil {
|
||||
log.Errorf("getting %s error: %v", PsDesired, err)
|
||||
time.Sleep(e.etcdTimeout)
|
||||
continue
|
||||
}
|
||||
if len(resp.Kvs) != 0 {
|
||||
e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value))
|
||||
if err != nil {
|
||||
log.Errorf("value of %s invalid %v\n", PsDesired, err)
|
||||
time.Sleep(e.etcdTimeout)
|
||||
// NOTE: wait util ps_desired value change
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
var pserverIdx int
|
||||
// try register pserver node on etcd
|
||||
for {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
||||
var err error
|
||||
pserverIdx, err = e.registerPserverEtcd(ctx)
|
||||
cancel()
|
||||
if err != nil {
|
||||
log.Warn(err)
|
||||
time.Sleep(e.etcdTimeout)
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
|
||||
return pserverIdx, nil
|
||||
}
|
||||
|
||||
func (e *EtcdClient) initDesiredPsercers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
|
||||
return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
|
||||
dsStr := c.Get(PsDesired)
|
||||
if dsStr == "" {
|
||||
c.Put(PsDesired, strconv.Itoa(numPservers))
|
||||
}
|
||||
return nil
|
||||
}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
|
||||
}
|
||||
|
||||
// registerPserverEtcd registers pserver node on etcd using transaction.
|
||||
func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
|
||||
var idx int
|
||||
_, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
|
||||
registered := false
|
||||
for i := 0; i < e.desired; i++ {
|
||||
psKey := "/ps/" + strconv.Itoa(i)
|
||||
log.Debugf("checking %s", psKey)
|
||||
ps := c.Get(psKey)
|
||||
log.Debugf("got value (%s) for key: %s", ps, psKey)
|
||||
|
||||
if ps == "" {
|
||||
resp, err := e.etcdClient.Grant(context.TODO(), 5)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
// find the first id and write info
|
||||
c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID))
|
||||
log.Debugf("set pserver node %s with value %s", psKey, e.externalIP)
|
||||
ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID)
|
||||
if kaerr != nil {
|
||||
log.Errorf("keepalive etcd node error: %v", kaerr)
|
||||
return kaerr
|
||||
}
|
||||
|
||||
// Eat the keep alive message so etcd
|
||||
// will not expire the lease.
|
||||
go func(ch <-chan *clientv3.LeaseKeepAliveResponse) {
|
||||
ka := <-ch
|
||||
log.Debugf("keepalive: %d\n", ka.TTL)
|
||||
}(ch)
|
||||
log.Debug("register finished")
|
||||
idx = i
|
||||
registered = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if registered == true {
|
||||
return nil
|
||||
}
|
||||
return errors.New("not registerd, may due to already have enough pservers")
|
||||
}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
|
||||
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return idx, nil
|
||||
}
|
@ -1,9 +1,7 @@
|
||||
# ddim lib
# Builds the `ddim` library from ddim.cc; the two tests below link against it.
|
||||
cc_library(ddim SRCS ddim.cc)
|
||||
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
|
||||
|
||||
# dim_test is built from a .cu source through nv_test.
# NOTE(review): nv_test is presumably the CUDA test helper -- confirm.
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
|
||||
|
||||
# The remaining tests declare no DEPS.
cc_test(variable_test SRCS variable_test.cc)
|
||||
|
||||
cc_test(scope_test SRCS scope_test.cc)
|
||||
cc_test(enforce_test SRCS enforce_test.cc)
|
||||
|
@ -0,0 +1,69 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
#include <paddle/string/printf.h>
|
||||
#include <exception>
|
||||
#include <sstream>
|
||||
|
||||
namespace paddle {
|
||||
namespace framework {
|
||||
|
||||
/**
 * @brief Exception type thrown when an enforced condition is not met.
 *
 * Derives from std::exception. The text returned by what() is the caller's
 * message followed by the source location, formatted as
 * "<msg> at [<file>:<line>];".
 */
class EnforceNotMet : public std::exception {
 public:
  /**
   * @param msg       message describing the failed condition
   * @param file      name of the source file reporting the failure
   * @param fileline  line number inside that file
   */
  EnforceNotMet(const std::string& msg, const char* file, int fileline)
      : all_msg_(Compose(msg, file, fileline)) {}

  /// Returns the message together with the appended source location.
  const char* what() const noexcept override { return all_msg_.c_str(); }

 private:
  /// Builds the full text: message first, then " at [file:line];".
  static std::string Compose(const std::string& msg, const char* file,
                             int fileline) {
    std::ostringstream buffer;
    buffer << msg << " at [" << file << ":" << fileline << "];";
    return buffer.str();
  }

  std::string all_msg_;
};
|
||||
|
||||
// From https://stackoverflow.com/questions/30130930/
// __builtin_expect is a GCC/Clang compiler builtin; it is not part of the
// C++ standard. Since an enforced condition should be true in most
// situations, marking its failure with `UNLIKELY` lets the compiler generate
// faster code for the common path. On compilers without the builtin the
// macro falls back to a plain boolean conversion, so callers need no changes.
#if defined(__GNUC__) || defined(__clang__)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
#define UNLIKELY(condition) (static_cast<bool>(condition))
#endif
|
||||
|
||||
/**
 * @brief Throw an EnforceNotMet exception carrying the current __FILE__ and
 *        __LINE__.
 *
 * All macro arguments are handed to ::paddle::string::Sprintf and the result
 * becomes the exception message.
 * NOTE(review): presumably the first argument is a printf-style format
 * string, as in the callers' "%d" / "%f" usage -- confirm against Sprintf.
 */
#define PADDLE_THROW(...)                                            \
  do {                                                               \
    throw ::paddle::framework::EnforceNotMet(                        \
        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
  } while (0)
|
||||
|
||||
/**
 * @brief Check a condition and throw EnforceNotMet when it does not hold.
 *
 * The arguments after `condition` are forwarded unchanged to PADDLE_THROW.
 * The failing branch is wrapped in UNLIKELY because the condition is
 * expected to hold.
 */
#define PADDLE_ENFORCE(condition, ...) \
  do {                                 \
    if (UNLIKELY(!(condition))) {      \
      PADDLE_THROW(__VA_ARGS__);       \
    }                                  \
  } while (0)
|
||||
|
||||
} // namespace framework
|
||||
} // namespace paddle
|
@ -0,0 +1,35 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <paddle/framework/enforce.h>
|
||||
|
||||
// A condition that holds must not throw, with or without format arguments.
TEST(ENFORCE, OK) {
  // Literal condition, message with format arguments.
  PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345);

  // Expression condition, message without format arguments.
  const size_t limit = 10;
  size_t val = 1;
  PADDLE_ENFORCE(val < limit, "Enforce is OK too");
}
|
||||
|
||||
// A failing condition must throw EnforceNotMet, and what() must start with
// the formatted message (the source location is appended after it).
TEST(ENFORCE, FAILED) {
  bool in_catch = false;
  try {
    PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123);
  } catch (const paddle::framework::EnforceNotMet& err) {
    // Catch by const reference: no copy of the exception object.
    in_catch = true;
    const std::string msg = "Enforce is not ok 123 at all";
    // Compare only the prefix; the rest of what() is the file/line suffix.
    ASSERT_EQ(msg, std::string(err.what()).substr(0, msg.length()));
  }
  ASSERT_TRUE(in_catch);
}
|
@ -0,0 +1,5 @@
|
||||
# clang-format settings: C++ sources, Google base style, C++11 syntax.
---
|
||||
Language: Cpp
|
||||
BasedOnStyle: Google
|
||||
Standard: Cpp11
|
||||
...
|
@ -0,0 +1 @@
|
||||
# Descend into the `detail` subdirectory and process its CMakeLists.txt.
add_subdirectory(detail)
|
@ -0,0 +1,7 @@
|
||||
# Build the system allocator library and its test. The sources and DEPS are
# the same in both branches; only the build helper differs (nv_* when
# WITH_GPU is on, cc_* otherwise).
#
# The variable is tested by name: if(${WITH_GPU}) expands to an empty if()
# and fails to parse when WITH_GPU is unset or empty.
if(WITH_GPU)
  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
else()
  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
  cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
endif()
|
@ -0,0 +1,35 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "paddle/memory/detail/buddy_allocator.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace memory {
|
||||
namespace detail {
|
||||
|
||||
/**
 * @brief Construct a buddy allocator on top of a system allocator.
 *
 * @param pool_size         size of each pool; must be greater than zero
 * @param max_pools         upper bound on the number of pools; must be
 *                          greater than zero
 * @param system_allocator  allocator that pools are obtained from; must not
 *                          be null
 */
BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools,
                               SystemAllocator* system_allocator)
    : pool_size_(pool_size),
      // The member is declared as max_num_pools_ in the class definition.
      max_num_pools_(max_pools),
      system_allocator_(system_allocator) {
  PADDLE_ASSERT(pool_size > 0);
  PADDLE_ASSERT(max_pools > 0);
  PADDLE_ASSERT(system_allocator != nullptr);
}
|
||||
|
||||
} // namespace detail
|
||||
} // namespace memory
|
||||
} // namespace paddle
|
@ -0,0 +1,86 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "paddle/memory/detail/system_allocator.h"
|
||||
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
|
||||
namespace paddle {
|
||||
namespace memory {
|
||||
namespace detail {
|
||||
|
||||
class BuddyAllocator {
|
||||
public:
|
||||
BuddyAllocator(size_t pool_size, size_t max_pools,
|
||||
SystemAllocator* system_allocator);
|
||||
~BuddyAllocator();
|
||||
|
||||
void* Alloc(size_t size);
|
||||
void Free(void*);
|
||||
size_t Used();
|
||||
|
||||
private:
|
||||
struct Block {
|
||||
size_t size_;
|
||||
Block* left_; // left buddy
|
||||
Block* right_; // right buddy
|
||||
};
|
||||
|
||||
// Initially, there is only one pool. If a Alloc founds not enough
|
||||
// memory from that pool, and there has not been max_num_pools_,
|
||||
// create a new pool by calling system_allocator_.Alloc(pool_size_).
|
||||
std::vector<void*> pools_;
|
||||
|
||||
size_t pool_size_; // the size of each pool;
|
||||
size_t max_num_pools_; // the size of all pools;
|
||||
|
||||
SystemAllocator* system_allocator_;
|
||||
|
||||
std::mutex mutex_;
|
||||
|
||||
// Disable copy and assignment.
|
||||
BuddyAllocator(const BuddyAllocator&) = delete;
|
||||
BuddyAllocator& operator=(const BuddyAllocator&) = delete;
|
||||
};
|
||||
|
||||
/**
 * @brief Return the process-wide CPU buddy allocator, creating it on first
 *        use.
 *
 * The function is `inline` because it is defined in a header. The instance
 * is created by the initializer of a function-local static, which C++11
 * runs exactly once even under concurrent first calls. It is never deleted.
 *
 * NOTE(review): BuddyAllocator is declared in this header as a non-template
 * class without a default constructor; the spelling
 * BuddyAllocator<CPUAllocator>() is kept from the original and needs to be
 * reconciled with that declaration.
 */
inline BuddyAllocator<CPUAllocator>* GetCPUBuddyAllocator() {
  static BuddyAllocator<CPUAllocator>* const a =
      new BuddyAllocator<CPUAllocator>();
  return a;
}
|
||||
|
||||
#ifndef PADDLE_ONLY_CPU // The following code are for CUDA.
|
||||
|
||||
/**
 * @brief Return the buddy allocator for the given GPU, creating one
 *        allocator per device on first use.
 *
 * The function is `inline` because it is defined in a header. The table of
 * allocators is built by the initializer of a function-local static, which
 * C++11 runs exactly once even under concurrent first calls. The table and
 * its allocators are never deleted.
 *
 * @param gpu_id  index into the table; it is not range-checked, so the
 *                caller must pass a value in
 *                [0, platform::GetDeviceCount()).
 *
 * NOTE(review): BuddyAllocator is declared in this header as a non-template
 * class without a default constructor; the spelling
 * BuddyAllocator<GPUAllocator>() is kept from the original and needs to be
 * reconciled with that declaration.
 */
inline BuddyAllocator<GPUAllocator>* GetGPUBuddyAllocator(int gpu_id) {
  static BuddyAllocator<GPUAllocator>** const as = [] {
    const int gpu_num = platform::GetDeviceCount();
    BuddyAllocator<GPUAllocator>** allocators =
        new BuddyAllocator<GPUAllocator>*[gpu_num];
    for (int gpu = 0; gpu < gpu_num; ++gpu) {
      allocators[gpu] = new BuddyAllocator<GPUAllocator>();
    }
    return allocators;
  }();
  return as[gpu_id];
}
|
||||
|
||||
#endif // PADDLE_ONLY_CPU
|
||||
|
||||
} // namespace detail
|
||||
} // namespace memory
|
||||
} // namespace paddle
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue