Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into quantize_transpiler_update

revert-13637-optimize-opyreader
Dang Qingqing 7 years ago
commit ba8ba3000b

@ -20,41 +20,79 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
template <class T> // Change it to thread safe flags if needed.
class COWPtr { class ThreadUnsafeOwnershipFlags {
public: public:
typedef std::shared_ptr<T> RefPtr; explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
private: ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
RefPtr m_sp; ThreadUnsafeOwnershipFlags& operator=(
const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
void detach() { void SetOwnership(bool flag) { flag_ = flag; }
T* tmp = m_sp.get();
if (!(tmp == nullptr || m_sp.unique())) { // Invoke the callback if it is not owned.
m_sp = RefPtr(new T(*tmp)); template <typename Callback>
void AcquireOwnershipOnce(Callback acquire) {
if (!flag_) {
acquire();
flag_ = true;
} }
} }
public: private:
COWPtr() : m_sp(nullptr) {} bool flag_;
explicit COWPtr(T* t) : m_sp(t) {} };
explicit COWPtr(const RefPtr& refptr) : m_sp(refptr) {}
const T& Data() const { return operator*(); } // Copy-On-Write pointer.
// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
//
// The template parameter OwnershipFlags should have:
// * a constructor takes a bool. True if own.
// * SetOwnership(bool flag).
// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
// owned.
//
// https://en.wikipedia.org/wiki/Copy-on-write
template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
class COWPtr {
public:
// Ctor from raw pointer.
explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
T* MutableData() { return operator->(); } // Move methods. Steal ownership from origin
COWPtr(COWPtr&& other)
: payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
COWPtr& operator=(COWPtr&& origin) = default;
const T& operator*() const { return *m_sp; } // Copy methods. Not own payload
T& operator*() { COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
detach(); COWPtr& operator=(const COWPtr& other) {
return *m_sp; payload_ = other.payload_;
ownership_.SetOwnership(false);
return *this;
} }
const T* operator->() const { return m_sp.operator->(); }
T* operator->() { // Access read only data.
detach(); const T& Data() const { return *payload_; }
return m_sp.operator->();
// Access mutable data. If the data is not owned, the data will be copied
// before.
T* MutableData() {
ownership_.AcquireOwnershipOnce(
[this] { payload_.reset(new T(*payload_)); });
return payload_.get();
} }
private:
// Actual data pointer.
std::shared_ptr<T> payload_;
// Ownership flag.
OwnershipFlags ownership_;
}; };
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle

@ -30,14 +30,6 @@ TEST(COWPtr, all) {
ASSERT_EQ(ptr2.Data(), 10); ASSERT_EQ(ptr2.Data(), 10);
} }
TEST(COWPtr, change_old) {
COWPtr<int> ptr(new int{0});
COWPtr<int> ptr2 = ptr;
*ptr.MutableData() = 10;
ASSERT_EQ(ptr2.Data(), 0);
ASSERT_EQ(ptr.Data(), 10);
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle

@ -210,43 +210,6 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
return recv_vars; return recv_vars;
} }
bool MultiDevSSAGraphBuilder::IsDistTrainOp(
ir::Node *node, const std::vector<std::string> &send_vars,
const std::vector<std::string> &recv_vars) const {
if (send_vars.size() == 0 || recv_vars.size() == 0) {
return false;
}
/**
* Check any of opvars contains `.block` and in sendvars
*/
auto checker = [](const std::vector<std::string> &opvars,
const std::vector<std::string> &rpc_vars) -> bool {
for (auto &var : opvars) {
// a variable name with the suffix `.block` means it's a splited
// variable by (DistributeTranspiler)
// [python/paddle/fluid/transpiler/distribute_transpiler.py]
if (var.find(".block") != std::string::npos &&
std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
return true;
}
}
return false;
};
std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names;
for (ir::Node *input : node->inputs) {
input_var_names.push_back(input->Name());
}
for (ir::Node *output : node->outputs) {
output_var_names.push_back(output->Name());
}
return checker(output_var_names, send_vars) ||
checker(input_var_names, recv_vars);
}
size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const { const std::vector<std::string> &var_names) const {
int64_t numel_sum = 0; int64_t numel_sum = 0;
@ -370,7 +333,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
} }
} }
is_dist_train = true; is_dist_train = true;
} else if (IsDistTrainOp(node, send_vars, recv_vars)) { } else if (boost::get<int>(node->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) ==
static_cast<int>(OpRole::kDist)) {
int op_dev_id = CreateDistTrainOp(&result, node); int op_dev_id = CreateDistTrainOp(&result, node);
if (node->Op()->Type() == "concat") { if (node->Op()->Type() == "concat") {
auto origin_param_name = node->Op()->OutputArgumentNames()[0]; auto origin_param_name = node->Op()->OutputArgumentNames()[0];
@ -736,6 +701,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
.emplace(varname, op_dev_id); .emplace(varname, op_dev_id);
} }
} else { } else {
LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type();
PADDLE_THROW( PADDLE_THROW(
"the distribute training related op should be in [split_byref, " "the distribute training related op should be in [split_byref, "
"concat]."); "concat].");

@ -51,12 +51,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
int CreateRPCOp(ir::Graph *result, ir::Node *node) const; int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
/**
* Is this operator as the end-point operator before/after send operator.
*/
bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
const std::vector<std::string> &recv_vars) const;
std::vector<std::string> FindDistTrainSendVars( std::vector<std::string> FindDistTrainSendVars(
const std::vector<ir::Node *> &nodes) const; const std::vector<ir::Node *> &nodes) const;

File diff suppressed because it is too large Load Diff

@ -120,6 +120,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
{static_cast<int>(OpRole::kForward), {static_cast<int>(OpRole::kForward),
static_cast<int>(OpRole::kBackward), static_cast<int>(OpRole::kBackward),
static_cast<int>(OpRole::kOptimize), static_cast<int>(OpRole::kRPC), static_cast<int>(OpRole::kOptimize), static_cast<int>(OpRole::kRPC),
static_cast<int>(OpRole::kDist), static_cast<int>(OpRole::kLRSched),
static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward), static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kLoss) |
static_cast<int>(OpRole::kBackward), static_cast<int>(OpRole::kBackward),

@ -26,7 +26,13 @@ enum class OpRole {
kForward = 0x0000, kForward = 0x0000,
kBackward = 0x0001, kBackward = 0x0001,
kOptimize = 0x0002, kOptimize = 0x0002,
// RPC role is for send/recv releated op
kRPC = 0x0003, kRPC = 0x0003,
// Dist role is for split_byref/split_selected_rows/concat
// used for distributed training.
kDist = 0x0004,
// Tag all learning rate scheduler operators.
kLRSched = 0x0005,
kLoss = 0x0100, kLoss = 0x0100,
// The default value of op's role. This should be only used for unittests and // The default value of op's role. This should be only used for unittests and

@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type")); auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
int class_num = ctx.Attr<int>("class_num"); int class_num = ctx.Attr<int>("class_num");
auto& label_lod = in_label->lod(); auto label_lod = in_label->lod();
auto& detect_lod = in_detect->lod(); auto detect_lod = in_detect->lod();
PADDLE_ENFORCE_EQ(label_lod.size(), 1UL, PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
"Only support one level sequence now."); "Only support one level sequence now.");
PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(), PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto labels = framework::EigenTensor<T, 2>::From(input_label); auto labels = framework::EigenTensor<T, 2>::From(input_label);
auto detect = framework::EigenTensor<T, 2>::From(input_detect); auto detect = framework::EigenTensor<T, 2>::From(input_detect);
auto& label_lod = input_label.lod(); auto label_lod = input_label.lod();
auto& detect_lod = input_detect.lod(); auto detect_lod = input_detect.lod();
int batch_size = label_lod[0].size() - 1; int batch_size = label_lod[0].size() - 1;
auto& label_index = label_lod[0]; auto label_index = label_lod[0];
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
std::map<int, std::vector<Box>> boxes; std::map<int, std::vector<Box>> boxes;
@ -274,6 +274,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
output_true_pos->set_lod(true_pos_lod); output_true_pos->set_lod(true_pos_lod);
output_false_pos->set_lod(false_pos_lod); output_false_pos->set_lod(false_pos_lod);
return;
} }
void GetInputPos(const framework::Tensor& input_pos_count, void GetInputPos(const framework::Tensor& input_pos_count,
@ -291,7 +292,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto SetData = [](const framework::LoDTensor& pos_tensor, auto SetData = [](const framework::LoDTensor& pos_tensor,
std::map<int, std::vector<std::pair<T, int>>>& pos) { std::map<int, std::vector<std::pair<T, int>>>& pos) {
const T* pos_data = pos_tensor.data<T>(); const T* pos_data = pos_tensor.data<T>();
auto& pos_data_lod = pos_tensor.lod()[0]; auto pos_data_lod = pos_tensor.lod()[0];
for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
T score = pos_data[j * 2]; T score = pos_data[j * 2];
@ -316,23 +317,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
std::map<int, std::vector<std::pair<T, int>>>* false_pos) const { std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
int batch_size = gt_boxes.size(); int batch_size = gt_boxes.size();
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
auto& image_gt_boxes = gt_boxes[n]; auto image_gt_boxes = gt_boxes[n];
for (auto& image_gt_box : image_gt_boxes) { for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) {
size_t count = 0; size_t count = 0;
auto& labeled_bboxes = image_gt_box.second; auto labeled_bboxes = it->second;
if (evaluate_difficult) { if (evaluate_difficult) {
count = labeled_bboxes.size(); count = labeled_bboxes.size();
} else { } else {
for (auto& box : labeled_bboxes) { for (size_t i = 0; i < labeled_bboxes.size(); ++i)
if (!box.is_difficult) { if (!(labeled_bboxes[i].is_difficult)) ++count;
++count;
}
}
} }
if (count == 0) { if (count == 0) {
continue; continue;
} }
int label = image_gt_box.first; int label = it->first;
if (label_pos_count->find(label) == label_pos_count->end()) { if (label_pos_count->find(label) == label_pos_count->end()) {
(*label_pos_count)[label] = count; (*label_pos_count)[label] = count;
} else { } else {

@ -92,9 +92,14 @@ bool VariableResponse::CopyLodTensorData(
::google::protobuf::io::CodedInputStream* input, ::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx, const framework::DDim& dims, const platform::DeviceContext& ctx, const framework::DDim& dims,
int length) { int length) {
auto server_var = GetVar();
if (!server_var) {
LOG(ERROR) << "recved var should not on current server: "
<< meta_.varname();
return false;
}
auto* tensor = GetVar()->GetMutable<framework::LoDTensor>(); auto* tensor = GetVar()->GetMutable<framework::LoDTensor>();
tensor->Resize(dims); tensor->Resize(dims);
framework::LoD lod; framework::LoD lod;
for (int i = 0; i < meta_.lod_level(); ++i) { for (int i = 0; i < meta_.lod_level(); ++i) {
framework::Vector<size_t> v; framework::Vector<size_t> v;
@ -107,7 +112,6 @@ bool VariableResponse::CopyLodTensorData(
void* tensor_data = void* tensor_data =
tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type())); tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type()));
if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
return false; return false;
} }

@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>(); auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>(); auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto &in_rows = in.rows(); auto in_rows = in.rows();
auto out_dim = framework::make_ddim( auto out_dim = framework::make_ddim(
std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1}); std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place()); auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());

@ -60,9 +60,11 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto out_place = context.GetPlace(); auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(out_place)); PADDLE_ENFORCE(platform::is_gpu_place(out_place));
memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data, memory::Copy(
boost::get<platform::CUDAPlace>(in1_place), in1_data, boost::get<platform::CUDAPlace>(out_place), out_data,
in1_value.numel() * sizeof(T), context.stream()); boost::get<platform::CUDAPlace>(in1_place), in1_data,
in1_value.numel() * sizeof(T),
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
auto* in2_data = in2_value.data<T>(); auto* in2_data = in2_value.data<T>();
memory::Copy(boost::get<platform::CUDAPlace>(out_place), memory::Copy(boost::get<platform::CUDAPlace>(out_place),
@ -107,7 +109,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
auto& in1_value = input1.value(); auto& in1_value = input1.value();
framework::Vector<int64_t> in1_rows(input1.rows()); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
@ -146,7 +148,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(in1_height, input2->height());
auto& in1_rows = input1.rows(); framework::Vector<int64_t> in1_rows(input1.rows());
auto& in2_rows = *(input2->mutable_rows()); auto& in2_rows = *(input2->mutable_rows());
auto& in1_value = input1.value(); auto& in1_value = input1.value();
@ -206,7 +208,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
auto& in1_value = input1.value(); auto& in1_value = input1.value();
framework::Vector<int64_t> in1_rows(input1.rows()); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);

@ -20,7 +20,9 @@ limitations under the License. */
TEST(selected_rows_functor, gpu_add) { TEST(selected_rows_functor, gpu_add) {
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDADeviceContext ctx(gpu_place); paddle::platform::CUDADeviceContext& ctx =
*reinterpret_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext, paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
float> float>
functor; functor;
@ -132,7 +134,9 @@ TEST(selected_rows_functor, gpu_add) {
TEST(selected_rows_functor, gpu_add_to) { TEST(selected_rows_functor, gpu_add_to) {
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDADeviceContext ctx(gpu_place); paddle::platform::CUDADeviceContext& ctx =
*reinterpret_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext, paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
float> float>
functor; functor;

@ -123,6 +123,7 @@ class SumKernel : public framework::OpKernel<T> {
out_value->Resize(framework::make_ddim(in_dim)); out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(context.GetPlace()); out_value->mutable_data<T>(context.GetPlace());
// if all the input sparse vars are empty, no need to // if all the input sparse vars are empty, no need to
// merge these vars. // merge these vars.
if (first_dim == 0UL) { if (first_dim == 0UL) {

@ -36,7 +36,9 @@ void BindConstValue(pybind11::module* m) {
.value("Backward", framework::OpRole::kBackward) .value("Backward", framework::OpRole::kBackward)
.value("Optimize", framework::OpRole::kOptimize) .value("Optimize", framework::OpRole::kOptimize)
.value("Loss", framework::OpRole::kLoss) .value("Loss", framework::OpRole::kLoss)
.value("RPC", framework::OpRole::kRPC); .value("RPC", framework::OpRole::kRPC)
.value("Dist", framework::OpRole::kDist)
.value("LRSched", framework::OpRole::kLRSched);
op_proto_and_checker_maker.def( op_proto_and_checker_maker.def(
"kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName); "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName);

@ -1509,6 +1509,30 @@ class Program(object):
self._op_role_var = [] self._op_role_var = []
self._current_role = OpRole.Forward self._current_role = OpRole.Forward
@contextlib.contextmanager
def _lr_schedule_guard(self):
"""
A with guard to set :code:`LRSched` :code:`OpRole` and
:code:`OpRoleVar` automatically. The :code:`OpRoleVar` is
set to the target learning rate.
Notes: This is a very low level API. Users should not use it directly.
Examples:
>>> p, g = backward(...)
>>> with program.lr_schedule_guard():
>>> lr = lr * decay
"""
OpRole = core.op_proto_and_checker_maker.OpRole
self._current_role = OpRole.LRSched
# TODO(typhoonzero): how to set target learning rate var
self._op_role_var = []
yield
self._op_role_var = []
self._current_role = OpRole.Forward
def __str__(self): def __str__(self):
""" """
Get the protobuf debug string of this Program. Get the protobuf debug string of this Program.

@ -74,7 +74,7 @@ class Initializer(object):
directly, but need to use one of its implementations. directly, but need to use one of its implementations.
""" """
def __init_(self): def __init__(self):
pass pass
def __call__(self, param, block): def __call__(self, param, block):
@ -293,7 +293,7 @@ class TruncatedNormalInitializer(Initializer):
assert loc is not None assert loc is not None
assert scale is not None assert scale is not None
assert seed is not None assert seed is not None
super(NormalInitializer, self).__init__() super(TruncatedNormalInitializer, self).__init__()
self._mean = loc self._mean = loc
self._std_dev = scale self._std_dev = scale
self._seed = seed self._seed = seed

@ -27,7 +27,7 @@ from . import nn
from . import ops from . import ops
from . import tensor from . import tensor
from ..initializer import init_on_cpu from ..initializer import init_on_cpu
from ..framework import default_main_program, Parameter from ..framework import default_main_program, Parameter, unique_name
__all__ = [ __all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
@ -63,11 +63,12 @@ def noam_decay(d_model, warmup_steps):
Returns: Returns:
The decayed learning rate. The decayed learning rate.
""" """
global_step = _decay_step_counter(1) with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter(1)
a = global_step**-0.5 a = global_step**-0.5
b = (warmup_steps**-1.5) * global_step b = (warmup_steps**-1.5) * global_step
lr_value = (d_model**-0.5) * ops.elementwise_min(a, b) lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)
return lr_value return lr_value
@ -108,14 +109,15 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
""" """
global_step = _decay_step_counter() with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate * (decay_rate**div_res) decayed_lr = learning_rate * (decay_rate**div_res)
return decayed_lr return decayed_lr
def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
@ -136,14 +138,15 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
Returns: Returns:
The decayed learning rate The decayed learning rate
""" """
global_step = _decay_step_counter() with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
return decayed_lr return decayed_lr
def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
@ -181,15 +184,16 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
staircase=True)) staircase=True))
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
""" """
global_step = _decay_step_counter() with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate / (1 + decay_rate * div_res) decayed_lr = learning_rate / (1 + decay_rate * div_res)
return decayed_lr return decayed_lr
def polynomial_decay(learning_rate, def polynomial_decay(learning_rate,
@ -220,25 +224,28 @@ def polynomial_decay(learning_rate,
Returns: Returns:
Variable: The decayed learning rate Variable: The decayed learning rate
""" """
global_step = _decay_step_counter() with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter()
if cycle:
div_res = ops.ceil(global_step / decay_steps) if cycle:
zero_var = tensor.fill_constant(shape=[1], dtype='float32', value=0.0) div_res = ops.ceil(global_step / decay_steps)
one_var = tensor.fill_constant(shape=[1], dtype='float32', value=1.0) zero_var = tensor.fill_constant(
shape=[1], dtype='float32', value=0.0)
with control_flow.Switch() as switch: one_var = tensor.fill_constant(
with switch.case(global_step == zero_var): shape=[1], dtype='float32', value=1.0)
tensor.assign(input=one_var, output=div_res)
decay_steps = decay_steps * div_res with control_flow.Switch() as switch:
else: with switch.case(global_step == zero_var):
decay_steps_var = tensor.fill_constant( tensor.assign(input=one_var, output=div_res)
shape=[1], dtype='float32', value=float(decay_steps)) decay_steps = decay_steps * div_res
global_step = ops.elementwise_min(x=global_step, y=decay_steps_var) else:
decay_steps_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(decay_steps))
global_step = ops.elementwise_min(x=global_step, y=decay_steps_var)
decayed_lr = (learning_rate - end_learning_rate) * \ decayed_lr = (learning_rate - end_learning_rate) * \
((1 - global_step / decay_steps) ** power) + end_learning_rate ((1 - global_step / decay_steps) ** power) + end_learning_rate
return decayed_lr return decayed_lr
def piecewise_decay(boundaries, values): def piecewise_decay(boundaries, values):
@ -266,34 +273,36 @@ def piecewise_decay(boundaries, values):
""" """
with default_main_program()._lr_schedule_guard():
if len(values) - len(boundaries) != 1:
raise ValueError("len(values) - len(boundaries) should be 1")
if len(values) - len(boundaries) != 1: global_step = _decay_step_counter()
raise ValueError("len(values) - len(boundaries) should be 1")
global_step = _decay_step_counter()
lr = tensor.create_global_var( lr = tensor.create_global_var(
shape=[1], shape=[1],
value=0.0, value=0.0,
dtype='float32', dtype='float32',
persistable=True, persistable=True,
name="learning_rate") name="learning_rate")
with control_flow.Switch() as switch: with control_flow.Switch() as switch:
for i in range(len(boundaries)): for i in range(len(boundaries)):
boundary_val = tensor.fill_constant( boundary_val = tensor.fill_constant(
shape=[1],
dtype='float32',
value=float(boundaries[i]),
force_cpu=True)
value_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val):
tensor.assign(value_var, lr)
last_value_var = tensor.fill_constant(
shape=[1], shape=[1],
dtype='float32', dtype='float32',
value=float(boundaries[i]), value=float(values[len(values) - 1]))
force_cpu=True) with switch.default():
value_var = tensor.fill_constant( tensor.assign(last_value_var, lr)
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val):
tensor.assign(value_var, lr)
last_value_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(values[len(values) - 1]))
with switch.default():
tensor.assign(last_value_var, lr)
return lr return lr

@ -80,7 +80,8 @@ if(WITH_DISTRIBUTE)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
endif(NOT APPLE) endif(NOT APPLE)
py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) #FIXME(gongwb): random fails.
#py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
endif() endif()
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)

@ -345,7 +345,7 @@ class OpTest(unittest.TestCase):
actual_t, expect_t, atol=atol, equal_nan=equal_nan), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" + "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t) + " in class " + self.__class__.__name__) str(actual_t))
if isinstance(expect, tuple): if isinstance(expect, tuple):
self.assertListEqual(actual.recursive_sequence_lengths(), self.assertListEqual(actual.recursive_sequence_lengths(),
expect[1], "Output (" + out_name + expect[1], "Output (" + out_name +

@ -20,7 +20,6 @@ import six
import sys import sys
import collections import collections
import math import math
import paddle.fluid as fluid
from op_test import OpTest from op_test import OpTest
@ -33,7 +32,7 @@ class TestDetectionMAPOp(OpTest):
self.detect = np.array(self.detect).astype('float32') self.detect = np.array(self.detect).astype('float32')
self.mAP = np.array(self.mAP).astype('float32') self.mAP = np.array(self.mAP).astype('float32')
if len(self.class_pos_count) > 0: if (len(self.class_pos_count) > 0):
self.class_pos_count = np.array(self.class_pos_count).astype( self.class_pos_count = np.array(self.class_pos_count).astype(
'int32') 'int32')
self.true_pos = np.array(self.true_pos).astype('float32') self.true_pos = np.array(self.true_pos).astype('float32')
@ -274,7 +273,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp):
class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
def init_test_case(self): def init_test_case(self):
super(TestDetectionMAPOpMultiBatch, self).init_test_case() super(TestDetectionMAPOpMultiBatch, self).init_test_case()
self.class_pos_count = [0, 2, 1, 0] self.class_pos_count = [0, 2, 1]
self.true_pos_lod = [[0, 3, 2]] self.true_pos_lod = [[0, 3, 2]]
self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]] self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
self.false_pos_lod = [[0, 3, 2]] self.false_pos_lod = [[0, 3, 2]]

@ -22,7 +22,7 @@ class TestDistMnist2x2(TestDistBase):
self._sync_mode = True self._sync_mode = True
self._use_reduce = False self._use_reduce = False
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=1e-7) self.check_with_place("dist_mnist.py", delta=1e-7)
@ -31,7 +31,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase):
self._sync_mode = True self._sync_mode = True
self._mem_opt = True self._mem_opt = True
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=1e-7) self.check_with_place("dist_mnist.py", delta=1e-7)
@ -40,7 +40,7 @@ class TestDistMnistAsync(TestDistBase):
self._sync_mode = False self._sync_mode = False
self._use_reduce = False self._use_reduce = False
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=200) self.check_with_place("dist_mnist.py", delta=200)

@ -21,7 +21,16 @@ class TestDistSeResneXt2x2(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=1e-7)
class TestDistseResnXt2x2WithMemopt(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._mem_opt = True
def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=1e-7) self.check_with_place("dist_se_resnext.py", delta=1e-7)
@ -29,7 +38,7 @@ class TestDistSeResneXt2x2Async(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = False self._sync_mode = False
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100) self.check_with_place("dist_se_resnext.py", delta=100)

@ -59,7 +59,7 @@ class TestDistTransformer2x2Sync(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
def test_transformer(self): def test_dist_train(self):
download_files() download_files()
self.check_with_place("dist_transformer.py", delta=1e-5) self.check_with_place("dist_transformer.py", delta=1e-5)
@ -68,7 +68,7 @@ class TestDistTransformer2x2Async(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = False self._sync_mode = False
def test_transformer(self): def test_dist_train(self):
download_files() download_files()
self.check_with_place("dist_transformer.py", delta=1.0) self.check_with_place("dist_transformer.py", delta=1.0)

@ -17,19 +17,28 @@ import unittest
from test_dist_base import TestDistBase from test_dist_base import TestDistBase
class TestDistSeResneXt2x2(TestDistBase): class TestDistW2V2x2(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_word2vec.py", delta=1e-4) self.check_with_place("dist_word2vec.py", delta=1e-4)
class TestDistSeResneXt2x2Async(TestDistBase): class TestDistW2V2x2WithMemOpt(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._mem_opt = True
def test_dist_train(self):
self.check_with_place("dist_word2vec.py", delta=1e-4)
class TestDistW2V2x2Async(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = False self._sync_mode = False
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_word2vec.py", delta=1) self.check_with_place("dist_word2vec.py", delta=1)

@ -21,13 +21,12 @@ import paddle
def delete_ops(block, ops): def delete_ops(block, ops):
try: for op in ops:
start = list(block.ops).index(ops[0]) try:
end = list(block.ops).index(ops[-1]) idx = list(block.ops).index(op)
[block._remove_op(start) for _ in six.moves.range(end - start + 1)] block._remove_op(idx)
except Exception as e: except Exception as e:
raise e print(e)
block.program._sync_with_cpp()
def find_op_by_input_arg(block, arg_name): def find_op_by_input_arg(block, arg_name):
@ -37,10 +36,18 @@ def find_op_by_input_arg(block, arg_name):
return -1 return -1
def find_op_by_output_arg(block, arg_name): def find_op_by_output_arg(block, arg_name, reverse=False):
for index, op in enumerate(block.ops): if reverse:
if arg_name in op.output_arg_names: pos = len(block.ops) - 1
return index while pos >= 0:
op = block.ops[pos]
if arg_name in op.output_arg_names:
return pos
pos -= 1
else:
for index, op in enumerate(block.ops):
if arg_name in op.output_arg_names:
return index
return -1 return -1

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save