@@ -18,7 +18,7 @@ namespace paddle {
 namespace operators {
 
 template <>
-void getAccumulators<paddle::platform::CPUDeviceContext>(
+void GetAccumulators<paddle::platform::CPUDeviceContext>(
     const framework::ExecutionContext& ctx, int64_t& num_updates_,
     int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
   auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
@@ -31,7 +31,7 @@ void getAccumulators<paddle::platform::CPUDeviceContext>(
 }
 
 template <>
-void setAccumulators<paddle::platform::CPUDeviceContext>(
+void SetAccumulators<paddle::platform::CPUDeviceContext>(
     const framework::ExecutionContext& ctx, int64_t num_updates_,
     int64_t num_accumulates_, int64_t old_num_accumulates_) {
   auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
@@ -113,60 +113,92 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("param",
-             "Input(Tensor or LoDTensor): The parameter to be accumulated.");
+    AddInput("param", "(Tensor), The parameter to be accumulated.");
     AddInput("in_sum_1",
-             "Input(Tensor or LoDTensor): A tensor used to store the parameter "
+             "(Tensor), A tensor used to store the parameter "
              "sums with the same shape as input(param).");
     AddInput("in_sum_2",
-             "Input(Tensor or LoDTensor): A auxiliary tensor to help "
+             "(Tensor), An auxiliary tensor to help "
              "accumulating sums of parameter values with the same shape as "
              "input(param). It is used to avoid loss of precision due to too "
              "many sums.");
     AddInput("in_sum_3",
-             "Input(Tensor or LoDTensor): A auxiliary tensor to help "
+             "(Tensor), An auxiliary tensor to help "
              "accumulating sums of parameter values with the same shape as "
              "input(param).");
     AddInput("in_num_accumulates",
-             "Input(Tensor): The accumulating times of current window with "
-             "shape [1].");
-    AddInput("in_old_num_accumulates",
-             "Input(Tensor): The accumulating times of previous window with "
+             "(Tensor<int64_t>), The accumulating times of current window with "
              "shape [1].");
+    AddInput(
+        "in_old_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of previous window with "
+        "shape [1].");
     AddInput("in_num_updates",
-             "Input(Tensor): The total number of batches used by trainning "
+             "(Tensor<int64_t>), The total number of batches used by training "
              "before this batch with shape [1].");
 
     AddOutput("out_sum_1",
-              "Output(Tensor or LoDTensor): A tensor used to store the "
+              "(Tensor), A tensor used to store the "
               "parameter sums with the same shape as input(param).");
     AddOutput("out_sum_2",
-              "Output(Tensor or LoDTensor): A auxiliary tensor to help "
+              "(Tensor), An auxiliary tensor to help "
               "accumulating sums of parameter values with the same shape as "
               "input(param). It is used to avoid loss of precision due to too "
               "many sums.");
     AddOutput("out_sum_3",
-              "Output(Tensor or LoDTensor): A auxiliary tensor to help "
+              "(Tensor), An auxiliary tensor to help "
               "accumulating sums of parameter values with the same shape as "
               "input(param).");
-    AddOutput("out_num_accumulates",
-              "Output(Tensor): The accumulating times of current window with "
-              "shape [1].");
-    AddOutput("out_old_num_accumulates",
-              "Output(Tensor): The accumulating times of previous window with "
-              "shape [1].");
-    AddOutput("out_num_updates",
-              "Output(Tensor): The total number of batches used by trainning "
-              "before this batch with shape [1].");
+    AddOutput(
+        "out_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of current window with "
+        "shape [1].");
+    AddOutput(
+        "out_old_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of previous window with "
+        "shape [1].");
+    AddOutput(
+        "out_num_updates",
+        "(Tensor<int64_t>), The total number of batches used by training "
+        "before this batch with shape [1].");
 
     AddAttr<float>("average_window",
-                   "The rate of average window size relative to num_updates.");
-    AddAttr<int64_t>("max_average_window", "Maximum size of average window.");
-    AddAttr<int64_t>("min_average_window", "Minimu size of average window.");
+                   "(float, default 0) "
+                   "The rate of average window size relative to num_updates.")
+        .SetDefault(0);
+    AddAttr<int64_t>("max_average_window",
+                     "(int64_t) "
+                     "Maximum size of average window. It suggests that the "
+                     "number of mini-batches "
+                     "in one pass is an appropriate value to set.");
+    AddAttr<int64_t>("min_average_window",
+                     "(int64_t, default 10000L) "
+                     "Minimum size of average window.")
+        .SetDefault(10000L);
 
     AddComment(R"DOC(
 AverageAccumulates Operator.
-Accumulate the sum of parameter whtin sliding window. The size of sliding window is determined by 'average_window', 'max_average_window' and 'min_average_window'.
+Accumulate the sum of the parameter within a sliding window. The size of the sliding window is
+determined by 'average_window', 'max_average_window' and 'min_average_window'.
+Memory is shared by Input(in_sum_1) and Output(out_sum_1), which acts as the accumulator 'sum_1'.
+'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' are handled in the same way as 'sum_1'.
+
+All the accumulators are initialized to zero before training.
+
+For each mini-batch in training, the accumulators are updated as follows:
+    num_updates += 1
+    num_accumulates += 1
+    sum_1 += param
+    if num_updates % kMaxNumAccumulates == 0:
+        sum_2 += sum_1
+        sum_1 = 0
+    if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window):
+        sum_3 = sum_1 + sum_2
+        sum_1 = 0
+        sum_2 = 0
+        old_num_accumulates = num_accumulates
+        num_accumulates = 0
+
 )DOC");
   }
 };
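
The DOC block added above describes the sliding-window accumulation rule only in pseudocode. The following is a minimal, self-contained C++ sketch of that rule for illustration; it is not the operator's actual kernel, and the value chosen for kMaxNumAccumulates here is an assumed placeholder (the real kernel defines its own constant).

#include <algorithm>
#include <cstdint>
#include <vector>

// Assumed illustrative constant; the actual kernel defines its own value.
constexpr int64_t kMaxNumAccumulates = 16384;

struct Accumulators {
  // sum_1/sum_2/sum_3 are assumed to be pre-sized to the parameter's size.
  std::vector<float> sum_1, sum_2, sum_3;
  int64_t num_accumulates = 0;      // accumulating times of the current window
  int64_t old_num_accumulates = 0;  // accumulating times of the previous window
  int64_t num_updates = 0;          // total batches seen before this one
};

void AccumulateStep(const std::vector<float>& param, float average_window,
                    int64_t max_average_window, int64_t min_average_window,
                    Accumulators* acc) {
  ++acc->num_updates;
  ++acc->num_accumulates;
  for (size_t i = 0; i < param.size(); ++i) acc->sum_1[i] += param[i];

  // Periodically fold sum_1 into sum_2 to limit precision loss from many adds.
  if (acc->num_updates % kMaxNumAccumulates == 0) {
    for (size_t i = 0; i < param.size(); ++i) {
      acc->sum_2[i] += acc->sum_1[i];
      acc->sum_1[i] = 0.f;
    }
  }

  // When the current window is full, move its sums into sum_3 and start a new window.
  int64_t window = std::min<int64_t>(
      max_average_window,
      static_cast<int64_t>(acc->num_updates * average_window));
  if (acc->num_accumulates >= min_average_window &&
      acc->num_accumulates >= window) {
    for (size_t i = 0; i < param.size(); ++i) {
      acc->sum_3[i] = acc->sum_1[i] + acc->sum_2[i];
      acc->sum_1[i] = 0.f;
      acc->sum_2[i] = 0.f;
    }
    acc->old_num_accumulates = acc->num_accumulates;
    acc->num_accumulates = 0;
  }
}

Calling AccumulateStep once per mini-batch mirrors how the operator maps the in_* accumulator tensors to the corresponding out_* tensors.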