diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index 368a1f5612..c95077fcbd 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -18,7 +18,7 @@ namespace paddle { namespace operators { template <> -void getAccumulators( +void GetAccumulators( const framework::ExecutionContext& ctx, int64_t& num_updates_, int64_t& num_accumulates_, int64_t& old_num_accumulates_) { auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); @@ -31,7 +31,7 @@ void getAccumulators( } template <> -void setAccumulators( +void SetAccumulators( const framework::ExecutionContext& ctx, int64_t num_updates_, int64_t num_accumulates_, int64_t old_num_accumulates_) { auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); @@ -113,60 +113,92 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { public: AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("param", - "Input(Tensor or LoDTensor): The parameter to be accumulated."); + AddInput("param", "(Tensor), The parameter to be accumulated."); AddInput("in_sum_1", - "Input(Tensor or LoDTensor): A tensor used to store the parameter " + "(Tensor), A tensor used to store the parameter " "sums with the same shape as input(param)."); AddInput("in_sum_2", - "Input(Tensor or LoDTensor): A auxiliary tensor to help " + "(Tensor), An auxiliary tensor to help " "accumulating sums of parameter values with the same shape as " "input(param). 
It is used to avoid loss of precision due to too " "many sums."); AddInput("in_sum_3", - "Input(Tensor or LoDTensor): A auxiliary tensor to help " + "(Tensor), An auxiliary tensor to help " "accumulating sums of parameter values with the same shape as " "input(param)."); AddInput("in_num_accumulates", - "Input(Tensor): The accumulating times of current window with " - "shape [1]."); - AddInput("in_old_num_accumulates", - "Input(Tensor): The accumulating times of previous window with " + "(Tensor), The accumulating times of current window with " "shape [1]."); + AddInput( + "in_old_num_accumulates", + "(Tensor), The accumulating times of previous window with " + "shape [1]."); AddInput("in_num_updates", - "Input(Tensor): The total number of batches used by trainning " + "(Tensor), The total number of batches used by training " "before this batch with shape [1]."); AddOutput("out_sum_1", - "Output(Tensor or LoDTensor): A tensor used to store the " + "(Tensor), A tensor used to store the " "parameter sums with the same shape as input(param)."); AddOutput("out_sum_2", - "Output(Tensor or LoDTensor): A auxiliary tensor to help " + "(Tensor), An auxiliary tensor to help " "accumulating sums of parameter values with the same shape as " "input(param). 
It is used to avoid loss of precision due to too " "many sums."); AddOutput("out_sum_3", - "Output(Tensor or LoDTensor): A auxiliary tensor to help " + "(Tensor), An auxiliary tensor to help " "accumulating sums of parameter values with the same shape as " "input(param)."); - AddOutput("out_num_accumulates", - "Output(Tensor): The accumulating times of current window with " - "shape [1]."); - AddOutput("out_old_num_accumulates", - "Output(Tensor): The accumulating times of previous window with " - "shape [1]."); - AddOutput("out_num_updates", - "Output(Tensor): The total number of batches used by trainning " - "before this batch with shape [1]."); + AddOutput( + "out_num_accumulates", + "(Tensor), The accumulating times of current window with " + "shape [1]."); + AddOutput( + "out_old_num_accumulates", + "(Tensor), The accumulating times of previous window with " + "shape [1]."); + AddOutput( + "out_num_updates", + "(Tensor), The total number of batches used by training " + "before this batch with shape [1]."); AddAttr("average_window", - "The rate of average window size relative to num_updates."); - AddAttr("max_average_window", "Maximum size of average window."); - AddAttr("min_average_window", "Minimu size of average window."); + "(float, default 0) " + "The rate of average window size relative to num_updates.") + .SetDefault(0); + AddAttr("max_average_window", + "(int64_t) " + "Maximum size of average window. It suggests that the " + "number of mini-batches " + "in one pass is an appropriate value to set."); + AddAttr("min_average_window", + "(int64_t, default 10000L) " + "Minimum size of average window.") + .SetDefault(10000L); AddComment(R"DOC( AverageAccumulates Operator. -Accumulate the sum of parameter whtin sliding window. The size of sliding window is determined by 'average_window', 'max_average_window' and 'min_average_window'. +Accumulate the sum of parameter within a sliding window. 
The size of the sliding window is +determined by 'average_window', 'max_average_window' and 'min_average_window'. +Memory is shared by Input(in_sum_1) and Output(out_sum_1), which acts as an accumulator 'sum_1'. +'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' are shared in the same way as 'sum_1'. + +All the accumulators are initialized to zero before training. + +For each mini-batch in training, the accumulators are updated as follows: + num_updates += 1 + num_accumulates += 1 + sum_1 += param + if num_updates % kMaxNumAccumulates == 0: + sum_2 += sum_1 + sum_1 = 0 + if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window): + sum_3 = sum_1 + sum_2 + sum_1 = 0 + sum_2 = 0 + old_num_accumulates = num_accumulates + num_accumulates = 0 + )DOC"); } }; diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index dbaa8ba6c9..270c469844 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -18,7 +18,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { template <> -void getAccumulators( +void GetAccumulators( const framework::ExecutionContext& ctx, int64_t& num_updates_, int64_t& num_accumulates_, int64_t& old_num_accumulates_) { auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); @@ -35,7 +35,7 @@ void getAccumulators( } template <> -void setAccumulators( +void SetAccumulators( const framework::ExecutionContext& ctx, int64_t num_updates_, int64_t num_accumulates_, int64_t old_num_accumulates_) { auto stream = ctx.cuda_device_context().stream(); diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h index d33fd5519a..f858109d14 100644 --- a/paddle/fluid/operators/average_accumulates_op.h +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -28,12 +28,12 @@ template ; template -void getAccumulators(const framework::ExecutionContext& ctx, +void GetAccumulators(const framework::ExecutionContext& ctx, int64_t& num_updates, int64_t& num_accumulates, int64_t& old_num_accumulates); template -void setAccumulators(const framework::ExecutionContext& ctx, +void SetAccumulators(const framework::ExecutionContext& ctx, int64_t num_updates, int64_t num_accumulates, int64_t old_num_accumulates); @@ -47,7 +47,7 @@ class AverageAccumulatesKernel : public framework::OpKernel { int64_t num_updates = 0; int64_t num_accumulates = 0; int64_t old_num_accumulates = 0; - getAccumulators(ctx, num_updates, num_accumulates, + GetAccumulators(ctx, num_updates, num_accumulates, old_num_accumulates); // Get attrs @@ -84,6 +84,8 @@ class AverageAccumulatesKernel : public framework::OpKernel { out_sum_2_tensor.device(place) = in_sum_2_tensor; out_sum_3_tensor.device(place) = in_sum_3_tensor; if (num_updates % kMaxNumAccumulates == 0) { + // Move the sum to a different buffer to avoid loss of precision due to + // too many sums. 
out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; constant_functor(ctx.template device_context(), out_sum_1, 0.0); @@ -91,6 +93,7 @@ class AverageAccumulatesKernel : public framework::OpKernel { if (num_accumulates >= min_average_window && num_accumulates >= std::min(max_average_window, num_updates * average_window)) { + // Now the average window is too long, discard the old sum. out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; constant_functor(ctx.template device_context(), out_sum_1, 0.0); @@ -101,7 +104,7 @@ class AverageAccumulatesKernel : public framework::OpKernel { } // Set accumulators to output - setAccumulators(ctx, num_updates, num_accumulates, + SetAccumulators(ctx, num_updates, num_accumulates, old_num_accumulates); } }; diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 394cf050a7..d8373eaab4 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -732,7 +732,6 @@ class ModelAverage(Optimizer): """Apply average values to parameters of current model. """ executor.run(self.apply_program) - print "finish apply" try: yield finally: @@ -743,4 +742,3 @@ class ModelAverage(Optimizer): """Restore parameter values of current model. """ executor.run(self.restore_program) - print "finish restore"