diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index 368a1f5612..c95077fcbd 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -18,7 +18,7 @@ namespace paddle { namespace operators { template <> -void getAccumulators( +void GetAccumulators( const framework::ExecutionContext& ctx, int64_t& num_updates_, int64_t& num_accumulates_, int64_t& old_num_accumulates_) { auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); @@ -31,7 +31,7 @@ void getAccumulators( } template <> -void setAccumulators( +void SetAccumulators( const framework::ExecutionContext& ctx, int64_t num_updates_, int64_t num_accumulates_, int64_t old_num_accumulates_) { auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); @@ -113,60 +113,92 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { public: AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("param", - "Input(Tensor or LoDTensor): The parameter to be accumulated."); + AddInput("param", "(Tensor), The parameter to be accumulated."); AddInput("in_sum_1", - "Input(Tensor or LoDTensor): A tensor used to store the parameter " + "(Tensor), A tensor used to store the parameter " "sums with the same shape as input(param)."); AddInput("in_sum_2", - "Input(Tensor or LoDTensor): A auxiliary tensor to help " + "(Tensor), An auxiliary tensor to help " "accumulating sums of parameter values with the same shape as " "input(param). 
It is used to avoid loss of precision due to too " "many sums."); AddInput("in_sum_3", - "Input(Tensor or LoDTensor): A auxiliary tensor to help " + "(Tensor), An auxiliary tensor to help " "accumulating sums of parameter values with the same shape as " "input(param)."); AddInput("in_num_accumulates", - "Input(Tensor): The accumulating times of current window with " - "shape [1]."); - AddInput("in_old_num_accumulates", - "Input(Tensor): The accumulating times of previous window with " + "(Tensor), The accumulating times of current window with " "shape [1]."); + AddInput( + "in_old_num_accumulates", + "(Tensor), The accumulating times of previous window with " + "shape [1]."); AddInput("in_num_updates", - "Input(Tensor): The total number of batches used by trainning " + "(Tensor), The total number of batches used by training " "before this batch with shape [1]."); AddOutput("out_sum_1", - "Output(Tensor or LoDTensor): A tensor used to store the " + "(Tensor), A tensor used to store the " "parameter sums with the same shape as input(param)."); AddOutput("out_sum_2", - "Output(Tensor or LoDTensor): A auxiliary tensor to help " + "(Tensor), An auxiliary tensor to help " "accumulating sums of parameter values with the same shape as " "input(param). 
It is used to avoid loss of precision due to too " "many sums."); AddOutput("out_sum_3", - "Output(Tensor or LoDTensor): A auxiliary tensor to help " + "(Tensor), An auxiliary tensor to help " "accumulating sums of parameter values with the same shape as " "input(param)."); - AddOutput("out_num_accumulates", - "Output(Tensor): The accumulating times of current window with " - "shape [1]."); - AddOutput("out_old_num_accumulates", - "Output(Tensor): The accumulating times of previous window with " - "shape [1]."); - AddOutput("out_num_updates", - "Output(Tensor): The total number of batches used by trainning " - "before this batch with shape [1]."); + AddOutput( + "out_num_accumulates", + "(Tensor), The accumulating times of current window with " + "shape [1]."); + AddOutput( + "out_old_num_accumulates", + "(Tensor), The accumulating times of previous window with " + "shape [1]."); + AddOutput( + "out_num_updates", + "(Tensor), The total number of batches used by training " + "before this batch with shape [1]."); AddAttr("average_window", - "The rate of average window size relative to num_updates."); - AddAttr("max_average_window", "Maximum size of average window."); - AddAttr("min_average_window", "Minimu size of average window."); + "(float, default 0) " + "The rate of average window size relative to num_updates.") + .SetDefault(0); + AddAttr("max_average_window", + "(int64_t) " + "Maximum size of average window. It suggests that the " + "number of mini-batches " + "in one pass is an appropriate value to set."); + AddAttr("min_average_window", + "(int64_t, default 10000L) " + "Minimum size of average window.") + .SetDefault(10000L); AddComment(R"DOC( AverageAccumulates Operator. -Accumulate the sum of parameter whtin sliding window. The size of sliding window is determined by 'average_window', 'max_average_window' and 'min_average_window'. +Accumulate the sum of parameter within a sliding window. 
The size of the sliding window is +determined by 'average_window', 'max_average_window' and 'min_average_window'. +Memory is shared by Input(in_sum_1) and Output(out_sum_1), which acts as an accumulator 'sum_1'. +'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' are shared in the same way as 'sum_1'. + +All the accumulators are initialized to zero before training. + +For each mini-batch in training, the accumulators are updated as follows: + num_updates += 1 + num_accumulates += 1 + sum_1 += param + if num_updates % kMaxNumAccumulates == 0: + sum_2 += sum_1 + sum_1 = 0 + if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window): + sum_3 = sum_1 + sum_2 + sum_1 = 0 + sum_2 = 0 + old_num_accumulates = num_accumulates + num_accumulates = 0 + )DOC"); } }; diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index dbaa8ba6c9..270c469844 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -18,7 +18,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { template <> -void getAccumulators( +void GetAccumulators( const framework::ExecutionContext& ctx, int64_t& num_updates_, int64_t& num_accumulates_, int64_t& old_num_accumulates_) { auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); @@ -35,7 +35,7 @@ void getAccumulators( } template <> -void setAccumulators( +void SetAccumulators( const framework::ExecutionContext& ctx, int64_t num_updates_, int64_t num_accumulates_, int64_t old_num_accumulates_) { auto stream = ctx.cuda_device_context().stream(); diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h index d33fd5519a..f858109d14 100644 --- a/paddle/fluid/operators/average_accumulates_op.h +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -28,12 +28,12 @@ template ; template -void getAccumulators(const framework::ExecutionContext& ctx, +void GetAccumulators(const framework::ExecutionContext& ctx, int64_t& num_updates, int64_t& num_accumulates, int64_t& old_num_accumulates); template -void setAccumulators(const framework::ExecutionContext& ctx, +void SetAccumulators(const framework::ExecutionContext& ctx, int64_t num_updates, int64_t num_accumulates, int64_t old_num_accumulates); @@ -47,7 +47,7 @@ class AverageAccumulatesKernel : public framework::OpKernel { int64_t num_updates = 0; int64_t num_accumulates = 0; int64_t old_num_accumulates = 0; - getAccumulators(ctx, num_updates, num_accumulates, + GetAccumulators(ctx, num_updates, num_accumulates, old_num_accumulates); // Get attrs @@ -84,6 +84,8 @@ class AverageAccumulatesKernel : public framework::OpKernel { out_sum_2_tensor.device(place) = in_sum_2_tensor; out_sum_3_tensor.device(place) = in_sum_3_tensor; if (num_updates % kMaxNumAccumulates == 0) { + // Move the sum to a different buffer to avoid loss of precision due to + // too many sums. 
out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; constant_functor(ctx.template device_context(), out_sum_1, 0.0); @@ -91,6 +93,7 @@ class AverageAccumulatesKernel : public framework::OpKernel { if (num_accumulates >= min_average_window && num_accumulates >= std::min(max_average_window, num_updates * average_window)) { + // Now the average window is too long, discard the old sum. out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; constant_functor(ctx.template device_context(), out_sum_1, 0.0); @@ -101,7 +104,7 @@ class AverageAccumulatesKernel : public framework::OpKernel { } // Set accumulators to output - setAccumulators(ctx, num_updates, num_accumulates, + SetAccumulators(ctx, num_updates, num_accumulates, old_num_accumulates); } }; diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 394cf050a7..d8373eaab4 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -732,7 +732,6 @@ class ModelAverage(Optimizer): """Apply average values to parameters of current model. """ executor.run(self.apply_program) - print "finish apply" try: yield finally: @@ -743,4 +742,3 @@ class ModelAverage(Optimizer): """Restore parameter values of current model. """ executor.run(self.restore_program) - print "finish restore"