Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into build_manually
Merge branch 'develop' into wangkuiyi-patch-2
commit 53a4a583e0
@@ -1 +1,2 @@
 add_subdirectory(v2)
+add_subdirectory(fluid)
@@ -0,0 +1,49 @@
if(NOT DEFINED SPHINX_THEME)
    set(SPHINX_THEME default)
endif()

if(NOT DEFINED SPHINX_THEME_DIR)
    set(SPHINX_THEME_DIR)
endif()

# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")

# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")

# HTML output directory
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")

configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
    "${BINARY_BUILD_DIR_EN}/conf.py"
    @ONLY)

sphinx_add_target(paddle_fluid_docs
                  html
                  ${BINARY_BUILD_DIR_EN}
                  ${SPHINX_CACHE_DIR_EN}
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_EN})

# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")

# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")

# HTML output directory
set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")

configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
    "${BINARY_BUILD_DIR_CN}/conf.py"
    @ONLY)

sphinx_add_target(paddle_fluid_docs_cn
                  html
                  ${BINARY_BUILD_DIR_CN}
                  ${SPHINX_CACHE_DIR_CN}
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_CN})
@@ -0,0 +1,2 @@
Build and Install
------------
@@ -0,0 +1,2 @@
Build and Install
------------
@@ -0,0 +1,2 @@
Design
------------
@@ -0,0 +1,2 @@
Design
------------
@@ -0,0 +1,2 @@
Development
------------
@@ -0,0 +1,4 @@
Development
------------

This is the Development page.
@@ -0,0 +1,2 @@
FAQ
------------
@@ -0,0 +1,2 @@
FAQ
------------
@@ -0,0 +1,4 @@
Getting Started
------------

Getting Started
@@ -0,0 +1,4 @@
GET STARTED
------------

This is the Get Started page.
@@ -0,0 +1,2 @@
How To
------------
@@ -0,0 +1,4 @@
HOW TO
------------

This is the How To page.
@@ -0,0 +1,12 @@
PaddlePaddle Fluid
==========================

.. toctree::
  :maxdepth: 1

  getstarted/index_cn.rst
  design/index_cn.rst
  build_and_install/index_cn.rst
  howto/index_cn.rst
  dev/index_cn.rst
  faq/index_cn.rst
@@ -0,0 +1,12 @@
PaddlePaddle Fluid
==========================

.. toctree::
  :maxdepth: 1

  getstarted/index_en.rst
  design/index_en.rst
  build_and_install/index_en.rst
  howto/index_en.rst
  dev/index_en.rst
  faq/index_en.rst
@@ -0,0 +1,216 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/average_accumulates_op.h"

namespace paddle {
namespace operators {

template <>
void GetAccumulators<paddle::platform::CPUDeviceContext>(
    const framework::ExecutionContext& ctx, int64_t& num_updates_,
    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");

  old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
  num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
  num_updates_ = in_num_updates->data<int64_t>()[0];
}

template <>
void SetAccumulators<paddle::platform::CPUDeviceContext>(
    const framework::ExecutionContext& ctx, int64_t num_updates_,
    int64_t num_accumulates_, int64_t old_num_accumulates_) {
  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");

  out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates_;
  out_num_accumulates->data<int64_t>()[0] = num_accumulates_;
  out_num_updates->data<int64_t>()[0] = num_updates_;
}

class AverageAccumulatesOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(
        ctx->HasInput("param"),
        "Input (param) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("in_sum_1"),
        "Input (sum_1) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("in_sum_2"),
        "Input (sum_2) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("in_sum_3"),
        "Input (sum_3) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("in_num_accumulates"),
        "Input (in_num_accumulates) of average_accumulates op should "
        "not be null.");
    PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"),
                   "Input (old_num_accumulates) of average_accumulates op "
                   "should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("in_num_updates"),
        "Input (num_updates) of average_accumulates op should not be null.");

    PADDLE_ENFORCE(
        ctx->HasOutput("out_sum_1"),
        "Output (sum_1) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("out_sum_2"),
        "Output (sum_2) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("out_sum_3"),
        "Output (sum_3) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"),
                   "Output (num_accumulates) of average_accumulates op should "
                   "not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"),
                   "Output (old_num_accumulates) of average_accumulates op "
                   "should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("out_num_updates"),
        "Output (num_updates) of average_accumulates op should not be null.");

    auto in_dim = ctx->GetInputDim("param");

    ctx->SetOutputDim("out_sum_1", in_dim);
    ctx->SetOutputDim("out_sum_2", in_dim);
    ctx->SetOutputDim("out_sum_3", in_dim);
    ctx->SetOutputDim("out_num_accumulates", {1});
    ctx->SetOutputDim("out_old_num_accumulates", {1});
    ctx->SetOutputDim("out_num_updates", {1});
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("param")->type()),
        ctx.GetPlace());
  }
};

class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("param", "(Tensor), The parameter to be accumulated.");
    AddInput("in_sum_1",
             "(Tensor), A tensor used to store the parameter "
             "sums with the same shape as input(param).");
    AddInput("in_sum_2",
             "(Tensor), An auxiliary tensor to help "
             "accumulate sums of parameter values with the same shape as "
             "input(param). It is used to avoid loss of precision due to too "
             "many sums.");
    AddInput("in_sum_3",
             "(Tensor), An auxiliary tensor to help "
             "accumulate sums of parameter values with the same shape as "
             "input(param).");
    AddInput("in_num_accumulates",
             "(Tensor<int64_t>), The accumulating times of the current window "
             "with shape [1].");
    AddInput(
        "in_old_num_accumulates",
        "(Tensor<int64_t>), The accumulating times of the previous window "
        "with shape [1].");
    AddInput("in_num_updates",
             "(Tensor<int64_t>), The total number of batches used for "
             "training before this batch with shape [1].");

    AddOutput("out_sum_1",
              "(Tensor), A tensor used to store the "
              "parameter sums with the same shape as input(param).");
    AddOutput("out_sum_2",
              "(Tensor), An auxiliary tensor to help "
              "accumulate sums of parameter values with the same shape as "
              "input(param). It is used to avoid loss of precision due to too "
              "many sums.");
    AddOutput("out_sum_3",
              "(Tensor), An auxiliary tensor to help "
              "accumulate sums of parameter values with the same shape as "
              "input(param).");
    AddOutput(
        "out_num_accumulates",
        "(Tensor<int64_t>), The accumulating times of the current window "
        "with shape [1].");
    AddOutput(
        "out_old_num_accumulates",
        "(Tensor<int64_t>), The accumulating times of the previous window "
        "with shape [1].");
    AddOutput(
        "out_num_updates",
        "(Tensor<int64_t>), The total number of batches used for training "
        "before this batch with shape [1].");

    AddAttr<float>("average_window",
                   "(float, default 0) "
                   "The rate of average window size relative to num_updates.")
        .SetDefault(0);
    AddAttr<int64_t>("max_average_window",
                     "(int64_t) "
                     "Maximum size of the average window. It is suggested to "
                     "set it to the number of mini-batches in one pass.");
    AddAttr<int64_t>("min_average_window",
                     "(int64_t, default 10000L) "
                     "Minimum size of the average window.")
        .SetDefault(10000L);

    AddComment(R"DOC(
AverageAccumulates Operator.
Accumulates the sum of the parameter within a sliding window. The size of the
sliding window is determined by 'average_window', 'max_average_window' and
'min_average_window'.
Memory is shared by Input(in_sum_1) and Output(out_sum_1), which acts as the
accumulator 'sum_1'; 'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates'
and 'num_updates' share memory in the same way.

All the accumulators are initialized to zero before training.

For each mini-batch in training, the accumulators are updated as follows:
  num_updates += 1
  num_accumulates += 1
  sum_1 += param
  if num_updates % kMaxNumAccumulates == 0:
    sum_2 += sum_1
    sum_1 = 0
  if num_accumulates >= min_average_window &&
     num_accumulates >= min(max_average_window, num_updates * average_window):
    sum_3 = sum_1 + sum_2
    sum_1 = 0
    sum_2 = 0
    old_num_accumulates = num_accumulates
    num_accumulates = 0

)DOC");
  }
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
                  ops::AverageAccumulatesOpMaker,
                  paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
    average_accumulates,
    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
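The DOC block above states the sliding-window update rule only as pseudocode. As a minimal standalone sketch (not part of this PR; plain C++ applied to a single scalar parameter, with made-up attribute values in main()), the rule behaves like this:

// Standalone sketch of the update rule from the DOC block above.
#include <algorithm>
#include <cstdint>
#include <iostream>

struct AverageAccumulator {
  static constexpr int64_t kMaxNumAccumulates = 16384;
  double sum_1 = 0.0, sum_2 = 0.0, sum_3 = 0.0;
  int64_t num_updates = 0, num_accumulates = 0, old_num_accumulates = 0;

  void Update(double param, float average_window, int64_t max_average_window,
              int64_t min_average_window) {
    ++num_updates;
    ++num_accumulates;
    sum_1 += param;
    if (num_updates % kMaxNumAccumulates == 0) {
      // Fold sum_1 into sum_2 so no single buffer accumulates too many terms.
      sum_2 += sum_1;
      sum_1 = 0.0;
    }
    if (num_accumulates >= min_average_window &&
        num_accumulates >= std::min<int64_t>(
                               max_average_window,
                               static_cast<int64_t>(num_updates * average_window))) {
      // The window is complete: snapshot the total and start a new window.
      sum_3 = sum_1 + sum_2;
      sum_1 = 0.0;
      sum_2 = 0.0;
      old_num_accumulates = num_accumulates;
      num_accumulates = 0;
    }
  }
};

int main() {
  AverageAccumulator acc;
  for (int step = 0; step < 100; ++step) {
    acc.Update(/*param=*/1.0, /*average_window=*/0.5f,
               /*max_average_window=*/20, /*min_average_window=*/10);
  }
  // The averaged parameter is sum_3 / old_num_accumulates.
  std::cout << acc.sum_3 / acc.old_num_accumulates << "\n";  // prints 1
}

With these toy settings the window closes every 10 steps, so averaging a constant parameter recovers the constant, as expected.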
@@ -0,0 +1,63 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/average_accumulates_op.h"
#include "paddle/fluid/platform/gpu_info.h"

namespace paddle {
namespace operators {
template <>
void GetAccumulators<paddle::platform::CUDADeviceContext>(
    const framework::ExecutionContext& ctx, int64_t& num_updates_,
    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
  auto stream = ctx.cuda_device_context().stream();
  memory::Copy(platform::CPUPlace(), &old_num_accumulates_,
               platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
               sizeof(int64_t), stream);
  memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(),
               in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
  memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(),
               in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
}

template <>
void SetAccumulators<paddle::platform::CUDADeviceContext>(
    const framework::ExecutionContext& ctx, int64_t num_updates_,
    int64_t num_accumulates_, int64_t old_num_accumulates_) {
  auto stream = ctx.cuda_device_context().stream();
  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");

  memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data<int64_t>(),
               platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
               stream);
  memory::Copy(platform::CUDAPlace(), out_num_accumulates->data<int64_t>(),
               platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
               stream);
  memory::Copy(platform::CUDAPlace(), out_num_updates->data<int64_t>(),
               platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
}

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    average_accumulates,
    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
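One detail worth flagging in GetAccumulators above: a memory::Copy that takes a stream argument enqueues an asynchronous copy, so the host-side scalars are only safe to read once the stream has been synchronized. A hedged sketch of the general read-back pattern, written against the plain CUDA runtime API rather than Paddle's wrapper (ReadDeviceScalar is a hypothetical name):

// Sketch only: device-to-host scalar read-back with explicit synchronization.
#include <cuda_runtime.h>
#include <cstdint>

int64_t ReadDeviceScalar(const int64_t* device_ptr, cudaStream_t stream) {
  int64_t host_value = 0;
  // Enqueue the device-to-host copy on the stream...
  cudaMemcpyAsync(&host_value, device_ptr, sizeof(int64_t),
                  cudaMemcpyDeviceToHost, stream);
  // ...and wait for it to complete before the host reads the value.
  cudaStreamSynchronize(stream);
  return host_value;
}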
@@ -0,0 +1,113 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <algorithm>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

template <typename DeviceContext>
void GetAccumulators(const framework::ExecutionContext& ctx,
                     int64_t& num_updates, int64_t& num_accumulates,
                     int64_t& old_num_accumulates);

template <typename DeviceContext>
void SetAccumulators(const framework::ExecutionContext& ctx,
                     int64_t num_updates, int64_t num_accumulates,
                     int64_t old_num_accumulates);

template <typename DeviceContext, typename T>
class AverageAccumulatesKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // It is used to avoid loss of precision
    static const int64_t kMaxNumAccumulates = 16384;
    // Get accumulators from input
    int64_t num_updates = 0;
    int64_t num_accumulates = 0;
    int64_t old_num_accumulates = 0;
    GetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
                                   old_num_accumulates);

    // Get attrs
    float average_window = ctx.Attr<float>("average_window");
    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
    min_average_window =
        std::min<int64_t>(min_average_window, max_average_window);

    // Get inputs
    auto* param = ctx.Input<Tensor>("param");
    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
    auto param_tensor = EigenVector<T>::Flatten(*param);
    auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
    auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
    auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);

    // Get outputs
    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
    auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
    auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
    auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);

    // Compute
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    math::SetConstant<DeviceContext, T> constant_functor;
    ++num_updates;
    ++num_accumulates;
    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
    out_sum_2_tensor.device(place) = in_sum_2_tensor;
    out_sum_3_tensor.device(place) = in_sum_3_tensor;
    if (num_updates % kMaxNumAccumulates == 0) {
      // Move the sum to a different buffer to avoid loss of precision due to
      // too many sums.
      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
                       0.0);
    }
    if (num_accumulates >= min_average_window &&
        num_accumulates >= std::min<int64_t>(max_average_window,
                                             num_updates * average_window)) {
      // Now the average window is too long, discard the old sum.
      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
                       0.0);
      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
                       0.0);
      old_num_accumulates = num_accumulates;
      num_accumulates = 0;
    }

    // Set accumulators to output
    SetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
                                   old_num_accumulates);
  }
};

} // namespace operators
} // namespace paddle
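A note on the structure: GetAccumulators and SetAccumulators are only declared in this header; the .cc and .cu files above each supply an explicit specialization for their device context, which is what lets AverageAccumulatesKernel::Compute stay device-agnostic. A minimal sketch of that linkage pattern outside Paddle, with stand-in types CPUCtx and GPUCtx:

// Sketch of per-device explicit specialization of a declared-only template.
#include <cstdint>
#include <iostream>

struct CPUCtx {};
struct GPUCtx {};

template <typename DeviceContext>
void GetCounter(int64_t* out);  // declaration only; no generic definition

template <>
void GetCounter<CPUCtx>(int64_t* out) {  // would live in the .cc file
  *out = 1;
}

template <>
void GetCounter<GPUCtx>(int64_t* out) {  // would live in the .cu file
  *out = 2;
}

int main() {
  int64_t v = 0;
  GetCounter<CPUCtx>(&v);
  std::cout << v << "\n";  // prints 1; GetCounter<GPUCtx> would print 2
}

Calling GetCounter with a context type that has no specialization fails at link time, which is exactly the guard the operator relies on.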