Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/python_doc

Branch: wangkuiyi-patch-1
Author: yuyang18 · 7 years ago
Commit: df681fd4e0

@@ -4,6 +4,7 @@
 | backyes | Yan-Fei Wang |
 | baiyfbupt | Yi-Fan Bai |
 | beckett1124 | Bin Qi |
+| ChengduoZH | Cheng-Duo Zhao |
 | chengxiaohua1105 | Xiao-Hua Cheng |
 | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
 | cxysteven | Xing-Yi Cheng |

@@ -24,12 +24,12 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
     apt-get install -y --allow-downgrades \
-    git python-pip python-dev openssh-server bison \
+    git python-pip python-dev python-opencv openssh-server bison \
     libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev \
     python-matplotlib gcc-4.8 g++-4.8 \
-    automake locales clang-format swig doxygen cmake  \
+    automake locales clang-format swig cmake  \
     liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
     net-tools libtool ccache && \
@@ -76,8 +76,7 @@ RUN easy_install -U pip && \
     pip install sphinx-rtd-theme==0.1.9 recommonmark
 RUN pip install pre-commit 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip install opencv-python
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
 #For docstring checker
 RUN pip install pylint pytest astroid isort

@@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl
 caffe/image/logs
 tensorflow/image/logs
 tensorflow/rnn/logs
+fluid/models/*.pyc
+fluid/logs
+fluid/nohup.out

@@ -0,0 +1,22 @@
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
# IMPORTANT:
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
RUN pip uninstall -y paddlepaddle && mkdir /workspace
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py dataset.py models/ /workspace/

@@ -44,11 +44,25 @@ Currently supported `--model` argument include:

 ## Run Distributed Benchmark on Kubernetes Cluster

+You may need to build a Docker image before submitting a cluster job onto Kubernetes; otherwise you will
+have to start all those processes manually on each node, which is not recommended.
+To build the Docker image, you need to choose a paddle "whl" package to run with. You may either
+download one from
+http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
+build one on your own. Once you have the "whl" package, put it under the current directory and run:
+
+```bash
+docker build -t [your docker image name]:[your docker image tag] .
+```
+
+Then push the image to a Docker registry that your Kubernetes cluster can reach.
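
For reference, pushing to a registry typically looks like the following (the image name and registry host here are hypothetical placeholders; substitute your own):

```bash
# Tag the image with your registry's address, then push it.
docker tag fluid-benchmark:latest registry.example.com/fluid-benchmark:latest
docker push registry.example.com/fluid-benchmark:latest
```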
 We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
 distributed benchmark jobs to your cluster. To generate a job yaml, just run:
 ```bash
-python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
 ```
 Then the yaml files are generated under directory `myjob`; you can run:
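
The hunk ends here. For reference, submitting the generated yaml files usually looks like this (a sketch assuming `kubectl` access to the cluster; the exact command in the full README may differ):

```bash
# Create the pserver and trainer resources described by the generated yaml files.
kubectl create -f myjob/
```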

@@ -40,10 +40,7 @@ def parse_args():
     parser.add_argument(
         '--batch_size', type=int, default=32, help='The minibatch size.')
     parser.add_argument(
-        '--learning_rate',
-        type=float,
-        default=0.001,
-        help='The minibatch size.')
+        '--learning_rate', type=float, default=0.001, help='The learning rate.')
     # TODO(wuyi): add "--use_fake_data" option back.
     parser.add_argument(
         '--skip_batch_num',
@@ -72,6 +69,11 @@ def parse_args():
         type=int,
         default=1,
         help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+    parser.add_argument(
+        '--cpus',
+        type=int,
+        default=1,
+        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
     parser.add_argument(
         '--data_set',
         type=str,
@@ -88,8 +90,8 @@ def parse_args():
         help='If set, use nvprof for CUDA.')
     parser.add_argument(
         '--no_test',
-        action='store_false',
-        help='If set, test the testset during training.')
+        action='store_true',
+        help='If set, do not test the testset during training.')
     parser.add_argument(
         '--memory_optimize',
         action='store_true',
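
An aside on the `--no_test` fix above: with `action='store_false'` the flag defaulted to `True`, so passing `--no_test` actually turned it off, the opposite of the help text. A minimal sketch of the two argparse behaviors (a standalone example, not part of the diff):

```python
import argparse

parser = argparse.ArgumentParser()
# store_true: defaults to False; passing --no_test sets it to True.
parser.add_argument('--no_test', action='store_true')
# store_false: defaults to True; passing --old_flag sets it to False.
parser.add_argument('--old_flag', action='store_false')

args = parser.parse_args([])
assert args.no_test is False   # tests run by default
assert args.old_flag is True   # "on" even though the flag was never passed
```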
@@ -231,13 +233,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
                 train_losses.append(loss)
                 print("Pass: %d, Iter: %d, Loss: %f\n" %
                       (pass_id, iters, np.mean(train_losses)))
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses)))
+        print_train_time(start_time, time.time(), num_samples)
+        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
         # evaluation
-        if not args.no_test and batch_acc != None:
+        if not args.no_test and batch_acc:
             pass_test_acc = test(exe, infer_prog, test_reader, feeder,
                                  batch_acc)
             print(", Test Accuracy: %f" % pass_test_acc)
@@ -315,11 +314,8 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
             if batch_id % 1 == 0:
                 print("Pass %d, batch %d, loss %s" %
                       (pass_id, batch_id, np.array(loss)))
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        if not args.no_test and batch_acc != None:
+        print_train_time(start_time, time.time(), num_samples)
+        if not args.no_test and batch_acc:
             test_acc = test(startup_exe, infer_prog, test_reader, feeder,
                             batch_acc)
             print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
@@ -329,12 +325,19 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
 def print_arguments(args):
     vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
                                 vars(args)['device'] == 'GPU')
-    print('----------- resnet Configuration Arguments -----------')
+    print('----------- Configuration Arguments -----------')
     for arg, value in sorted(vars(args).iteritems()):
         print('%s: %s' % (arg, value))
     print('------------------------------------------------')

+def print_train_time(start_time, end_time, num_samples):
+    train_elapsed = end_time - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+          (num_samples, train_elapsed, examples_per_sec))
+
 def main():
     args = parse_args()
     print_arguments(args)
@@ -342,7 +345,7 @@ def main():
     # the unique trainer id, starting from 0, needed by trainer
     # only
     nccl_id_var, num_trainers, trainer_id = (
-        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))
+        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
     if args.use_cprof:
         pr = cProfile.Profile()

@@ -49,7 +49,7 @@ def parse_args():
     parser.add_argument(
         '--fluid', default=1, type=int, help='whether is fluid job')
     parser.add_argument(
-        '--rdma', action='store_ture', help='whether mount rdma libs')
+        '--rdma', action='store_true', help='whether mount rdma libs')
     parser.add_argument(
         '--disttype',
         default="pserver",

@@ -69,15 +69,30 @@ def get_model(args):
     images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')

-    # Train program
-    predict = cnn_model(images)
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    # Evaluator
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
+    if args.device == 'CPU' and args.cpus > 1:
+        places = fluid.layers.get_places(args.cpus)
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            predict = cnn_model(pd.read_input(images))
+            label = pd.read_input(label)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+            pd.write_output(avg_cost)
+            pd.write_output(batch_acc)
+
+        avg_cost, batch_acc = pd()
+        avg_cost = fluid.layers.mean(avg_cost)
+        batch_acc = fluid.layers.mean(batch_acc)
+    else:
+        # Train program
+        predict = cnn_model(images)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        # Evaluator
+        batch_acc = fluid.layers.accuracy(input=predict, label=label)

     # inference program
     inference_program = fluid.default_main_program().clone()

@@ -132,18 +132,33 @@ def get_model(args):
     input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')

-    predict = model(input, class_dim)
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
+    if args.device == 'CPU' and args.cpus > 1:
+        places = fluid.layers.get_places(args.cpus)
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            predict = model(pd.read_input(input), class_dim)
+            label = pd.read_input(label)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+            pd.write_output(avg_cost)
+            pd.write_output(batch_acc)
+
+        avg_cost, batch_acc = pd()
+        avg_cost = fluid.layers.mean(avg_cost)
+        batch_acc = fluid.layers.mean(batch_acc)
+    else:
+        predict = model(input, class_dim)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        batch_acc = fluid.layers.accuracy(input=predict, label=label)

     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):
         inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
+            target_vars=[batch_acc])

     optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)

@@ -101,9 +101,8 @@ def get_model(args):
     loss = fluid.layers.mean(x=loss)

     # add acc
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
     batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
-            shape=[1], dtype='int64'), total=batch_size_tensor)
+            shape=[1], dtype='int64'))

     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):

@@ -2,6 +2,7 @@
 # This script benchmarking the PaddlePaddle Fluid on
 # single thread single GPU.
+mkdir -p logs

 #export FLAGS_fraction_of_gpu_memory_to_use=0.0
 export CUDNN_PATH=/paddle/cudnn_v5
@@ -35,71 +36,74 @@ nohup stdbuf -oL nvidia-smi \
        --format=csv \
        --filename=mem.log \
        -l 1 &

 # mnist
 # mnist gpu mnist 128
-FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+       --model=mnist \
        --device=GPU \
        --batch_size=128 \
        --skip_batch_num=5 \
        --iterations=500 \
-       2>&1 | tee -a mnist_gpu_128.log
+       2>&1 | tee -a logs/mnist_gpu_128.log

 # vgg16
 # gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+       --model=vgg16 \
        --device=GPU \
        --batch_size=128 \
        --skip_batch_num=5 \
        --iterations=30 \
-       2>&1 | tee -a vgg16_gpu_128.log
+       2>&1 | tee -a logs/vgg16_gpu_128.log

 # flowers gpu 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+       --model=vgg16 \
        --device=GPU \
        --batch_size=32 \
        --data_set=flowers \
        --skip_batch_num=5 \
        --iterations=30 \
-       2>&1 | tee -a vgg16_gpu_flowers_32.log
+       2>&1 | tee -a logs/vgg16_gpu_flowers_32.log

 # resnet50
 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+       --model=resnet \
        --device=GPU \
        --batch_size=128 \
        --data_set=cifar10 \
-       --model=resnet_cifar10 \
        --skip_batch_num=5 \
        --iterations=30 \
-       2>&1 | tee -a resnet50_gpu_128.log
+       2>&1 | tee -a logs/resnet50_gpu_128.log

 # resnet50 gpu flowers 64
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+       --model=resnet \
        --device=GPU \
        --batch_size=64 \
        --data_set=flowers \
-       --model=resnet_imagenet \
        --skip_batch_num=5 \
        --iterations=30 \
-       2>&1 | tee -a resnet50_gpu_flowers_64.log
+       2>&1 | tee -a logs/resnet50_gpu_flowers_64.log

 # lstm
 # lstm gpu imdb 32 # tensorflow only support batch=32
-FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+       --model=stacked_dynamic_lstm \
        --device=GPU \
        --batch_size=32 \
        --skip_batch_num=5 \
        --iterations=30 \
-       --hidden_dim=512 \
-       --emb_dim=512 \
-       --crop_size=1500 \
-       2>&1 | tee -a lstm_gpu_32.log
+       2>&1 | tee -a logs/lstm_gpu_32.log

 # seq2seq
 # seq2seq gpu wmb 128
-FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+       --model=machine_translation \
        --device=GPU \
        --batch_size=128 \
        --skip_batch_num=5 \
        --iterations=30 \
-       2>&1 | tee -a lstm_gpu_128.log
+       2>&1 | tee -a logs/lstm_gpu_128.log

@@ -33,10 +33,19 @@ ELSE()
     SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin)
 ENDIF()

+# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them
 ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
-    URL "http://paddlepaddledeps.bj.bcebos.com/grpc.tar.xz"
+    # NOTE(wuyi):
+    # this package is generated by following steps:
+    # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
+    # 2. submodule update --init
+    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
+    #    checkout and clean other dirs under third_party
+    # 4. remove .git, and package the directory.
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
+    URL_MD5 "c9c58ee7d0e8929a63155af6a2ecdbd0"
     PREFIX ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""
@@ -49,7 +58,6 @@ ExternalProject_Add(
     INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
 )

-# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
 ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
     "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")

@@ -59,3 +59,21 @@ get_inference_program
 .. autofunction:: paddle.fluid.io.get_inference_program
     :noindex:

+save_checkpoint
+---------------
+
+.. autofunction:: paddle.fluid.io.save_checkpoint
+    :noindex:
+
+load_checkpoint
+---------------
+
+.. autofunction:: paddle.fluid.io.load_checkpoint
+    :noindex:
+
+clean_checkpoint
+----------------
+
+.. autofunction:: paddle.fluid.io.clean_checkpoint
+    :noindex:

@@ -181,6 +181,12 @@ Print
 .. autofunction:: paddle.fluid.layers.Print
     :noindex:

+is_empty
+--------
+
+.. autofunction:: paddle.fluid.layers.is_empty
+    :noindex:
+
 device
 ======
@@ -255,6 +261,19 @@ double_buffer
 .. autofunction:: paddle.fluid.layers.double_buffer
     :noindex:

+random_data_generator
+---------------------
+
+.. autofunction:: paddle.fluid.layers.random_data_generator
+    :noindex:
+
+Preprocessor
+------------
+
+.. autoclass:: paddle.fluid.layers.Preprocessor
+    :members:
+    :noindex:
+
 nn
 ==
@@ -594,6 +613,29 @@ roi_pool
 .. autofunction:: paddle.fluid.layers.roi_pool
     :noindex:

+dice_loss
+---------
+
+.. autofunction:: paddle.fluid.layers.dice_loss
+    :noindex:
+
+resize_bilinear
+---------------
+
+.. autofunction:: paddle.fluid.layers.resize_bilinear
+    :noindex:
+
+gather
+------
+
+.. autofunction:: paddle.fluid.layers.gather
+    :noindex:
+
+random_crop
+-----------
+
+.. autofunction:: paddle.fluid.layers.random_crop
+    :noindex:
+
 ops
 ===
@@ -742,6 +784,12 @@ sum
 .. autofunction:: paddle.fluid.layers.sum
     :noindex:

+shape
+-----
+
+.. autofunction:: paddle.fluid.layers.shape
+    :noindex:
+
 sigmoid
 -------
@@ -991,21 +1039,3 @@ zeros
 .. autofunction:: paddle.fluid.layers.zeros
     :noindex:

-topk
-----
-
-.. autofunction:: paddle.fluid.layers.topk
-    :noindex:
-
-dice_loss
-----
-
-.. autofunction:: paddle.fluid.layers.dice_loss
-    :noindex:
-
-upsampling_bilinear2d
-____
-
-.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
-    :noindex:

@@ -47,28 +47,6 @@ DecayedAdagrad
     :members:
     :noindex:

-Adadelta
------------------
-
-.. autoclass:: paddle.fluid.optimizer.Adadelta
-    :members:
-    :noindex:
-
-RMSProp
------------------
-
-.. autoclass:: paddle.fluid.optimizer.RMSProp
-    :members:
-    :noindex:
-
-ModelAverage
------------------
-
-.. autoclass:: paddle.fluid.optimizer.ModelAverage
-    :members:
-    :noindex:
-
 SGDOptimizer
 ------------
@@ -111,25 +89,31 @@ DecayedAdagradOptimizer
     :members:
     :noindex:

-AdadeltaOptimizer
------------------
-
-.. autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
-    :members:
-    :noindex:
-
-RMSPropOptimizer
------------------
-
-.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
-    :members:
-    :noindex:
+RMSPropOptimizer
+----------------
+
+.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
+
+Adadelta
+--------
+
+.. autoclass:: paddle.fluid.optimizer.Adadelta
+    :members:
+    :noindex:
+
+ModelAverage
+------------
+
+.. autoclass:: paddle.fluid.optimizer.ModelAverage
+    :members:
+    :noindex:

 Optimizer
 ---------

 .. autoclass:: paddle.fluid.optimizer.Optimizer
     :members:
     :noindex:

@@ -23,3 +23,15 @@ profiler
 .. autofunction:: paddle.fluid.profiler.profiler
     :noindex:

+start_profiler
+--------------
+
+.. autofunction:: paddle.fluid.profiler.start_profiler
+    :noindex:
+
+stop_profiler
+-------------
+
+.. autofunction:: paddle.fluid.profiler.stop_profiler
+    :noindex:

@@ -35,7 +35,7 @@ The computation `Program` consists of nested `Blocks`. Each `Block` will consist

 ## Definition of VarType
-A VarDesc should have a name, type and whether or not it is persistable. The are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
+A VarDesc should have a name, type and whether or not it is persistable. There are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:

 ```proto
 message VarDesc {

@@ -0,0 +1,127 @@
# How to use RecordIO in Fluid

If you want to use RecordIO as your training data format, you need to convert your training data
into RecordIO files and read them during training. PaddlePaddle Fluid provides
interfaces for dealing with RecordIO files.

## Generate a RecordIO File

Before starting training with RecordIO files, you need to convert your training data
to the RecordIO format with `fluid.recordio_writer.convert_reader_to_recordio_file`. Sample code
is as follows:
```python
# imports assumed for a self-contained example
import paddle.v2 as paddle
import paddle.v2.dataset.mnist as mnist
import paddle.fluid as fluid

reader = paddle.batch(mnist.train(), batch_size=1)
feeder = fluid.DataFeeder(
    feed_list=[  # order is image and label
        fluid.layers.data(
            name='image', shape=[784]),
        fluid.layers.data(
            name='label', shape=[1], dtype='int64'),
    ],
    place=fluid.CPUPlace())
fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder)
```
The above code snippet generates a RecordIO file `./mnist.recordio` on your host.

**NOTE**: We recommend setting `batch_size=1` when generating the RecordIO files, so that the
batch size can be adjusted flexibly at reading time.

## Use the RecordIO File in a Local Training Job

PaddlePaddle Fluid provides the interface `fluid.layers.io.open_recordio_file` to load your RecordIO
file, whose result you can then use as a layer in your network configuration. Sample code is as follows:
```python
import paddle.fluid as fluid  # import assumed for a self-contained example

data_file = fluid.layers.io.open_recordio_file(
    filename="./mnist.recordio",
    shapes=[(-1, 784), (-1, 1)],
    lod_levels=[0, 0],
    dtypes=["float32", "int64"])  # the label was written as int64 above
data_file = fluid.layers.io.batch(data_file, batch_size=4)

img, label = fluid.layers.io.read_file(data_file)
hidden = fluid.layers.fc(input=img, size=100, act='tanh')
prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)

fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
avg_loss_np = []

# train a pass; the loop ends when the reader runs out of data and exe.run raises
batch_id = 0
while True:
    tmp, = exe.run(fetch_list=[avg_loss])
    avg_loss_np.append(tmp)
    print(batch_id)
    batch_id += 1
```
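
As written, the loop above terminates only when the reader is exhausted and `exe.run` raises. A guarded variant might look like this (a sketch assuming the runtime raises `fluid.core.EOFException` at end of data, as newer Fluid releases do; it reuses `exe` and `avg_loss` from the snippet above):

```python
import paddle.fluid as fluid

batch_id = 0
while True:
    try:
        tmp, = exe.run(fetch_list=[avg_loss])
    except fluid.core.EOFException:
        break  # one pass over ./mnist.recordio is done
    avg_loss_np.append(tmp)
    batch_id += 1
```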
## Use the RecordIO files in Distributed Training

1. Generate multiple RecordIO files.

For a distributed training job, you may have multiple trainer nodes, with one or more RecordIO
files for each trainer node. You can use the interface
`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data
into multiple RecordIO files. Sample code is as follows:
```python
# imports assumed for a self-contained example
import paddle.v2 as paddle
import paddle.v2.dataset.mnist as mnist
import paddle.fluid as fluid

reader = paddle.batch(mnist.train(), batch_size=1)
feeder = fluid.DataFeeder(
    feed_list=[  # order is image and label
        fluid.layers.data(
            name='image', shape=[784]),
        fluid.layers.data(
            name='label', shape=[1], dtype='int64'),
    ],
    place=fluid.CPUPlace())
# keyword arguments must not precede positional ones, so pass everything by keyword
fluid.recordio_writer.convert_reader_to_recordio_files(
    filename='./mnist.recordio', batch_per_file=100, reader_creator=reader, feeder=feeder)
```
The above code generates multiple RecordIO files on your host, like:

```bash
.
├── mnist-00000.recordio
├── mnist-00001.recordio
├── mnist-00002.recordio
├── mnist-00003.recordio
└── mnist-00004.recordio
```
2. Open multiple RecordIO files with `fluid.layers.io.open_files`.

For a distributed training job, the distributed operator system schedules a trainer process on each
of multiple nodes, and each trainer process reads part of the whole training data. We usually take
the following approach to spread the training data across the trainer processes as uniformly as possible:
```python
# imports assumed for a self-contained example
import glob
import os

import paddle.fluid as fluid

def gen_train_list(file_pattern, trainers, trainer_id):
    # assign every trainers-th file to this trainer, round-robin
    file_list = glob.glob(file_pattern)
    ret_list = []
    for idx, f in enumerate(file_list):
        if idx % trainers == trainer_id:
            ret_list.append(f)
    return ret_list

trainers = int(os.getenv("TRAINERS"))
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
data_file = fluid.layers.io.open_files(
    filenames=gen_train_list("./mnist-[0-9]*.recordio", trainers, trainer_id),
    thread_num=1,
    shapes=[(-1, 784), (-1, 1)],
    lod_levels=[0, 0],
    dtypes=["float32", "int64"])
img, label = fluid.layers.io.read_file(data_file)
...
```

@@ -4,5 +4,5 @@
 .. toctree::
     :maxdepth: 1

+    inference/index_cn.rst
     optimization/index_cn.rst
-    inference/inference_support_in_fluid.md

@@ -5,4 +5,3 @@ HOW TO
     :maxdepth: 1

     optimization/index_en.rst
-    inference/inference_support_in_fluid.md

@@ -0,0 +1,96 @@
Install and Compile the C++ Inference Library
=============================================

Direct Download and Install
---------------------------

======================  ========================================
Version                 C++ inference library
======================  ========================================
cpu_avx_mkl             `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz>`_
cpu_avx_openblas        `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz>`_
cpu_noavx_openblas      `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz>`_
cuda7.5_cudnn5_avx_mkl  `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn5_avx_mkl  `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn7_avx_mkl  `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
======================  ========================================
Build from Source
-----------------

You can also build the C++ inference library from the PaddlePaddle core code by setting the following compile options at build time:

=================  ==================
Option             Value
=================  ==================
CMAKE_BUILD_TYPE   Release
FLUID_INSTALL_DIR  installation path
WITH_FLUID_ONLY    ON (recommended)
WITH_SWIG_PY       OFF (recommended)
WITH_PYTHON        OFF (recommended)
WITH_GPU           ON/OFF
WITH_MKL           ON/OFF
=================  ==================

We suggest following the recommended values to avoid linking unnecessary libraries. Other optional compile options can be set as needed.

The snippet below pulls the latest code from GitHub and configures the compile options (replace PADDLE_ROOT with the installation path of the PaddlePaddle inference library):
.. code-block:: bash

   pip install paddlepaddle-gpu
   PADDLE_ROOT=/path/of/capi
   git clone https://github.com/PaddlePaddle/Paddle.git
   cd Paddle
   mkdir build
   cd build
   cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
         -DCMAKE_BUILD_TYPE=Release \
         -DWITH_FLUID_ONLY=ON \
         -DWITH_SWIG_PY=OFF \
         -DWITH_PYTHON=OFF \
         -DWITH_MKL=OFF \
         -DWITH_GPU=OFF \
         ..
   make
   make inference_lib_dist
After a successful build, everything needed to use the C++ inference library, including (1) the compiled PaddlePaddle inference library and header files, (2) third-party link libraries and header files, and (3) version and compile-option information, is placed in the PADDLE_ROOT directory. The directory structure is as follows:
.. code-block:: text

   PaddleRoot/
   ├── CMakeCache.txt
   ├── paddle
   │   └── fluid
   │       ├── framework
   │       ├── inference
   │       ├── memory
   │       ├── platform
   │       ├── pybind
   │       └── string
   ├── third_party
   │   ├── boost
   │   │   └── boost
   │   ├── eigen3
   │   │   ├── Eigen
   │   │   └── unsupported
   │   └── install
   │       ├── gflags
   │       ├── glog
   │       ├── mklml
   │       ├── protobuf
   │       ├── snappy
   │       ├── snappystream
   │       └── zlib
   └── version.txt
version.txt records the version information of the library, including the Git commit ID, whether OpenBlas or MKL is used as the math library, and the CUDA/CUDNN versions, e.g.:

.. code-block:: text

   GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
   WITH_MKL: ON
   WITH_GPU: ON
   CUDA version: 8.0
   CUDNN version: v5
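
As a hypothetical illustration (not part of the original document), linking a small C++ program against the library laid out above might look like the following; the exact include paths and extra dependencies (glog, gflags, protobuf, mklml) depend on your compile options:

.. code-block:: bash

   # demo.cc is your own program; paths follow the PaddleRoot layout shown above
   g++ demo.cc -o demo \
       -I${PADDLE_ROOT} \
       -I${PADDLE_ROOT}/third_party/install/glog/include \
       -I${PADDLE_ROOT}/third_party/install/gflags/include \
       -I${PADDLE_ROOT}/third_party/install/protobuf/include \
       -L${PADDLE_ROOT}/paddle/fluid/inference -lpaddle_fluid \
       -Wl,-rpath=${PADDLE_ROOT}/paddle/fluid/inference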

@@ -0,0 +1,8 @@
Inference Library
-----------------

.. toctree::
    :maxdepth: 1

    build_and_install_lib_cn.rst
    inference_support_in_fluid_cn.md

@@ -1,9 +1,8 @@
-# Fluid Inference User Guide
+# User Guide

 ## Contents:

 - Python Inference API
-- Building the Fluid Inference library
 - Inference C++ API
 - Inference examples
 - Inference computation optimization
@@ -55,62 +54,6 @@
     return [program, feed_target_names, fetch_targets]
 ```
-## Building the Fluid Inference Library
-
-- **No extra CMake options are required**
-- 1. Configure the CMake command (for more configuration options, see [building PaddlePaddle from source](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html))
-
-  ```bash
-  $ git clone https://github.com/PaddlePaddle/Paddle.git
-  $ cd Paddle
-  $ mkdir build
-  $ cd build
-  $ cmake -DCMAKE_INSTALL_PREFIX=your/path/to/paddle_inference_lib \
-          -DCMAKE_BUILD_TYPE=Release \
-          -DWITH_PYTHON=ON \
-          -DWITH_MKL=OFF \
-          -DWITH_GPU=OFF \
-          ..
-  ```
-
-- 2. Build PaddlePaddle
-
-  ```bash
-  $ make
-  ```
-
-- 3. Deploy. Run the following command to deploy the PaddlePaddle Fluid Inference library to the `your/path/to/paddle_inference_lib` directory.
-
-  ```bash
-  $ make inference_lib_dist
-  ```
-
-- Directory structure
-
-  ```bash
-  $ cd your/path/to/paddle_inference_lib
-  $ tree
-  .
-  |-- paddle
-  |   `-- fluid
-  |       |-- framework
-  |       |-- inference
-  |       |   |-- io.h
-  |       |   `-- libpaddle_fluid.so
-  |       |-- memory
-  |       |-- platform
-  |       `-- string
-  |-- third_party
-  |   |-- eigen3
-  |   `-- install
-  |       |-- gflags
-  |       |-- glog
-  |       `-- protobuf
-  `-- ...
-  ```
-
-  Assume `PADDLE_ROOT=your/path/to/paddle_inference_lib`.
 ## Linking the Fluid Inference Library

 - Example project ([link](https://github.com/luotao1/fluid_inference_example.git))

@@ -1 +0,0 @@
-../../../../../benchmark/cluster/README.md

@@ -1 +0,0 @@
-../../../../../../benchmark/cluster/vgg16/README.md

Some files were not shown because too many files have changed in this diff.
