diff --git a/CMakeLists.txt b/CMakeLists.txt
index e8ea828dd2..3a21574b85 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,7 +137,7 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc
-include(external/boost) # download, build, install boost
+include(external/boost) # download boost
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
@@ -156,6 +156,7 @@ include(rdma) # set rdma libraries
include(flags) # set paddle compile flags
include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage
+include(inference_lib) # add paddle fluid inference libraries
include_directories("${PADDLE_SOURCE_DIR}")
diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
new file mode 100644
index 0000000000..98356cd761
--- /dev/null
+++ b/benchmark/cluster/vgg16/Dockerfile
@@ -0,0 +1,18 @@
+#FROM python:2.7.14
+FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
+RUN apt-get update && apt-get install -y python
+RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
+# NOTE: By default, CI-built wheel packages are built with WITH_DISTRIBUTE=OFF,
+# so we must build one with distribute support to install in this image.
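+# Install the release wheel once just to pre-download the CIFAR-10 dataset
+# into the image, then remove it; the locally built wheel is installed below.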
+RUN pip install paddlepaddle
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
+RUN pip uninstall -y paddlepaddle
+
+# The lines below may change often during debugging
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && \
+chmod +x /usr/bin/paddle_k8s
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
new file mode 100644
index 0000000000..11d00b8f85
--- /dev/null
+++ b/benchmark/cluster/vgg16/README.md
@@ -0,0 +1,76 @@
+# Performance of Distributed VGG16
+
+## Test Results
+
+### Hardware Information
+
+- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
+- CPU MHz: 2101.000
+- Cache size: 20480 KB
+
+### Single Node Single Thread
+
+- PServer Count: 10
+- Trainer Count: 20
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
+| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
+| TensorFlow | - | - | - | - |
+
+### Different Batch Size
+
+- PServer Count: 10
+- Trainer Count: 20
+- Per trainer CPU Core: 1
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
+| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
+| TensorFlow | - | - | - | - |
+
+
+### Acceleration Rate
+
+- PServer Count: 20
+- Batch Size: 128
+- Metrics: samples / sec
+
+| Trainer Count | 20 | 40 | 80 | 100 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
+| PaddlePaddle v2 (needs more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
+| TensorFlow | - | - | - | - |
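+
+*The percentages are scaling efficiency: throughput divided by (trainer count times the single-node single-thread throughput at the same batch size).*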
+
+### Different PServer Count
+
+- Trainer Count: 60
+- Batch Size: 128
+- Metrics: samples / sec
+
+| PServer Count | 3 | 6 | 10 | 20 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid (should be fixed in the next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
+| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
+| TensorFlow | - | - | - | - |
+
+*The performance gap between Fluid and v2 comes from network interference.*
+
+
+## Steps to Run the Performance Test
+
+1. Re-compile PaddlePaddle with `-DWITH_DISTRIBUTE=ON` to build it with distributed support.
+1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory.
+1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so Kubernetes can find it.
+1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your Kubernetes cluster (you must configure the `kubectl` client before this step).
+1. Run `kubectl get po` to list the running pods, and run `kubectl logs [podID]` to fetch the logs of the pserver and trainer pods.
+
+Check the logs for the distributed training progress and analyze the performance.
+
+## Enable Verbose Logs
+
+Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
new file mode 100644
index 0000000000..ee8b0763b6
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid_pserver.yaml
@@ -0,0 +1,72 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+ name: vgg16job-pserver
+spec:
+ replicas: 10
+ template:
+ metadata:
+ labels:
+ paddle-job-pserver: vgg16job
+ spec:
+ hostNetwork: true
+ imagePullSecrets:
+ - name: job-registry-secret
+ containers:
+ - name: pserver
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+ imagePullPolicy: Always
+ ports:
+ - name: jobport-30236
+ containerPort: 30236
+ env:
+ - name: PADDLE_JOB_NAME
+ value: vgg16job
+ - name: MKL_NUM_THREADS
+ value: "1"
+ - name: TRAINING_ROLE
+ value: "PSERVER"
+ - name: TRAINERS
+ value: "20"
+ - name: PSERVERS
+ value: "10"
+ - name: TOPOLOGY
+ value: ""
+ - name: ENTRY
+ value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: PADDLE_INIT_PORT
+ value: "30236"
+ - name: PADDLE_INIT_NICS
+ value: "xgbe0"
+ - name: PADDLE_INIT_TRAINER_COUNT
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+ value: "1"
+ - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+ value: "20"
+ - name: PADDLE_INIT_NUM_PASSES
+ value: "1"
+ - name: PADDLE_INIT_USE_GPU
+ value: "0"
+ - name: LD_LIBRARY_PATH
+ value: "/usr/local/lib:/usr/local/nvidia/lib64"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ - name: POD_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: "status.podIP"
+ command: ["paddle_k8s", "start_fluid"]
+ resources:
+ requests:
+ memory: 10Gi
+ cpu: 4
+ limits:
+ memory: 10Gi
+ cpu: 4
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
new file mode 100644
index 0000000000..0a0ed25ebe
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@@ -0,0 +1,69 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: vgg16job-trainer
+spec:
+ parallelism: 20
+ completions: 20
+ template:
+ metadata:
+ labels:
+ paddle-job: vgg16job
+ spec:
+ imagePullSecrets:
+ - name: job-registry-secret
+ hostNetwork: true
+ containers:
+ - name: trainer
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+ imagePullPolicy: Always
+ command: ["paddle_k8s", "start_fluid"]
+ env:
+ - name: PADDLE_JOB_NAME
+ value: vgg16job
+ - name: TRAINING_ROLE
+ value: "TRAINER"
+ - name: TRAINERS
+ value: "20"
+ - name: PSERVERS
+ value: "10"
+ - name: TOPOLOGY
+ value: ""
+ - name: ENTRY
+ value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: PADDLE_INIT_PORT
+ value: "30236"
+ - name: PADDLE_INIT_NICS
+ value: "xgbe0"
+ - name: PADDLE_INIT_TRAINER_COUNT
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+ value: "1"
+ - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+ value: "20"
+ - name: PADDLE_INIT_NUM_PASSES
+ value: "1"
+ - name: PADDLE_INIT_USE_GPU
+ value: "0"
+ - name: LD_LIBRARY_PATH
+ value: "/usr/local/lib:/usr/local/nvidia/lib64"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ - name: POD_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: "status.podIP"
+ resources:
+ requests:
+ memory: 40Gi
+ cpu: 2
+ limits:
+ memory: 40Gi
+ cpu: 2
+ restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
new file mode 100644
index 0000000000..dd1271e0cf
--- /dev/null
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
@@ -0,0 +1,64 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+ name: vgg16v2job-pserver
+spec:
+ replicas: 10
+ template:
+ metadata:
+ labels:
+ paddle-job-pserver: vgg16v2job
+ spec:
+ hostNetwork: true
+ imagePullSecrets:
+ - name: job-registry-secret
+ containers:
+ - name: pserver
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+ imagePullPolicy: Always
+ ports:
+ - name: jobport-30236
+ containerPort: 30236
+ env:
+ - name: PADDLE_JOB_NAME
+ value: vgg16v2job
+ - name: TRAINERS
+ value: "20"
+ - name: PSERVERS
+ value: "10"
+ - name: TOPOLOGY
+ value: ""
+ - name: ENTRY
+ value: "python train.py"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: PADDLE_INIT_PORT
+ value: "30236"
+ - name: PADDLE_INIT_NICS
+ value: "xgbe0"
+ - name: PADDLE_INIT_TRAINER_COUNT
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+ value: "1"
+ - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+ value: "20"
+ - name: PADDLE_INIT_NUM_PASSES
+ value: "1"
+ - name: PADDLE_INIT_USE_GPU
+ value: "0"
+ - name: LD_LIBRARY_PATH
+ value: "/usr/local/lib:/usr/local/nvidia/lib64"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ command: ["paddle_k8s", "start_pserver"]
+ resources:
+ requests:
+ memory: 10Gi
+ cpu: 4
+ limits:
+ memory: 10Gi
+ cpu: 4
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
new file mode 100644
index 0000000000..12c8964066
--- /dev/null
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@@ -0,0 +1,65 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: vgg16v2job-trainer
+spec:
+ parallelism: 20
+ completions: 20
+ template:
+ metadata:
+ labels:
+ paddle-job: vgg16v2job
+ spec:
+ imagePullSecrets:
+ - name: job-registry-secret
+ hostNetwork: true
+ containers:
+ - name: trainer
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+ imagePullPolicy: Always
+ command: ["paddle_k8s", "start_trainer", "v2"]
+ env:
+ - name: PADDLE_JOB_NAME
+ value: vgg16v2job
+ - name: BATCH_SIZE
+ value: "256"
+ - name: TRAINERS
+ value: "20"
+ - name: PSERVERS
+ value: "10"
+ - name: TOPOLOGY
+ value: ""
+ - name: ENTRY
+ value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: PADDLE_INIT_PORT
+ value: "30236"
+ - name: PADDLE_INIT_NICS
+ value: "xgbe0"
+ - name: PADDLE_INIT_TRAINER_COUNT
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+ value: "1"
+ - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+ value: "20"
+ - name: PADDLE_INIT_NUM_PASSES
+ value: "2"
+ - name: PADDLE_INIT_USE_GPU
+ value: "0"
+ - name: LD_LIBRARY_PATH
+ value: "/usr/local/lib:/usr/local/nvidia/lib64"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ resources:
+ requests:
+ memory: 40Gi
+ cpu: 2
+ limits:
+ memory: 40Gi
+ cpu: 2
+ restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
new file mode 100644
index 0000000000..499e06ec42
--- /dev/null
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -0,0 +1,277 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.profiler as profiler
+import argparse
+import functools
+import os
+
+
+def str2bool(v):
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
+ return True
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+ return False
+ else:
+ raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+ '--learning_rate',
+ type=float,
+ default=1e-3,
+ help="Learning rate for training.")
+parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+ '--device',
+ type=str,
+ default='CPU',
+ choices=['CPU', 'GPU'],
+ help="The device type.")
+parser.add_argument('--device_id', type=int, default=0, help="The device id.")
+parser.add_argument(
+ '--data_format',
+ type=str,
+ default='NCHW',
+ choices=['NCHW', 'NHWC'],
+    help='The data layout; currently only NCHW is supported.')
+parser.add_argument(
+ '--data_set',
+ type=str,
+ default='cifar10',
+ choices=['cifar10', 'flowers'],
+ help='Optional dataset for benchmark.')
+parser.add_argument(
+ '--local',
+ type=str2bool,
+ default=True,
+    help='Whether to run in local mode.')
+args = parser.parse_args()
+
+
+def vgg16_bn_drop(input):
+ def conv_block(input, num_filter, groups, dropouts):
+ return fluid.nets.img_conv_group(
+ input=input,
+ pool_size=2,
+ pool_stride=2,
+ conv_num_filter=[num_filter] * groups,
+ conv_filter_size=3,
+ conv_act='relu',
+ conv_with_batchnorm=True,
+ conv_batchnorm_drop_rate=dropouts,
+ pool_type='max')
+
+ conv1 = conv_block(input, 64, 2, [0.3, 0])
+ conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+ conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+ conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+ conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+ drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+ fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+ bn = fluid.layers.batch_norm(input=fc1, act='relu')
+ drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+ fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+ return fc2
+
+
+def main():
+ if args.data_set == "cifar10":
+ classdim = 10
+ if args.data_format == 'NCHW':
+ data_shape = [3, 32, 32]
+ else:
+ data_shape = [32, 32, 3]
+ else:
+ classdim = 102
+ if args.data_format == 'NCHW':
+ data_shape = [3, 224, 224]
+ else:
+ data_shape = [224, 224, 3]
+
+ # Input data
+ images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+ # Train program
+ net = vgg16_bn_drop(images)
+ predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ # Evaluator
+ accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+ # inference program
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ test_target = accuracy.metrics + accuracy.states
+ inference_program = fluid.io.get_inference_program(test_target)
+
+ # Optimization
+ optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+ optimize_ops, params_grads = optimizer.minimize(avg_cost)
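+    # minimize() returns the optimization ops and the (param, grad) pairs;
+    # both are passed to the distributed transpiler in the non-local branch below.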
+
+ # Initialize executor
+ place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
+ args.device_id)
+ exe = fluid.Executor(place)
+
+ # test
+ def test(exe):
+ accuracy.reset(exe)
+ for batch_id, data in enumerate(test_reader()):
+ img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+ data)).astype("float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ exe.run(inference_program,
+ feed={"pixel": img_data,
+ "label": y_data})
+
+ return accuracy.eval(exe)
+
+ def train_loop(exe, trainer_prog):
+ iters = 0
+ ts = time.time()
+ for pass_id in range(args.num_passes):
+ # train
+ start_time = time.time()
+ num_samples = 0
+ accuracy.reset(exe)
+ with profiler.profiler("CPU", 'total') as prof:
+ for batch_id, data in enumerate(train_reader()):
+ ts = time.time()
+ img_data = np.array(
+ map(lambda x: x[0].reshape(data_shape), data)).astype(
+ "float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ loss, acc = exe.run(
+ trainer_prog,
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[avg_cost] + accuracy.metrics)
+ iters += 1
+ num_samples += len(data)
+ print(
+ "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
+ % (pass_id, iters, loss, acc, time.time() - ts)
+                    )  # The accuracy is accumulated across batches, not just the current batch.
+
+ pass_elapsed = time.time() - start_time
+ pass_train_acc = accuracy.eval(exe)
+ pass_test_acc = test(exe)
+ print(
+ "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
+ % (pass_id, num_samples / pass_elapsed, pass_train_acc,
+ pass_test_acc))
+
+ if args.local:
+ # Parameter initialization
+ exe.run(fluid.default_startup_program())
+
+ # data reader
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+ else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+ batch_size=args.batch_size)
+ train_loop(exe, fluid.default_main_program())
+ else:
+ pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # all pserver endpoints
+ eplist = []
+ for ip in pserver_ips.split(","):
+ eplist.append(':'.join([ip, "6174"]))
+ pserver_endpoints = ",".join(eplist)
+ print("pserver endpoints: ", pserver_endpoints)
+ trainers = int(os.getenv("TRAINERS")) # total trainer count
+ print("trainers total: ", trainers)
+ current_endpoint = os.getenv(
+ "POD_IP") + ":6174" # current pserver endpoint
+ training_role = os.getenv(
+ "TRAINING_ROLE",
+ "TRAINER") # get the training role: trainer/pserver
+ t = fluid.DistributeTranspiler()
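+        # The transpiler rewrites the single-node program for distributed
+        # execution; the role-specific programs are extracted from it below.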
+ t.transpile(
+ optimize_ops,
+ params_grads,
+ pservers=pserver_endpoints,
+ trainers=trainers)
+
+ if training_role == "PSERVER":
+ if not current_endpoint:
+ print("need env SERVER_ENDPOINT")
+ exit(1)
+ pserver_prog = t.get_pserver_program(current_endpoint)
+ pserver_startup = t.get_startup_program(current_endpoint,
+ pserver_prog)
+ print("starting server side startup")
+ exe.run(pserver_startup)
+ print("starting parameter server...")
+ exe.run(pserver_prog)
+ elif training_role == "TRAINER":
+ # Parameter initialization
+ exe.run(fluid.default_startup_program())
+
+ # data reader
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+ else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
+ paddle.dataset.flowers.test(),
+ batch_size=args.batch_size)
+
+ trainer_prog = t.get_trainer_program()
+ feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+ # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
+ exe.run(fluid.default_startup_program())
+ train_loop(exe, trainer_prog)
+ else:
+ print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+
+
+def print_arguments():
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+if __name__ == "__main__":
+ print_arguments()
+ main()
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
new file mode 100644
index 0000000000..6ac6b3c332
--- /dev/null
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import gzip
+
+import paddle.v2.dataset.cifar as cifar
+import paddle.v2 as paddle
+import time
+import os
+
+DATA_DIM = 3 * 32 * 32
+CLASS_DIM = 10
+BATCH_SIZE = os.getenv("BATCH_SIZE")
+if BATCH_SIZE:
+ BATCH_SIZE = int(BATCH_SIZE)
+else:
+ BATCH_SIZE = 128
+print "batch_size", BATCH_SIZE
+NODE_COUNT = int(os.getenv("TRAINERS"))
+ts = 0
+
+
+def vgg(input, nums, class_dim):
+ def conv_block(input, num_filter, groups, num_channels=None):
+ return paddle.networks.img_conv_group(
+ input=input,
+ num_channels=num_channels,
+ pool_size=2,
+ pool_stride=2,
+ conv_num_filter=[num_filter] * groups,
+ conv_filter_size=3,
+ conv_act=paddle.activation.Relu(),
+ pool_type=paddle.pooling.Max())
+
+ assert len(nums) == 5
+ # the channel of input feature is 3
+ conv1 = conv_block(input, 64, nums[0], 3)
+ conv2 = conv_block(conv1, 128, nums[1])
+ conv3 = conv_block(conv2, 256, nums[2])
+ conv4 = conv_block(conv3, 512, nums[3])
+ conv5 = conv_block(conv4, 512, nums[4])
+
+ fc_dim = 512
+ fc1 = paddle.layer.fc(input=conv5,
+ size=fc_dim,
+ act=paddle.activation.Relu(),
+ layer_attr=paddle.attr.Extra(drop_rate=0.5))
+ fc2 = paddle.layer.fc(input=fc1,
+ size=fc_dim,
+ act=paddle.activation.Relu(),
+ layer_attr=paddle.attr.Extra(drop_rate=0.5))
+ out = paddle.layer.fc(input=fc2,
+ size=class_dim,
+ act=paddle.activation.Softmax())
+ return out
+
+
+def vgg13(input, class_dim):
+ nums = [2, 2, 2, 2, 2]
+ return vgg(input, nums, class_dim)
+
+
+def vgg16(input, class_dim):
+ nums = [2, 2, 3, 3, 3]
+ return vgg(input, nums, class_dim)
+
+
+def vgg19(input, class_dim):
+ nums = [2, 2, 4, 4, 4]
+ return vgg(input, nums, class_dim)
+
+
+def main():
+ global ts
+ paddle.init(use_gpu=False)
+ image = paddle.layer.data(
+ name="image", type=paddle.data_type.dense_vector(DATA_DIM))
+ lbl = paddle.layer.data(
+ name="label", type=paddle.data_type.integer_value(CLASS_DIM))
+
+ extra_layers = None
+    # NOTE: v2 distributed training averages updates across trainers, so scale
+    # the learning rate down by the trainer count.
+ learning_rate = 1e-3 / NODE_COUNT
+ out = vgg16(image, class_dim=CLASS_DIM)
+ cost = paddle.layer.classification_cost(input=out, label=lbl)
+
+ # Create parameters
+ parameters = paddle.parameters.create(cost)
+
+ # Create optimizer
+ optimizer = paddle.optimizer.Momentum(
+ momentum=0.9,
+ regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
+ BATCH_SIZE),
+ learning_rate=learning_rate / BATCH_SIZE,
+ learning_rate_decay_a=0.1,
+ learning_rate_decay_b=128000 * 35,
+ learning_rate_schedule="discexp", )
+
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ cifar.train10(),
+ # To use other data, replace the above line with:
+ # reader.train_reader('train.list'),
+ buf_size=1000),
+ batch_size=BATCH_SIZE)
+ test_reader = paddle.batch(
+ cifar.test10(),
+ # To use other data, replace the above line with:
+ # reader.test_reader('val.list'),
+ batch_size=BATCH_SIZE)
+
+ # Create trainer
+ trainer = paddle.trainer.SGD(cost=cost,
+ parameters=parameters,
+ update_equation=optimizer,
+ extra_layers=extra_layers,
+ is_local=False)
+
+ # End batch and end pass event handler
+ def event_handler(event):
+ global ts, ts_pass
+ if isinstance(event, paddle.event.BeginPass):
+ ts_pass = time.time()
+ if isinstance(event, paddle.event.BeginIteration):
+ ts = time.time()
+ if isinstance(event, paddle.event.EndIteration):
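+            # Log every batch (modulus 1); raise the modulus to log less often.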
+ if event.batch_id % 1 == 0:
+ print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
+ event.pass_id, event.batch_id, event.cost, event.metrics,
+ time.time() - ts)
+ if isinstance(event, paddle.event.EndPass):
+ print "Pass %d end, spent: %f" % (event.pass_id,
+ time.time() - ts_pass)
+ result = trainer.test(reader=test_reader)
+ print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+
+ trainer.train(
+ reader=train_reader, num_passes=200, event_handler=event_handler)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index c70d83b3f4..dbc676bdac 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -21,6 +21,7 @@ set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOO
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
include_directories(${BOOST_INCLUDE_DIR})
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index d49c8d6011..6a701e076c 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -28,9 +28,3 @@ endif()
add_dependencies(eigen3 extern_eigen3)
LIST(APPEND external_project_dependencies eigen3)
-
-IF(NOT WITH_C_API AND WITH_FLUID)
- INSTALL(FILES ${EIGEN_INCLUDE_DIR}/Eigen/Core DESTINATION third_party/eigen3/Eigen)
- INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/Eigen/src DESTINATION third_party/eigen3/Eigen)
- INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DESTINATION third_party/eigen3/unsupported)
-ENDIF()
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 6094630454..d4f252bb9f 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -52,7 +52,7 @@ ADD_DEPENDENCIES(gflags extern_gflags)
LIST(APPEND external_project_dependencies gflags)
-IF(WITH_C_API OR WITH_FLUID)
+IF(WITH_C_API)
INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
IF(ANDROID)
INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI})
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 382fbda3b5..0c6b3aafcb 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -68,7 +68,7 @@ LINK_LIBRARIES(glog gflags)
LIST(APPEND external_project_dependencies glog)
-IF(WITH_C_API OR WITH_FLUID)
+IF(WITH_C_API)
INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog)
IF(ANDROID)
INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 365a370a9c..ff5855052d 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -250,7 +250,7 @@ IF(NOT PROTOBUF_FOUND)
SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
CACHE FILEPATH "protoc library." FORCE)
- IF(WITH_C_API OR WITH_FLUID)
+ IF(WITH_C_API)
INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
IF(ANDROID)
INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 7cb4efa7bf..5fa60df7b3 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -52,6 +52,7 @@ ExternalProject_Add(
-DWITH_TORCH=OFF
-DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-DBUILD_SHARED=ON
+ -DBUILD_TESTS=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
new file mode 100644
index 0000000000..7d53554358
--- /dev/null
+++ b/cmake/inference_lib.cmake
@@ -0,0 +1,90 @@
+# make package for paddle fluid shared and static library
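+#
+# copy(TARGET SRCS <src>... DSTS <dst>... DEPS <dep>...) copies each SRC to
+# the DST at the same index, so SRCS and DSTS must have the same length.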
+function(copy TARGET)
+ set(options "")
+ set(oneValueArgs "")
+ set(multiValueArgs SRCS DSTS DEPS)
+ cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+ list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
+ list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
+ if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
+        message(FATAL_ERROR "${TARGET}: the number of sources does not equal the number of destinations")
+ endif()
+ math(EXPR len "${copy_lib_SRCS_len} - 1")
+
+ add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
+ foreach(index RANGE ${len})
+ list(GET copy_lib_SRCS ${index} src)
+ list(GET copy_lib_DSTS ${index} dst)
+ add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}")
+ if(IS_DIRECTORY ${src})
+ add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp -r "${src}" "${dst}")
+ else()
+ add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp "${src}" "${dst}")
+ endif()
+ endforeach()
+endfunction()
+
+# third party
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
+copy(eigen3_lib
+ SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
+ DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
+)
+
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
+copy(gflags_lib
+ SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
+ DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
+copy(glog_lib
+ SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
+ DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+IF(NOT PROTOBUF_FOUND)
+ set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
+ copy(protobuf_lib
+ SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
+ DSTS ${dst_dir} ${dst_dir}/lib
+ )
+ENDIF(NOT PROTOBUF_FOUND)
+
+# paddle fluid module
+set(src_dir "${PADDLE_SOURCE_DIR}/paddle")
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle")
+set(module "framework")
+copy(framework_lib DEPS framework_py_proto
+ SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/framework/framework.pb.h
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+)
+
+set(module "memory")
+copy(memory_lib
+ SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
+)
+
+set(module "inference")
+copy(inference_lib DEPS paddle_fluid_shared
+ SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/inference/libpaddle_fluid.so
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module}
+)
+
+set(module "platform")
+copy(platform_lib
+ SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
+)
+
+set(module "string")
+copy(string_lib
+ SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
+)
+
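+# Building this aggregate target (e.g. `make inference_lib_dist`) copies the
+# fluid headers and libraries listed above into CMAKE_INSTALL_PREFIX.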
+add_custom_target(inference_lib_dist DEPENDS
+ inference_lib framework_lib memory_lib platform_lib string_lib
+ gflags_lib glog_lib protobuf_lib eigen3_lib)
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 94dd3457fb..58ce5d61c9 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -47,3 +47,5 @@ sphinx_add_target(paddle_docs_cn
${SPHINX_CACHE_DIR_CN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
+
+add_subdirectory(api)
diff --git a/doc/api/CMakeLists.txt b/doc/api/CMakeLists.txt
new file mode 100644
index 0000000000..4e0bc1d5b8
--- /dev/null
+++ b/doc/api/CMakeLists.txt
@@ -0,0 +1,20 @@
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+ "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+ "${BINARY_BUILD_DIR_EN}/conf.py"
+ @ONLY)
+
+sphinx_add_target(paddle_api_docs
+ html
+ ${BINARY_BUILD_DIR_EN}
+ ${SPHINX_CACHE_DIR_EN}
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${SPHINX_HTML_DIR_EN})
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index ddf0b055a9..29388f5005 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -87,6 +87,11 @@ roi_pool
.. autoclass:: paddle.v2.layer.roi_pool
:noindex:
+pad
+----
+.. autoclass:: paddle.v2.layer.pad
+ :noindex:
+
Norm Layer
==========
@@ -133,6 +138,11 @@ grumemory
.. autoclass:: paddle.v2.layer.grumemory
:noindex:
+gated_unit
+-----------
+.. autoclass:: paddle.v2.layer.gated_unit
+ :noindex:
+
Recurrent Layer Group
=====================
@@ -340,6 +350,11 @@ bilinear_interp
.. autoclass:: paddle.v2.layer.bilinear_interp
:noindex:
+dropout
+--------
+.. autoclass:: paddle.v2.layer.dropout
+ :noindex:
+
dot_prod
---------
.. autoclass:: paddle.v2.layer.dot_prod
@@ -402,6 +417,11 @@ scale_shift
.. autoclass:: paddle.v2.layer.scale_shift
:noindex:
+factorization_machine
+---------------------
+.. autoclass:: paddle.v2.layer.factorization_machine
+ :noindex:
+
Sampling Layers
===============
@@ -420,22 +440,6 @@ multiplex
.. autoclass:: paddle.v2.layer.multiplex
:noindex:
-Factorization Machine Layer
-============================
-
-factorization_machine
----------------------
-.. autoclass:: paddle.v2.layer.factorization_machine
- :noindex:
-
-Slicing and Joining Layers
-==========================
-
-pad
-----
-.. autoclass:: paddle.v2.layer.pad
- :noindex:
-
.. _api_v2.layer_costs:
Cost Layers
@@ -526,6 +530,11 @@ multibox_loss
.. autoclass:: paddle.v2.layer.multibox_loss
:noindex:
+detection_output
+----------------
+.. autoclass:: paddle.v2.layer.detection_output
+ :noindex:
+
Check Layer
============
@@ -534,31 +543,10 @@ eos
.. autoclass:: paddle.v2.layer.eos
:noindex:
-Miscs
-=====
-
-dropout
---------
-.. autoclass:: paddle.v2.layer.dropout
- :noindex:
-
-Activation with learnable parameter
-===================================
+Activation
+==========
prelu
--------
.. autoclass:: paddle.v2.layer.prelu
:noindex:
-
-gated_unit
------------
-.. autoclass:: paddle.v2.layer.gated_unit
- :noindex:
-
-Detection output Layer
-======================
-
-detection_output
-----------------
-.. autoclass:: paddle.v2.layer.detection_output
- :noindex:
diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst
index 6a8ecc5bb1..02e41564b1 100644
--- a/doc/api/v2/data/dataset.rst
+++ b/doc/api/v2/data/dataset.rst
@@ -73,3 +73,10 @@ wmt14
.. automodule:: paddle.v2.dataset.wmt14
:members:
:noindex:
+
+wmt16
++++++
+
+.. automodule:: paddle.v2.dataset.wmt16
+ :members:
+ :noindex:
diff --git a/doc/howto/dev/build_cn.md b/doc/build_and_install/build_cn.md
similarity index 100%
rename from doc/howto/dev/build_cn.md
rename to doc/build_and_install/build_cn.md
diff --git a/doc/howto/dev/build_en.md b/doc/build_and_install/build_en.md
similarity index 100%
rename from doc/howto/dev/build_en.md
rename to doc/build_and_install/build_en.md
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/build_and_install/build_from_source_cn.rst
similarity index 100%
rename from doc/getstarted/build_and_install/build_from_source_cn.rst
rename to doc/build_and_install/build_from_source_cn.rst
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/build_and_install/build_from_source_en.rst
similarity index 100%
rename from doc/getstarted/build_and_install/build_from_source_en.rst
rename to doc/build_and_install/build_from_source_en.rst
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/build_and_install/docker_install_cn.rst
similarity index 100%
rename from doc/getstarted/build_and_install/docker_install_cn.rst
rename to doc/build_and_install/docker_install_cn.rst
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/build_and_install/docker_install_en.rst
similarity index 100%
rename from doc/getstarted/build_and_install/docker_install_en.rst
rename to doc/build_and_install/docker_install_en.rst
diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/build_and_install/index_cn.rst
similarity index 94%
rename from doc/getstarted/build_and_install/index_cn.rst
rename to doc/build_and_install/index_cn.rst
index c9ba84c842..4220ff2279 100644
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/build_and_install/index_cn.rst
@@ -13,7 +13,7 @@ PaddlePaddle提供pip和Docker的安装方式:
pip_install_cn.rst
docker_install_cn.rst
- ../../howto/dev/build_cn.md
+ build_cn.md
编译流程
++++++++
diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/build_and_install/index_en.rst
similarity index 95%
rename from doc/getstarted/build_and_install/index_en.rst
rename to doc/build_and_install/index_en.rst
index 32d66d63dd..db6b5be742 100644
--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/build_and_install/index_en.rst
@@ -13,7 +13,7 @@ You can choose either pip or Docker to complete your install:
pip_install_en.rst
docker_install_en.rst
- ../../howto/dev/build_en.md
+ build_en.md
Build from Source
diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/build_and_install/paddleci.png
similarity index 100%
rename from doc/getstarted/build_and_install/paddleci.png
rename to doc/build_and_install/paddleci.png
diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/build_and_install/pip_install_cn.rst
similarity index 100%
rename from doc/getstarted/build_and_install/pip_install_cn.rst
rename to doc/build_and_install/pip_install_cn.rst
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/build_and_install/pip_install_en.rst
similarity index 100%
rename from doc/getstarted/build_and_install/pip_install_en.rst
rename to doc/build_and_install/pip_install_en.rst
diff --git a/doc/design/auto_gradient_check.md b/doc/design/auto_gradient_check.md
index f9991541bc..773b7b6a76 100644
--- a/doc/design/auto_gradient_check.md
+++ b/doc/design/auto_gradient_check.md
@@ -1,23 +1,23 @@
-## Auto Gradient Checker Design
+## Auto Gradient Check Design
-## Backgraound:
-- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right:
- 1. you should get the right backpropagation formula according to the forward computation.
- 2. you should implement it right in CPP.
- 3. it's difficult to prepare test data.
+## Background:
+- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges:
+ 1. The backpropagation formula should be derived correctly from the forward computation.
+ 2. The implementation of the above should be correct in C++.
+ 3. It is difficult to prepare unbiased test data.
-- Auto gradient checking gets a numerical gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages:
- 1. numerical gradient checker only need forward operator.
- 2. user only need to prepare the input data for forward Operator.
+- Auto gradient checking gets a numerical gradient using the forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
+ 1. The numerical gradient checker only needs the forward operator.
+ 2. The user only needs to prepare the input data for the forward Operator, and need not worry about the backward Operator.
## Mathematical Theory
-The following two document from Stanford has a detailed explanation of how to get numerical gradient and why it's useful.
+The following documents from Stanford have a detailed explanation of how to compute the numerical gradient and why it is useful.
- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
-## Numeric Gradient Implementation
+## Numerical Gradient Implementation
### Python Interface
```python
def get_numerical_gradient(op,
@@ -27,73 +27,76 @@ def get_numerical_gradient(op,
delta=0.005,
local_scope=None):
"""
- Get Numeric Gradient for an operator's input.
+ Get Numerical Gradient for the input of an operator.
- :param op: C++ operator instance, could be an network
+   :param op: C++ operator instance, could be a network.
:param input_values: The input variables. Should be an dictionary, whose key is
- variable name, and value is numpy array.
+ variable name, and value is a numpy array.
:param output_name: The final output variable name.
- :param input_to_check: The input variable with respect to which to compute the gradient.
- :param delta: The perturbation value for numeric gradient method. The
- smaller delta is, the more accurate result will get. But if that delta is
- too small, it will suffer from numerical stability problem.
+ :param input_to_check: The input variable with respect to which the gradient has to be computed.
+ :param delta: The perturbation value for numerical gradient method. The
+ smaller the delta, the more accurate the result. But if the delta is too
+ small, it will suffer from the numerical stability problem.
:param local_scope: The local scope used for get_numeric_gradient.
:return: The gradient array in numpy format.
"""
```
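+
+A hypothetical call, just to make the parameters concrete (the operator and variable names here are illustrative, not from the codebase):
+
+```python
+numerical_grad = get_numerical_gradient(
+    op=add_op,                      # a C++ operator instance
+    input_values={"X": x, "Y": y},  # numpy arrays keyed by variable name
+    output_name="Out",
+    input_to_check="X",
+    delta=0.005)
+```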
-### Explaination:
+### Explanation:
-- Why need `output_name`
- - An Operator may have multiple Output, one can get independent gradient from each Output. So caller should specify the name of the output variable.
+- Why do we need an `output_name`?
+  - An Operator may have multiple Outputs; one can compute an independent gradient from each Output, so the caller should specify the name of the output variable.
-- Why need `input_to_check`
- - One operator may have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
+- Why do we need `input_to_check`?
+  - One operator can have multiple inputs. The Gradient Op can calculate the gradients of these inputs at the same time, but the numerical gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input; if you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times, each with a different input.
### Core Algorithm Implementation
```python
- # we only compute gradient of one element a time.
+    # we only compute the gradient of one element at a time.
# we use a for loop to compute the gradient of each element.
for i in xrange(tensor_size):
- # get one input element by its index i.
- origin = tensor_to_check.get_float_element(i)
+ # get one input element using the index i.
+ original = tensor_to_check.get_float_element(i)
- # add delta to it, run op and then get the new value of the result tensor.
- x_pos = origin + delta
+ # add delta to it, run the forward op and then
+ # get the new value of the result tensor.
+ x_pos = original + delta
tensor_to_check.set_float_element(i, x_pos)
y_pos = get_output()
- # plus delta to this element, run op and get the new value of the result tensor.
- x_neg = origin - delta
+ # Subtract delta from this element, run the op again
+ # and get the new value of the result tensor.
+ x_neg = original - delta
tensor_to_check.set_float_element(i, x_neg)
y_neg = get_output()
# restore old value
- tensor_to_check.set_float_element(i, origin)
+ tensor_to_check.set_float_element(i, original)
- # compute the gradient of this element and store it into a numpy array.
+ # compute the gradient of this element and store
+ # it into a numpy array.
gradient_flat[i] = (y_pos - y_neg) / delta / 2
# reshape the gradient result to the shape of the source tensor.
return gradient_flat.reshape(tensor_to_check.get_dims())
```
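+
+The update above is the standard central-difference approximation `(f(x + delta) - f(x - delta)) / (2 * delta)`, whose error is O(delta^2), compared with O(delta) for a one-sided difference.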
-## Auto Graident Checker Framework
+## Auto Gradient Check Framework
Each Operator Kernel has three kinds of Gradient:
1. Numerical gradient
2. CPU kernel gradient
-3. GPU kernel gradient (if supported)
+3. GPU kernel gradient (if supported by the device)
-The numerical gradient only relies on forward Operator. So we use the numerical gradient as the reference value. And the gradient checking is performed in the following three steps:
+The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps:
-1. calculate the numerical gradient
-2. calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient
-3. calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient (if supported)
+1. Calculate the numerical gradient.
+2. Calculate the CPU kernel gradient with the backward Operator and compare it with the numerical gradient.
+3. Calculate the GPU kernel gradient with the backward Operator and compare it with the numerical gradient (if supported).
#### Python Interface
@@ -109,26 +112,27 @@ The numerical gradient only relies on forward Operator. So we use the numerical
"""
:param forward_op: used to create backward_op
:param input_vars: numpy value of input variable. The following
- computation will use these variables.
- :param inputs_to_check: the input variable with respect to which to compute the gradient.
+ computation will use these variables.
+ :param inputs_to_check: the input variable with respect to which the
+ gradient will be computed.
:param output_name: The final output variable name.
:param max_relative_error: The relative tolerance parameter.
- :param no_grad_set: used when create backward ops
+ :param no_grad_set: used to create backward ops
:param only_cpu: only compute and check gradient on cpu kernel.
:return:
"""
```
-### How to check if two numpy array is close enough?
-if `abs_numerical_grad` is nearly zero, then use abs error for numerical_grad
+### How to check if two numpy arrays are close enough?
+If `abs_numerical_grad` is nearly zero, then use the absolute error for numerical_grad.
```python
numerical_grad = ...
operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
abs_numerical_grad = numpy.abs(numerical_grad)
-# if abs_numerical_grad is nearly zero, then use abs error for numeric_grad, not relative
-# error.
+# if abs_numerical_grad is nearly zero, then use abs error for
+# numerical_grad, instead of relative error.
abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad
@@ -137,10 +141,10 @@ max_diff = numpy.max(diff_mat)
#### Notes:
-The Input data for auto gradient checker should be reasonable to avoid numerical stability problem.
+The input data for the auto gradient checker should be reasonable, to avoid numerical stability problems.
-#### Refs:
+#### References:
- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
diff --git a/doc/design/csp.md b/doc/design/csp.md
index ba9cacfdea..10d936860f 100644
--- a/doc/design/csp.md
+++ b/doc/design/csp.md
@@ -42,7 +42,7 @@ The type *channel* is conceptually the blocking queue. In Go, its implemented i
The `select` operation has been in OS kernels long before Go language. All Unix kernels implement system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll.
-It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
+It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.
### Type Channel
@@ -71,14 +71,14 @@ ch1 := make(chan int, 100) // a channel that can buffer 100 ints.
In Fluid, we should be able to do the same:
```python
-ch = fluid.make_chan(dtype=INT)
-ch1 = fluid.make_chan(dtype=INT, 100)
+ch = fluid.make_channel(dtype=INT)
+ch1 = fluid.make_channel(dtype=INT, 100)
```
In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
```python
-ch = fluid.make_chan(dtype=Tensor, etype=float16)
+ch = fluid.make_channel(dtype=Tensor, etype=float16)
```
or Tensors of Tensors of float16 etc.
@@ -87,8 +87,136 @@ The point here is that we need a consistent way to compose types, like in C++ we
### Send and Recv
+Go's CSP implementation depends on the data type *channel*. There are two types of channels:
+
+1. The unblocked channel, or buffered channel, is a blocking queue with a buffer of non-zero size. Sending to a buffered channel blocks if the buffer is full, and receiving blocks if the buffer is empty.
+1. The blocked channel, or unbuffered channel, is a blocking queue with no buffer. Both sending and receiving block on an unbuffered channel until the other side is ready.
+
+There are four types of actions with a channel:
+
+1. Create a channel
+
+ ```go
+ ch := make(chan int) // this is an unbuffered channel
+ ch := make(chan int, 100) // this is a buffered channel of 100 ints.
+ ```
+
+1. Send
+
+ ```go
+ ch <- 111
+ ```
+
+1. Recv
+
+ ```go
+   y, ok := <-ch
+ ```
+
+1. Close
+
+ ```go
+ close(ch)
+ ```
+
+   Please be aware that a closed channel is not a nil channel; a nil channel is one that was declared but never made, e.g. `var ch chan int`.
+
+There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
+
+1. A send to a nil channel blocks forever
+
+1. A receive from a nil channel blocks forever
+
+1. A send to a closed channel panics
+
+1. A receive from a closed channel returns any remaining buffered values, and then zero values.
+
+In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h).
+
+The following program illustrates the Python syntax for accessing a Fluid buffered channel.
+
+```python
+import fluid
+
+buffer_size = 10
+ch = fluid.make_channel(dtype=INT, buffer_size)
+
+# Now write buffer_size elements to the channel
+with fluid.while(steps=buffer_size):
+ fluid.send(ch, step)
+
+fluid.close_channel(ch)
+
+with fluid.while(steps=buffer_size):
+ fluid.print(fluid.recv(ch))
+```
+
+The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
+
+```python
+import fluid
+
+ch = fluid.make_channel(dtype=INT)
+
+with fluid.go():
+ fluid.send(ch)
+
+y = fluid.recv(ch)
+
+fluid.close_channel(ch)
+```
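+
+Here `fluid.go` runs the send concurrently with the receive; without it, the unbuffered `send` would block forever, since the receive would never be reached.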
+
### Select
+In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
+
+```go
+
+ch1 := make(chan int)
+ch2 := make(chan int, 100)
+
+x := 0
+
+for {
+    select {
+    case ch1 <- x:
+        x = x + 1
+    case y := <-ch2:
+        fmt.Println("Received", y, "on channel")
+    default:
+        fmt.Println("Default")
+    }
+}
+
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch1 = fluid.make_channel(dtype=INT)
+ch2 = fluid.make_channel(dtype=INT, 100)
+
+sel = fluid.select()
+
+with sel.case(ch1, 'w', X):
+ fluid.layers.increment(X)
+
+with sel.case(ch2, 'r', Y):
+ fluid.print("Received on Channel")
+
+with sel.default():
+ fluid.print("Default")
+
+```
+
+In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
+
+- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to the write mode in Python file I/O.
+
+- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to the read mode in Python file I/O.
+
+- `sel.default()` : This is equivalent to the default case in Go's `select`. If none of the channels are ready for reading or writing, the Fluid code in the default block is executed.
+
## Example Programs
### 1. RPC between Trainers and Parameter Servers
diff --git a/doc/design/switch.md b/doc/design/switch.md
new file mode 100644
index 0000000000..827d0601c6
--- /dev/null
+++ b/doc/design/switch.md
@@ -0,0 +1,31 @@
+### Design Doc: Switch
+
+### Background
+
+Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
+
+The following example shows the usage of `fluid.switch`.
+
+```python
+a = fluid.Var(10)
+b = fluid.Var(0)
+
+with fluid.switch() as switch:
+ with switch.case(fluid.less_equal(a, 10)):
+ fluid.print("Case 1")
+ with switch.case(fluid.larger(a, 0)):
+ fluid.print("Case 2")
+ with switch.default():
+ fluid.print("Case 3")
+```
+
+### The Semantics
+
+1. A `switch` control-flow checks cases one-by-one.
+1. The condition of each case is a scalar boolean value. This differs from the `fluid.if_else` control-flow, whose condition can be a vector of boolean values.
+1. It runs the first matched case, or the default case if there is one.
+1. Once it matches a case, it runs the corresponding branch and only that branch, as if there were a C-style `break` at the end of each case.
+
+The above program should print "Case 1", and only "Case 1".
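+
+To make the dispatch rule concrete, here is a plain-Python model of these semantics; it only illustrates the rule and is not the Fluid implementation:
+
+```python
+# Plain-Python model of the switch semantics: scalar conditions are
+# checked in order, the first true one wins, and at most one branch
+# runs (the implicit C-style `break`).
+def switch_semantics(cases, default=None):
+    for cond, branch in cases:
+        if cond:          # scalar boolean, checked one by one
+            branch()      # run the first matched branch...
+            return        # ...and only that branch
+    if default is not None:
+        default()
+
+a = 10
+switch_semantics(
+    [(a <= 10, lambda: print("Case 1")),
+     (a > 0, lambda: print("Case 2"))],
+    default=lambda: print("Case 3"))  # prints only "Case 1"
+```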
+
+The implementation of the backward pass of the `switch` control-flow is easier than that of `if_else`, because `switch` runs at most one branch, whereas `if_else` could run more than one branch.
diff --git a/doc/howto/dev/FullyConnected.jpg b/doc/dev/FullyConnected.jpg
similarity index 100%
rename from doc/howto/dev/FullyConnected.jpg
rename to doc/dev/FullyConnected.jpg
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/dev/contribute_to_paddle_cn.md
similarity index 100%
rename from doc/howto/dev/contribute_to_paddle_cn.md
rename to doc/dev/contribute_to_paddle_cn.md
diff --git a/doc/dev/contribute_to_paddle_en.md b/doc/dev/contribute_to_paddle_en.md
new file mode 120000
index 0000000000..f939e75f21
--- /dev/null
+++ b/doc/dev/contribute_to_paddle_en.md
@@ -0,0 +1 @@
+../../CONTRIBUTING.md
\ No newline at end of file
diff --git a/doc/dev/index_cn.rst b/doc/dev/index_cn.rst
new file mode 100644
index 0000000000..487db868bb
--- /dev/null
+++ b/doc/dev/index_cn.rst
@@ -0,0 +1,8 @@
+开发标准
+========
+
+.. toctree::
+ :maxdepth: 1
+
+ contribute_to_paddle_cn.md
+ write_docs_cn.rst
diff --git a/doc/dev/index_en.rst b/doc/dev/index_en.rst
new file mode 100644
index 0000000000..5dd12d2233
--- /dev/null
+++ b/doc/dev/index_en.rst
@@ -0,0 +1,9 @@
+Development
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ new_layer_en.rst
+ contribute_to_paddle_en.md
+ write_docs_en.rst
diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/dev/new_layer_cn.rst
similarity index 100%
rename from doc/howto/dev/new_layer_cn.rst
rename to doc/dev/new_layer_cn.rst
diff --git a/doc/howto/dev/new_layer_en.rst b/doc/dev/new_layer_en.rst
similarity index 100%
rename from doc/howto/dev/new_layer_en.rst
rename to doc/dev/new_layer_en.rst
diff --git a/doc/howto/dev/new_op_cn.md b/doc/dev/new_op_cn.md
similarity index 100%
rename from doc/howto/dev/new_op_cn.md
rename to doc/dev/new_op_cn.md
diff --git a/doc/howto/dev/new_op_en.md b/doc/dev/new_op_en.md
similarity index 100%
rename from doc/howto/dev/new_op_en.md
rename to doc/dev/new_op_en.md
diff --git a/doc/howto/dev/new_op_kernel_en.md b/doc/dev/new_op_kernel_en.md
similarity index 100%
rename from doc/howto/dev/new_op_kernel_en.md
rename to doc/dev/new_op_kernel_en.md
diff --git a/doc/howto/dev/use_eigen_cn.md b/doc/dev/use_eigen_cn.md
similarity index 100%
rename from doc/howto/dev/use_eigen_cn.md
rename to doc/dev/use_eigen_cn.md
diff --git a/doc/howto/dev/use_eigen_en.md b/doc/dev/use_eigen_en.md
similarity index 100%
rename from doc/howto/dev/use_eigen_en.md
rename to doc/dev/use_eigen_en.md
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/dev/write_docs_cn.rst
similarity index 98%
rename from doc/howto/dev/write_docs_cn.rst
rename to doc/dev/write_docs_cn.rst
index 1bc947c260..f79769b810 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/dev/write_docs_cn.rst
@@ -1,6 +1,6 @@
-##################
-如何贡献/修改文档
-##################
+#############
+如何贡献文档
+#############
PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
也可以利用PaddlePaddle 工具来编译文档,这种情况下所有的文件会存在整理过的文件目录 .ppo_workspace/content 下
diff --git a/doc/howto/dev/write_docs_en.rst b/doc/dev/write_docs_en.rst
similarity index 98%
rename from doc/howto/dev/write_docs_en.rst
rename to doc/dev/write_docs_en.rst
index b3ef07eb1d..f3408a8426 100644
--- a/doc/howto/dev/write_docs_en.rst
+++ b/doc/dev/write_docs_en.rst
@@ -1,6 +1,6 @@
-##################
+########################
Contribute Documentation
-##################
+########################
PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst
index e695ff283e..608f49f5a9 100644
--- a/doc/getstarted/concepts/use_concepts_cn.rst
+++ b/doc/getstarted/concepts/use_concepts_cn.rst
@@ -4,7 +4,7 @@
PaddlePaddle是源于百度的一个深度学习平台。PaddlePaddle为深度学习研究人员提供了丰富的API,可以轻松地完成神经网络配置,模型训练等任务。
这里将介绍PaddlePaddle的基本使用概念,并且展示了如何利用PaddlePaddle来解决一个经典的线性回归问题。
-在使用该文档之前,请参考 `安装文档 <../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
+在使用该文档之前,请参考 `安装文档 <../../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
配置网络
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index 9f6ee25987..1dc141396b 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -1,61 +1,8 @@
新手入门
============
-.. _quick_install:
-
-快速安装
-++++++++
-
-PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。
-执行下面的命令完成快速安装,版本为cpu_avx_openblas:
-
- .. code-block:: bash
-
- pip install paddlepaddle
-
-如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:
-
- .. code-block:: bash
-
- pip install paddlepaddle-gpu
-
-更详细的安装和编译方法参考:
-
-.. toctree::
- :maxdepth: 1
-
- build_and_install/index_cn.rst
-
-.. _quick_start:
-
-快速开始
-++++++++
-
-创建一个 housing.py 并粘贴此Python代码:
-
- .. code-block:: python
-
- import paddle.v2 as paddle
-
- # Initialize PaddlePaddle.
- paddle.init(use_gpu=False, trainer_count=1)
-
- # Configure the neural network.
- x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
- y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
-
- # Infer using provided test data.
- probs = paddle.infer(
- output_layer=y_predict,
- parameters=paddle.dataset.uci_housing.model(),
- input=[item for item in paddle.dataset.uci_housing.test()()])
-
- for i in xrange(len(probs)):
- print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
-
-执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。
-
.. toctree::
:maxdepth: 1
+ quickstart_cn.rst
concepts/use_concepts_cn.rst
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index 063d9d880c..c680e19037 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -1,61 +1,7 @@
GET STARTED
============
-.. _quick_install:
-
-Quick Install
-----------------------
-
-You can use pip to install PaddlePaddle with a single command, supports
-CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
-Simply run the following command to install, the version is cpu_avx_openblas:
-
- .. code-block:: bash
-
- pip install paddlepaddle
-
-If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
-
- .. code-block:: bash
-
- pip install paddlepaddle-gpu
-
-For more details about installation and build:
-
.. toctree::
:maxdepth: 1
- build_and_install/index_en.rst
-
-
-.. _quick_start:
-
-Quick Start
-++++++++
-
-Create a new file called housing.py, and paste this Python
-code:
-
-
- .. code-block:: python
-
- import paddle.v2 as paddle
-
- # Initialize PaddlePaddle.
- paddle.init(use_gpu=False, trainer_count=1)
-
- # Configure the neural network.
- x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
- y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
-
- # Infer using provided test data.
- probs = paddle.infer(
- output_layer=y_predict,
- parameters=paddle.dataset.uci_housing.model(),
- input=[item for item in paddle.dataset.uci_housing.test()()])
-
- for i in xrange(len(probs)):
- print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
-
-Run :code:`python housing.py` and voila! It should print out a list of predictions
-for the test housing data.
+ quickstart_en.rst
diff --git a/doc/getstarted/quickstart_cn.rst b/doc/getstarted/quickstart_cn.rst
new file mode 100644
index 0000000000..51dd00f1e8
--- /dev/null
+++ b/doc/getstarted/quickstart_cn.rst
@@ -0,0 +1,41 @@
+快速开始
+========
+
+PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。
+执行下面的命令完成快速安装,版本为cpu_avx_openblas:
+
+ .. code-block:: bash
+
+ pip install paddlepaddle
+
+如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:
+
+ .. code-block:: bash
+
+ pip install paddlepaddle-gpu
+
+更详细的安装和编译方法参考 :ref:`install_steps` 。
+
+创建一个 housing.py 并粘贴此Python代码:
+
+ .. code-block:: python
+
+ import paddle.v2 as paddle
+
+ # Initialize PaddlePaddle.
+ paddle.init(use_gpu=False, trainer_count=1)
+
+ # Configure the neural network.
+ x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+ y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+ # Infer using provided test data.
+ probs = paddle.infer(
+ output_layer=y_predict,
+ parameters=paddle.dataset.uci_housing.model(),
+ input=[item for item in paddle.dataset.uci_housing.test()()])
+
+ for i in xrange(len(probs)):
+ print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。
diff --git a/doc/getstarted/quickstart_en.rst b/doc/getstarted/quickstart_en.rst
new file mode 100644
index 0000000000..d1bcf82ea0
--- /dev/null
+++ b/doc/getstarted/quickstart_en.rst
@@ -0,0 +1,45 @@
+Quick Start
+============
+
+You can use pip to install PaddlePaddle with a single command. This supports
+CentOS 6 or later, Ubuntu 14.04 or later, and MacOS 10.12, with Python 2.7 installed.
+Simply run the following command to install the cpu_avx_openblas build:
+
+ .. code-block:: bash
+
+ pip install paddlepaddle
+
+If you need the GPU version (cuda7.5_cudnn5_avx_openblas), run:
+
+ .. code-block:: bash
+
+ pip install paddlepaddle-gpu
+
+For more details on installation and building from source, see :ref:`install_steps` .
+
+Create a new file called housing.py, and paste this Python
+code:
+
+
+ .. code-block:: python
+
+ import paddle.v2 as paddle
+
+ # Initialize PaddlePaddle.
+ paddle.init(use_gpu=False, trainer_count=1)
+
+ # Configure the neural network.
+ x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+ y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+ # Infer using provided test data.
+ probs = paddle.infer(
+ output_layer=y_predict,
+ parameters=paddle.dataset.uci_housing.model(),
+ input=[item for item in paddle.dataset.uci_housing.test()()])
+
+ for i in xrange(len(probs)):
+ print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
diff --git a/doc/howto/usage/capi/compile_paddle_lib_cn.md b/doc/howto/capi/compile_paddle_lib_cn.md
similarity index 99%
rename from doc/howto/usage/capi/compile_paddle_lib_cn.md
rename to doc/howto/capi/compile_paddle_lib_cn.md
index ac5ecffe2e..fd8dec8164 100644
--- a/doc/howto/usage/capi/compile_paddle_lib_cn.md
+++ b/doc/howto/capi/compile_paddle_lib_cn.md
@@ -1,4 +1,4 @@
-## 编译 PaddlePaddle 预测库
+## 安装与编译C-API预测库
### 概述
diff --git a/doc/howto/usage/capi/images/csr.png b/doc/howto/capi/images/csr.png
similarity index 100%
rename from doc/howto/usage/capi/images/csr.png
rename to doc/howto/capi/images/csr.png
diff --git a/doc/howto/usage/capi/images/sequence_data.png b/doc/howto/capi/images/sequence_data.png
similarity index 100%
rename from doc/howto/usage/capi/images/sequence_data.png
rename to doc/howto/capi/images/sequence_data.png
diff --git a/doc/howto/usage/capi/images/workflow_of_CAPI.png b/doc/howto/capi/images/workflow_of_CAPI.png
similarity index 100%
rename from doc/howto/usage/capi/images/workflow_of_CAPI.png
rename to doc/howto/capi/images/workflow_of_CAPI.png
diff --git a/doc/howto/usage/capi/index_cn.rst b/doc/howto/capi/index_cn.rst
similarity index 87%
rename from doc/howto/usage/capi/index_cn.rst
rename to doc/howto/capi/index_cn.rst
index fd774fbc74..e589a6d346 100644
--- a/doc/howto/usage/capi/index_cn.rst
+++ b/doc/howto/capi/index_cn.rst
@@ -1,4 +1,4 @@
-PaddlePaddle C-API
+C-API预测库
==================
.. toctree::
diff --git a/doc/howto/usage/capi/organization_of_the_inputs_cn.md b/doc/howto/capi/organization_of_the_inputs_cn.md
similarity index 100%
rename from doc/howto/usage/capi/organization_of_the_inputs_cn.md
rename to doc/howto/capi/organization_of_the_inputs_cn.md
diff --git a/doc/howto/usage/capi/workflow_of_capi_cn.md b/doc/howto/capi/workflow_of_capi_cn.md
similarity index 99%
rename from doc/howto/usage/capi/workflow_of_capi_cn.md
rename to doc/howto/capi/workflow_of_capi_cn.md
index e0a42fff12..a61d2267bf 100644
--- a/doc/howto/usage/capi/workflow_of_capi_cn.md
+++ b/doc/howto/capi/workflow_of_capi_cn.md
@@ -1,4 +1,4 @@
-## C-API 使用流程
+## C-API使用流程
这篇文档介绍 PaddlePaddle C-API 整体使用流程。
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/cluster/cmd_argument_cn.md
similarity index 56%
rename from doc/howto/usage/cluster/cluster_train_cn.md
rename to doc/howto/cluster/cmd_argument_cn.md
index 0f3db59607..5c575dd5b5 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/cluster/cmd_argument_cn.md
@@ -1,41 +1,7 @@
-# 分布式训练
-
-
-## 概述
-
-本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示:
-
-
-
-- 数据分片(Data shard): 用于训练神经网络的数据,被切分成多个部分,每个部分分别给每个trainer使用。
-- 计算节点(Trainer): 每个trainer启动后读取切分好的一部分数据,开始神经网络的“前馈”和“后馈”计算,并和参数服务器通信。在完成一定量数据的训练后,上传计算得出的梯度(gradients),然后下载优化更新后的神经网络参数(parameters)。
-- 参数服务器(Parameter server):每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度,并完成参数优化更新,再将更新后的参数下发到每个计算节点。
-
-这样,通过计算节点和参数服务器的分布式协作,可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降(SGD)和异步随机梯度下降。
-
-在使用同步SGD训练神经网络时,PaddlePaddle使用同步屏障(barrier),使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中,则并不会等待所有trainer提交梯度才更新参数,这样极大地提高了计算的并行性:参数服务器之间不相互依赖,并行地接收梯度和更新参数,参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步,计算节点之间也不会相互依赖,并行地执行模型的训练。可以看出,虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新,在任意时间某一台参数服务器上保存的参数可能比另一台要更新,与同步SGD相比,梯度会有噪声。
-
-
-## 环境准备
-
-1. 准备您的计算集群。计算集群通常由一组(几台到几千台规模)的Linux服务器组成。服务器之间可以通过局域网(LAN)联通,每台服务器具有集群中唯一的IP地址(或者可被DNS解析的主机名)。集群中的每台计算机通常被成为一个“节点”。
-1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。
-
-安装完成之后,执行下面的命令可以查看已经安装的版本(docker安装方式可以进入docker容器执行:`docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
-```bash
-$ paddle version
-PaddlePaddle 0.10.0, compiled with
- with_avx: ON
- with_gpu: OFF
- with_double: OFF
- with_python: ON
- with_rdma: OFF
- with_timer: OFF
-```
+## 启动参数说明
-下面以`doc/howto/usage/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。
+下面以`doc/howto/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。
-## 启动参数说明
### 启动参数服务器
执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
```bash
@@ -167,22 +133,3 @@ test.txt-00002
- `train_data_dir`:包含训练数据的目录,可以是从分布式存储挂载过来的,也可以是在任务启动前下载到本地的。
- `test_data_dir`:包含测试数据集的目录。
-
-## 使用分布式计算平台或工具
-
-PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务,包括:
-- [Kubernetes](http://kubernetes.io) Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。
-- [OpenMPI](https://www.open-mpi.org) 成熟的高性能并行计算框架。
-- [Fabric](http://www.fabfile.org) 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
-
-对于不同的集群平台,会分别介绍集群作业的启动和停止方法。这些例子都可以在[cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2)找到。
-
-在使用分布式计算平台进行训练时,任务被调度在集群中时,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。
-
-## 在不同集群中运行
-
- - [fabric集群](fabric_cn.md)
- - [openmpi集群](openmpi_cn.md)
- - [kubernetes单机](k8s_cn.md)
- - [kubernetes distributed分布式](k8s_distributed_cn.md)
- - [AWS上运行kubernetes集群训练](k8s_aws_cn.md)
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/cluster/cmd_argument_en.md
similarity index 58%
rename from doc/howto/usage/cluster/cluster_train_en.md
rename to doc/howto/cluster/cmd_argument_en.md
index f9424f8f1a..06fd571756 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/cluster/cmd_argument_en.md
@@ -1,40 +1,7 @@
-# Distributed Training
-
-## Introduction
-
-In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job:
-
-
-
-- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
-- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training.
-- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
-
-PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD.
-
-When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient.
-
-## Preparations
-1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
-2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
-
-After installation, you can check the version by typing the below command (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
-
-```bash
-$ paddle version
-PaddlePaddle 0.10.0rc, compiled with
- with_avx: ON
- with_gpu: OFF
- with_double: OFF
- with_python: ON
- with_rdma: OFF
- with_timer: OFF
-```
-
-We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
-
## Command-line arguments
+We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
+
### Starting parameter server
Type the below command to start a parameter server which will wait for trainers to connect:
@@ -171,21 +138,3 @@ Your workspace may looks like:
- `train_data_dir`: containing training data. Mount from storage service or copy training data to here.
- `test_data_dir`: containing testing data.
-
-## Use cluster platforms or cluster management tools
-
-PaddlePaddle supports running jobs on several platforms including:
-- [Kubernetes](http://kubernetes.io) open-source system for automating deployment, scaling, and management of containerized applications from Google.
-- [OpenMPI](https://www.open-mpi.org) Mature high performance parallel computing framework.
-- [Fabric](http://www.fabfile.org) A cluster management tool. Write scripts to submit jobs or manage the cluster.
-
-We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2).
-
-These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
-
-## Use different clusters
-
- - [fabric](fabric_en.md)
- - [openmpi](openmpi_en.md)
- - [kubernetes](k8s_en.md)
- - [kubernetes on AWS](k8s_aws_en.md)
diff --git a/doc/howto/usage/cluster/fluid_cluster_train_en.md b/doc/howto/cluster/fluid_cluster_train_en.md
similarity index 100%
rename from doc/howto/usage/cluster/fluid_cluster_train_en.md
rename to doc/howto/cluster/fluid_cluster_train_en.md
diff --git a/doc/howto/cluster/index_cn.rst b/doc/howto/cluster/index_cn.rst
new file mode 100644
index 0000000000..c68b2655b6
--- /dev/null
+++ b/doc/howto/cluster/index_cn.rst
@@ -0,0 +1,10 @@
+分布式训练
+==========
+
+.. toctree::
+ :maxdepth: 1
+
+ introduction_cn.md
+ preparations_cn.md
+ cmd_argument_cn.md
+ multi_cluster/index_cn.rst
diff --git a/doc/howto/cluster/index_en.rst b/doc/howto/cluster/index_en.rst
new file mode 100644
index 0000000000..af957e06cd
--- /dev/null
+++ b/doc/howto/cluster/index_en.rst
@@ -0,0 +1,10 @@
+Distributed Training
+====================
+
+.. toctree::
+ :maxdepth: 1
+
+ introduction_en.md
+ preparations_en.md
+ cmd_argument_en.md
+ multi_cluster/index_en.rst
diff --git a/doc/howto/cluster/introduction_cn.md b/doc/howto/cluster/introduction_cn.md
new file mode 100644
index 0000000000..562008a898
--- /dev/null
+++ b/doc/howto/cluster/introduction_cn.md
@@ -0,0 +1,13 @@
+## 概述
+
+本节将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示:
+
+
+
+- 数据分片(Data shard): 用于训练神经网络的数据,被切分成多个部分,每个部分分别给每个trainer使用。
+- 计算节点(Trainer): 每个trainer启动后读取切分好的一部分数据,开始神经网络的“前馈”和“后馈”计算,并和参数服务器通信。在完成一定量数据的训练后,上传计算得出的梯度(gradients),然后下载优化更新后的神经网络参数(parameters)。
+- 参数服务器(Parameter server):每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度,并完成参数优化更新,再将更新后的参数下发到每个计算节点。
+
+这样,通过计算节点和参数服务器的分布式协作,可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降(SGD)和异步随机梯度下降。
+
+在使用同步SGD训练神经网络时,PaddlePaddle使用同步屏障(barrier),使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中,则并不会等待所有trainer提交梯度才更新参数,这样极大地提高了计算的并行性:参数服务器之间不相互依赖,并行地接收梯度和更新参数,参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步,计算节点之间也不会相互依赖,并行地执行模型的训练。可以看出,虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新,在任意时间某一台参数服务器上保存的参数可能比另一台要更新,与同步SGD相比,梯度会有噪声。
diff --git a/doc/howto/cluster/introduction_en.md b/doc/howto/cluster/introduction_en.md
new file mode 100644
index 0000000000..eb70d7cf35
--- /dev/null
+++ b/doc/howto/cluster/introduction_en.md
@@ -0,0 +1,13 @@
+## Introduction
+
+In this section, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed training job:
+
+
+
+- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
+- Trainer: each trainer reads the data shard and trains the neural network. Then the trainer uploads the calculated "gradients" to parameter servers and waits for parameters to be optimized on the parameter server side. When that finishes, the trainer downloads the optimized parameters and continues its training.
+- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
+
+PaddlePaddle supports both synchronous stochastic gradient descent (SGD) and asynchronous SGD.
+
+When training with synchronous SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradient updates and parameter downloads happen in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish uploading at a single step, which increases the parallelism of distributed training: parameter servers do not depend on each other and optimize parameters concurrently; parameter servers do not wait for trainers, so trainers also do their work concurrently. But asynchronous SGD introduces more randomness and noise into the gradients.
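+
+The toy sketch below (illustrative pseudocode only, not the PaddlePaddle API; `wait_for_gradient` and `gradient_queue` are hypothetical helpers) contrasts the two update rules from the parameter server's point of view:
+
+```python
+def sync_sgd_step(trainers, params, lr):
+    # Barrier: collect one gradient from every trainer, then update once.
+    grads = [t.wait_for_gradient() for t in trainers]
+    avg = [sum(g) / float(len(grads)) for g in zip(*grads)]
+    return [p - lr * g for p, g in zip(params, avg)]
+
+def async_sgd_step(gradient_queue, params, lr):
+    # No barrier: apply whichever gradient arrives first. Other trainers
+    # keep running, so this gradient may be stale (noisy) w.r.t. params.
+    g = gradient_queue.get()
+    return [p - lr * gi for p, gi in zip(params, g)]
+```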
diff --git a/doc/howto/usage/cluster/fabric_cn.md b/doc/howto/cluster/multi_cluster/fabric_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/fabric_cn.md
rename to doc/howto/cluster/multi_cluster/fabric_cn.md
diff --git a/doc/howto/usage/cluster/fabric_en.md b/doc/howto/cluster/multi_cluster/fabric_en.md
similarity index 100%
rename from doc/howto/usage/cluster/fabric_en.md
rename to doc/howto/cluster/multi_cluster/fabric_en.md
diff --git a/doc/howto/cluster/multi_cluster/index_cn.rst b/doc/howto/cluster/multi_cluster/index_cn.rst
new file mode 100644
index 0000000000..ef56b6ddb3
--- /dev/null
+++ b/doc/howto/cluster/multi_cluster/index_cn.rst
@@ -0,0 +1,20 @@
+在不同集群中运行
+================
+
+PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务,包括:
+
+- `Kubernetes <http://kubernetes.io>`_ Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。
+- `OpenMPI <https://www.open-mpi.org>`_ 成熟的高性能并行计算框架。
+- `Fabric <http://www.fabfile.org>`_ 集群管理工具。可以使用 ``Fabric`` 编写集群任务提交和管理脚本。
+
+对于不同的集群平台,会分别介绍集群作业的启动和停止方法。这些例子都可以在 `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ 找到。
+
+在使用分布式计算平台进行训练时,任务会被调度到集群中运行,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。
+
+.. toctree::
+ :maxdepth: 1
+
+ fabric_cn.md
+ openmpi_cn.md
+ k8s_cn.md
+ k8s_distributed_cn.md
+ k8s_aws_cn.md
diff --git a/doc/howto/cluster/multi_cluster/index_en.rst b/doc/howto/cluster/multi_cluster/index_en.rst
new file mode 100644
index 0000000000..dac7aaef08
--- /dev/null
+++ b/doc/howto/cluster/multi_cluster/index_en.rst
@@ -0,0 +1,19 @@
+Use different clusters
+======================
+
+PaddlePaddle supports running jobs on several platforms including:
+
+- `Kubernetes <http://kubernetes.io>`_: an open-source system from Google for automating the deployment, scaling, and management of containerized applications.
+- `OpenMPI <https://www.open-mpi.org>`_: a mature high-performance parallel computing framework.
+- `Fabric <http://www.fabfile.org>`_: a cluster management tool for writing scripts to submit jobs or manage the cluster.
+
+We'll introduce cluster job management on these platforms. The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
+
+When a job is dispatched to different nodes, these cluster platforms provide APIs or environment variables with the parameters each training process needs, such as the node ID, node IP, and the total number of nodes.
+
+.. toctree::
+ :maxdepth: 1
+
+ fabric_en.md
+ openmpi_en.md
+ k8s_en.md
+ k8s_aws_en.md
diff --git a/doc/howto/usage/cluster/k8s_aws_cn.md b/doc/howto/cluster/multi_cluster/k8s_aws_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_aws_cn.md
rename to doc/howto/cluster/multi_cluster/k8s_aws_cn.md
diff --git a/doc/howto/usage/cluster/k8s_aws_en.md b/doc/howto/cluster/multi_cluster/k8s_aws_en.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_aws_en.md
rename to doc/howto/cluster/multi_cluster/k8s_aws_en.md
diff --git a/doc/howto/usage/cluster/k8s_cn.md b/doc/howto/cluster/multi_cluster/k8s_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_cn.md
rename to doc/howto/cluster/multi_cluster/k8s_cn.md
diff --git a/doc/howto/usage/cluster/k8s_distributed_cn.md b/doc/howto/cluster/multi_cluster/k8s_distributed_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_distributed_cn.md
rename to doc/howto/cluster/multi_cluster/k8s_distributed_cn.md
diff --git a/doc/howto/usage/cluster/k8s_en.md b/doc/howto/cluster/multi_cluster/k8s_en.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_en.md
rename to doc/howto/cluster/multi_cluster/k8s_en.md
diff --git a/doc/howto/usage/cluster/openmpi_cn.md b/doc/howto/cluster/multi_cluster/openmpi_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/openmpi_cn.md
rename to doc/howto/cluster/multi_cluster/openmpi_cn.md
diff --git a/doc/howto/usage/cluster/openmpi_en.md b/doc/howto/cluster/multi_cluster/openmpi_en.md
similarity index 100%
rename from doc/howto/usage/cluster/openmpi_en.md
rename to doc/howto/cluster/multi_cluster/openmpi_en.md
diff --git a/doc/howto/usage/cluster/src/add_security_group.png b/doc/howto/cluster/multi_cluster/src/add_security_group.png
similarity index 100%
rename from doc/howto/usage/cluster/src/add_security_group.png
rename to doc/howto/cluster/multi_cluster/src/add_security_group.png
diff --git a/doc/howto/usage/cluster/src/create_efs.png b/doc/howto/cluster/multi_cluster/src/create_efs.png
similarity index 100%
rename from doc/howto/usage/cluster/src/create_efs.png
rename to doc/howto/cluster/multi_cluster/src/create_efs.png
diff --git a/doc/howto/usage/cluster/src/k8s-paddle-arch.png b/doc/howto/cluster/multi_cluster/src/k8s-paddle-arch.png
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s-paddle-arch.png
rename to doc/howto/cluster/multi_cluster/src/k8s-paddle-arch.png
diff --git a/doc/howto/usage/cluster/src/k8s_data/Dockerfile b/doc/howto/cluster/multi_cluster/src/k8s_data/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_data/Dockerfile
rename to doc/howto/cluster/multi_cluster/src/k8s_data/Dockerfile
diff --git a/doc/howto/usage/cluster/src/k8s_data/README.md b/doc/howto/cluster/multi_cluster/src/k8s_data/README.md
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_data/README.md
rename to doc/howto/cluster/multi_cluster/src/k8s_data/README.md
diff --git a/doc/howto/usage/cluster/src/k8s_data/get_data.sh b/doc/howto/cluster/multi_cluster/src/k8s_data/get_data.sh
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_data/get_data.sh
rename to doc/howto/cluster/multi_cluster/src/k8s_data/get_data.sh
diff --git a/doc/howto/usage/cluster/src/k8s_train/Dockerfile b/doc/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/Dockerfile
rename to doc/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
diff --git a/doc/howto/usage/cluster/src/k8s_train/README.md b/doc/howto/cluster/multi_cluster/src/k8s_train/README.md
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/README.md
rename to doc/howto/cluster/multi_cluster/src/k8s_train/README.md
diff --git a/doc/howto/usage/cluster/src/k8s_train/start.sh b/doc/howto/cluster/multi_cluster/src/k8s_train/start.sh
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/start.sh
rename to doc/howto/cluster/multi_cluster/src/k8s_train/start.sh
diff --git a/doc/howto/usage/cluster/src/k8s_train/start_paddle.py b/doc/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/start_paddle.py
rename to doc/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py
diff --git a/doc/howto/usage/cluster/src/pserver_and_trainer.png b/doc/howto/cluster/multi_cluster/src/pserver_and_trainer.png
similarity index 100%
rename from doc/howto/usage/cluster/src/pserver_and_trainer.png
rename to doc/howto/cluster/multi_cluster/src/pserver_and_trainer.png
diff --git a/doc/howto/usage/cluster/src/route53_create_recordset.png b/doc/howto/cluster/multi_cluster/src/route53_create_recordset.png
similarity index 100%
rename from doc/howto/usage/cluster/src/route53_create_recordset.png
rename to doc/howto/cluster/multi_cluster/src/route53_create_recordset.png
diff --git a/doc/howto/usage/cluster/src/route53_create_zone.png b/doc/howto/cluster/multi_cluster/src/route53_create_zone.png
similarity index 100%
rename from doc/howto/usage/cluster/src/route53_create_zone.png
rename to doc/howto/cluster/multi_cluster/src/route53_create_zone.png
diff --git a/doc/howto/usage/cluster/src/worker_security_group.png b/doc/howto/cluster/multi_cluster/src/worker_security_group.png
similarity index 100%
rename from doc/howto/usage/cluster/src/worker_security_group.png
rename to doc/howto/cluster/multi_cluster/src/worker_security_group.png
diff --git a/doc/howto/cluster/preparations_cn.md b/doc/howto/cluster/preparations_cn.md
new file mode 100644
index 0000000000..ce40697e70
--- /dev/null
+++ b/doc/howto/cluster/preparations_cn.md
@@ -0,0 +1,16 @@
+## 环境准备
+
+1. 准备您的计算集群。计算集群通常由一组(几台到几千台规模)的Linux服务器组成。服务器之间可以通过局域网(LAN)联通,每台服务器具有集群中唯一的IP地址(或者可被DNS解析的主机名)。集群中的每台计算机通常被称为一个“节点”。
+1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。
+
+安装完成之后,执行下面的命令可以查看已经安装的版本(docker安装方式可以进入docker容器执行:`docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
+```bash
+$ paddle version
+PaddlePaddle 0.10.0, compiled with
+ with_avx: ON
+ with_gpu: OFF
+ with_double: OFF
+ with_python: ON
+ with_rdma: OFF
+ with_timer: OFF
+```
diff --git a/doc/howto/cluster/preparations_en.md b/doc/howto/cluster/preparations_en.md
new file mode 100644
index 0000000000..4b77b29390
--- /dev/null
+++ b/doc/howto/cluster/preparations_en.md
@@ -0,0 +1,17 @@
+## Preparations
+
+1. Prepare your computer cluster. It's normally a group of Linux servers connected by a LAN. Each server is assigned a unique IP address. The computers in the cluster are called "nodes".
+2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install the proper drivers and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using the [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
+
+After installation, you can check the version by typing the below command (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
+
+```bash
+$ paddle version
+PaddlePaddle 0.10.0rc, compiled with
+ with_avx: ON
+ with_gpu: OFF
+ with_double: OFF
+ with_python: ON
+ with_rdma: OFF
+ with_timer: OFF
+```
diff --git a/doc/howto/usage/cluster/src/Dockerfile b/doc/howto/cluster/src/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/src/Dockerfile
rename to doc/howto/cluster/src/Dockerfile
diff --git a/doc/howto/usage/cluster/src/efs_mount.png b/doc/howto/cluster/src/efs_mount.png
similarity index 100%
rename from doc/howto/usage/cluster/src/efs_mount.png
rename to doc/howto/cluster/src/efs_mount.png
diff --git a/doc/howto/usage/cluster/src/managed_policy.png b/doc/howto/cluster/src/managed_policy.png
similarity index 100%
rename from doc/howto/usage/cluster/src/managed_policy.png
rename to doc/howto/cluster/src/managed_policy.png
diff --git a/doc/howto/usage/cluster/src/trainer.png b/doc/howto/cluster/src/trainer.png
similarity index 100%
rename from doc/howto/usage/cluster/src/trainer.png
rename to doc/howto/cluster/src/trainer.png
diff --git a/doc/howto/usage/cluster/src/trainer_cn.png b/doc/howto/cluster/src/trainer_cn.png
similarity index 100%
rename from doc/howto/usage/cluster/src/trainer_cn.png
rename to doc/howto/cluster/src/trainer_cn.png
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py b/doc/howto/cluster/src/word2vec/api_train_v2.py
similarity index 100%
rename from doc/howto/usage/cluster/src/word2vec/api_train_v2.py
rename to doc/howto/cluster/src/word2vec/api_train_v2.py
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py b/doc/howto/cluster/src/word2vec/api_train_v2_cluster.py
similarity index 100%
rename from doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
rename to doc/howto/cluster/src/word2vec/api_train_v2_cluster.py
diff --git a/doc/howto/usage/cluster/src/word2vec/prepare.py b/doc/howto/cluster/src/word2vec/prepare.py
similarity index 100%
rename from doc/howto/usage/cluster/src/word2vec/prepare.py
rename to doc/howto/cluster/src/word2vec/prepare.py
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/cmd_parameter/arguments_cn.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/arguments_cn.md
rename to doc/howto/cmd_parameter/arguments_cn.md
diff --git a/doc/howto/usage/cmd_parameter/arguments_en.md b/doc/howto/cmd_parameter/arguments_en.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/arguments_en.md
rename to doc/howto/cmd_parameter/arguments_en.md
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/cmd_parameter/detail_introduction_cn.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/detail_introduction_cn.md
rename to doc/howto/cmd_parameter/detail_introduction_cn.md
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_en.md b/doc/howto/cmd_parameter/detail_introduction_en.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/detail_introduction_en.md
rename to doc/howto/cmd_parameter/detail_introduction_en.md
diff --git a/doc/howto/usage/cmd_parameter/index_cn.rst b/doc/howto/cmd_parameter/index_cn.rst
similarity index 85%
rename from doc/howto/usage/cmd_parameter/index_cn.rst
rename to doc/howto/cmd_parameter/index_cn.rst
index 4c87298211..17b379f629 100644
--- a/doc/howto/usage/cmd_parameter/index_cn.rst
+++ b/doc/howto/cmd_parameter/index_cn.rst
@@ -1,6 +1,6 @@
.. _cmd_line_index:
-设置命令行参数
+命令行参数设置
===============
.. toctree::
diff --git a/doc/howto/usage/cmd_parameter/index_en.rst b/doc/howto/cmd_parameter/index_en.rst
similarity index 100%
rename from doc/howto/usage/cmd_parameter/index_en.rst
rename to doc/howto/cmd_parameter/index_en.rst
diff --git a/doc/howto/usage/cmd_parameter/use_case_cn.md b/doc/howto/cmd_parameter/use_case_cn.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/use_case_cn.md
rename to doc/howto/cmd_parameter/use_case_cn.md
diff --git a/doc/howto/usage/cmd_parameter/use_case_en.md b/doc/howto/cmd_parameter/use_case_en.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/use_case_en.md
rename to doc/howto/cmd_parameter/use_case_en.md
diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md
deleted file mode 120000
index c97564d93a..0000000000
--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../CONTRIBUTING.md
\ No newline at end of file
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index e0c69f7a6a..0c534f107b 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -1,37 +1,11 @@
-进阶指南
+进阶使用
========
-使用说明
---------
-
-.. toctree::
- :maxdepth: 1
-
- usage/cmd_parameter/index_cn.rst
- usage/cluster/cluster_train_cn.md
- usage/capi/index_cn.rst
-
-开发标准
---------
-
-.. toctree::
- :maxdepth: 1
-
- dev/contribute_to_paddle_cn.md
- dev/write_docs_cn.rst
-
-模型配置
---------
-
-.. toctree::
- :maxdepth: 1
-
- deep_model/rnn/index_cn.rst
-
-性能优化
---------
-
.. toctree::
:maxdepth: 1
+ cmd_parameter/index_cn.rst
+ cluster/index_cn.rst
+ capi/index_cn.rst
+ rnn/index_cn.rst
optimization/gpu_profiling_cn.rst
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 6d1bf7dfc0..ae8b86f75b 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -1,37 +1,10 @@
HOW TO
=======
-Usage
--------
-
-.. toctree::
- :maxdepth: 1
-
- usage/cmd_parameter/index_en.rst
- usage/cluster/cluster_train_en.md
-
-Development
-------------
-
-.. toctree::
- :maxdepth: 1
-
- dev/new_layer_en.rst
- dev/contribute_to_paddle_en.md
- dev/write_docs_en.rst
-
-Configuration
--------------
-
-.. toctree::
- :maxdepth: 1
-
- deep_model/rnn/index_en.rst
-
-Optimization
--------------
-
.. toctree::
:maxdepth: 1
+ cmd_parameter/index_en.rst
+ cluster/index_en.rst
+ rnn/index_en.rst
optimization/gpu_profiling_en.rst
diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling_en.md
similarity index 100%
rename from doc/howto/optimization/cpu_profiling.md
rename to doc/howto/optimization/cpu_profiling_en.md
diff --git a/doc/howto/optimization/gpu_profiling_cn.rst b/doc/howto/optimization/gpu_profiling_cn.rst
index e2b0b0396e..0239eef4f1 100644
--- a/doc/howto/optimization/gpu_profiling_cn.rst
+++ b/doc/howto/optimization/gpu_profiling_cn.rst
@@ -1,6 +1,6 @@
-==================
-GPU性能分析与调优
-==================
+============
+GPU性能调优
+============
.. contents::
diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/rnn/hierarchical_layer_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
rename to doc/howto/rnn/hierarchical_layer_cn.rst
diff --git a/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst b/doc/howto/rnn/hrnn_rnn_api_compare_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
rename to doc/howto/rnn/hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/rnn/index_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/index_cn.rst
rename to doc/howto/rnn/index_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/rnn/index_en.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/index_en.rst
rename to doc/howto/rnn/index_en.rst
diff --git a/doc/howto/deep_model/rnn/recurrent_group_cn.md b/doc/howto/rnn/recurrent_group_cn.md
similarity index 100%
rename from doc/howto/deep_model/rnn/recurrent_group_cn.md
rename to doc/howto/rnn/recurrent_group_cn.md
diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/rnn/rnn_config_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/rnn_config_cn.rst
rename to doc/howto/rnn/rnn_config_cn.rst
diff --git a/doc/howto/deep_model/rnn/rnn_config_en.rst b/doc/howto/rnn/rnn_config_en.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/rnn_config_en.rst
rename to doc/howto/rnn/rnn_config_en.rst
diff --git a/doc/howto/deep_model/rnn/src/bi_lstm.jpg b/doc/howto/rnn/src/bi_lstm.jpg
similarity index 100%
rename from doc/howto/deep_model/rnn/src/bi_lstm.jpg
rename to doc/howto/rnn/src/bi_lstm.jpg
diff --git a/doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png b/doc/howto/rnn/src/encoder-decoder-attention-model.png
similarity index 100%
rename from doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png
rename to doc/howto/rnn/src/encoder-decoder-attention-model.png
diff --git a/doc/howto/deep_model/rnn/src/glossary_rnn.dot b/doc/howto/rnn/src/glossary_rnn.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/glossary_rnn.dot
rename to doc/howto/rnn/src/glossary_rnn.dot
diff --git a/doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot b/doc/howto/rnn/src/glossary_rnn_with_memory.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot
rename to doc/howto/rnn/src/glossary_rnn_with_memory.dot
diff --git a/doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot b/doc/howto/rnn/src/simple_full_hierarchical_recurrent.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot
rename to doc/howto/rnn/src/simple_full_hierarchical_recurrent.dot
diff --git a/doc/howto/deep_model/rnn/src/simple_full_recurrent.dot b/doc/howto/rnn/src/simple_full_recurrent.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/simple_full_recurrent.dot
rename to doc/howto/rnn/src/simple_full_recurrent.dot
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
index ada51c2d73..63a7842858 100644
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -5,7 +5,8 @@ PaddlePaddle 文档
:maxdepth: 1
getstarted/index_cn.rst
+ build_and_install/index_cn.rst
howto/index_cn.rst
+ dev/index_cn.rst
api/index_cn.rst
faq/index_cn.rst
- mobile/index_cn.rst
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 23b64b6cad..5631381be0 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -5,6 +5,7 @@ PaddlePaddle Documentation
:maxdepth: 1
getstarted/index_en.rst
+ build_and_install/index_en.rst
howto/index_en.rst
+ dev/index_en.rst
api/index_en.rst
- mobile/index_en.rst
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
deleted file mode 100644
index 1d99666e58..0000000000
--- a/doc/mobile/index_cn.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-MOBILE
-======
-
-.. toctree::
- :maxdepth: 1
-
- cross_compiling_for_android_cn.md
- cross_compiling_for_ios_cn.md
- cross_compiling_for_raspberry_cn.md
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
deleted file mode 100644
index ef421dacad..0000000000
--- a/doc/mobile/index_en.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-MOBILE
-======
-
-.. toctree::
- :maxdepth: 1
-
- cross_compiling_for_android_en.md
- cross_compiling_for_ios_en.md
- cross_compiling_for_raspberry_en.md
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 8b71f73c36..35e69dcb20 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -24,6 +24,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
+cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
+
cc_test(variable_test SRCS variable_test.cc)
cc_library(threadpool SRCS threadpool.cc DEPS enforce)
@@ -92,11 +94,4 @@ cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-if(NOT WITH_C_API AND WITH_FLUID)
- file(GLOB FRAMEWORK_HEADERS *.h)
- install(FILES ${FRAMEWORK_HEADERS} DESTINATION include/paddle/framework)
- install(FILES ${CMAKE_CURRENT_BINARY_DIR}/framework.pb.h DESTINATION include/paddle/framework)
- install(FILES details/cow_ptr.h details/op_registry.h DESTINATION include/paddle/framework/details)
-endif()
-
cc_test(channel_test SRCS channel_test.cc)
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 85e693434a..f52a51519f 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -534,7 +534,7 @@ ParamGradInfoMap AppendBackward(
auto root_block = program_desc.MutableBlock(root_block_idx);
std::string fill_one_op_out = GradVarName(target.Name());
- bool is_scalar = target.Shape() == std::vector<int64_t>{1};
+ bool is_scalar = target.GetShape() == std::vector<int64_t>{1};
PADDLE_ENFORCE(is_scalar, "target should be scalar");
VLOG(3) << "backward from loss=" << target.Name()
<< " data_type=" << target.GetDataType();
@@ -565,7 +565,7 @@ ParamGradInfoMap AppendBackward(
auto var = root_block->Var(fill_one_op_out);
var->SetDataType(target.GetDataType());
- var->SetShape(target.Shape());
+ var->SetShape(target.GetShape());
auto& target_grad = retv[target.Name()];
target_grad.name_ = fill_one_op_out;
target_grad.block_idx_ = root_block_idx;
diff --git a/paddle/framework/channel.h b/paddle/framework/channel.h
index 0570980c5a..b679387b11 100644
--- a/paddle/framework/channel.h
+++ b/paddle/framework/channel.h
@@ -23,8 +23,8 @@ namespace framework {
template <typename T>
class Channel {
public:
- virtual void Send(T*) = 0;
- virtual void Receive(T*) = 0;
+ virtual bool Send(T*) = 0;
+ virtual bool Receive(T*) = 0;
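+ // Both calls now report success: they return true when the operation
+ // completes and false once the channel has been closed. Note that a
+ // closed buffered channel still drains residual values before Receive
+ // starts returning false (see channel_test.cc in this change).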
virtual size_t Cap() = 0;
virtual void Close() = 0;
virtual ~Channel() {}
diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
index 1510fb8abf..df9e15e22b 100644
--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@@ -29,16 +29,16 @@ TEST(Channel, MakeAndClose) {
{
// MakeChannel should return a buffered channel if buffer_size > 0.
auto ch = MakeChannel<size_t>(10);
- EXPECT_NE(dynamic_cast<Buffered<size_t>*>(ch), nullptr);
- EXPECT_EQ(dynamic_cast<UnBuffered<size_t>*>(ch), nullptr);
+ EXPECT_NE(dynamic_cast<Buffered<size_t> *>(ch), nullptr);
+ EXPECT_EQ(dynamic_cast<UnBuffered<size_t> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
{
// MakeChannel should return an un-buffered channel if buffer_size == 0.
auto ch = MakeChannel<size_t>(0);
- EXPECT_EQ(dynamic_cast<Buffered<size_t>*>(ch), nullptr);
- EXPECT_NE(dynamic_cast<UnBuffered<size_t>*>(ch), nullptr);
+ EXPECT_EQ(dynamic_cast<Buffered<size_t> *>(ch), nullptr);
+ EXPECT_NE(dynamic_cast<UnBuffered<size_t> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
@@ -48,18 +48,59 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) {
- ch->Send(&i); // should not block
+ EXPECT_EQ(ch->Send(&i), true); // should not block
}
size_t out;
for (size_t i = 0; i < buffer_size; ++i) {
- ch->Receive(&out); // should not block
+ EXPECT_EQ(ch->Receive(&out), true); // should not block
EXPECT_EQ(out, i);
}
CloseChannel(ch);
delete ch;
}
+TEST(Channel, SendOnClosedChannelPanics) {
+ const size_t buffer_size = 10;
+ auto ch = MakeChannel<size_t>(buffer_size);
+ size_t i = 5;
+ EXPECT_EQ(ch->Send(&i), true); // should not block or panic
+ CloseChannel(ch);
+ EXPECT_EQ(ch->Send(&i), false); // send on a closed channel should fail
+ delete ch;
+}
+
+TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
+ const size_t buffer_size = 10;
+ auto ch = MakeChannel<size_t>(buffer_size);
+
+ for (size_t i = 0; i < buffer_size; ++i) {
+ EXPECT_EQ(ch->Send(&i), true); // sending should not block
+ }
+
+ size_t out;
+ for (size_t i = 0; i < buffer_size / 2; ++i) {
+ EXPECT_EQ(ch->Receive(&out), true); // receiving should not block
+ EXPECT_EQ(out, i);
+ }
+
+ CloseChannel(ch);
+
+ for (size_t i = buffer_size / 2; i < buffer_size; ++i) {
+ EXPECT_EQ(ch->Receive(&out),
+ true); // receiving should return residual values.
+ EXPECT_EQ(out, i);
+ }
+
+ for (size_t i = 0; i < buffer_size; ++i) {
+ EXPECT_EQ(ch->Receive(&out),
+ false); // after receiving residual values, return zeros.
+ // Note: we cannot check EXPECT_EQ(out, 0), because C++ doesn't
+ // define zero values like Go does.
+ }
+ delete ch;
+}
+
TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
@@ -67,7 +108,10 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
std::thread t([&]() {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
- ch->Send(&i); // should not block
+ if (i < buffer_size)
+ EXPECT_EQ(ch->Send(&i), true); // first buffer_size sends should succeed
+ else
+ EXPECT_EQ(ch->Send(&i), false); // then Send blocks until the channel is closed, and fails
sum += i;
}
});
@@ -78,3 +122,262 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
t.join();
delete ch;
}
+
+TEST(Channel, SimpleUnbufferedChannelTest) {
+ auto ch = MakeChannel<int>(0);
+ unsigned sum_send = 0;
+ std::thread t([&]() {
+ for (int i = 0; i < 5; i++) {
+ EXPECT_EQ(ch->Send(&i), true);
+ sum_send += i;
+ }
+ });
+ for (int i = 0; i < 5; i++) {
+ int recv;
+ EXPECT_EQ(ch->Receive(&recv), true);
+ EXPECT_EQ(recv, i);
+ }
+
+ CloseChannel(ch);
+ t.join();
+ EXPECT_EQ(sum_send, 10U);
+ delete ch;
+}
+
+// This tests that closing a buffered channel also unblocks
+// any receivers waiting on the channel
+TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
+ auto ch = MakeChannel<int>(1);
+ size_t num_threads = 5;
+ std::thread t[num_threads];
+ bool thread_ended[num_threads];
+
+ // Launches threads that try to read and are blocked because of no writers
+ for (size_t i = 0; i < num_threads; i++) {
+ thread_ended[i] = false;
+ t[i] = std::thread(
+ [&](bool *p) {
+ int data;
+ // All reads should return false
+ EXPECT_EQ(ch->Receive(&data), false);
+ *p = true;
+ },
+ &thread_ended[i]);
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
+
+ // Verify that all threads are blocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], false);
+ }
+
+ // Explicitly close the channel
+ // This should unblock all receivers
+ CloseChannel(ch);
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
+
+ // Verify that all threads got unblocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], true);
+ }
+
+ for (size_t i = 0; i < num_threads; i++) t[i].join();
+ delete ch;
+}
+
+// This tests that closing a buffered channel also unblocks
+// any senders waiting for channel to have write space
+TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
+ auto ch = MakeChannel<int>(1);
+ size_t num_threads = 5;
+ std::thread t[num_threads];
+ bool thread_ended[num_threads];
+ bool send_success[num_threads];
+
+ // Launches threads that try to write and are blocked because of no readers
+ for (size_t i = 0; i < num_threads; i++) {
+ thread_ended[i] = false;
+ send_success[i] = false;
+ t[i] = std::thread(
+ [&](bool *ended, bool *success) {
+ int data = 10;
+ *success = ch->Send(&data);
+ *ended = true;
+ },
+ &thread_ended[i], &send_success[i]);
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
+
+ // Verify that at least 4 threads are blocked
+ int ct = 0;
+ for (size_t i = 0; i < num_threads; i++) {
+ if (thread_ended[i] == false) ct++;
+ }
+ // At least 4 threads must be blocked
+ EXPECT_GE(ct, 4);
+
+ // Explicitly close the channel
+ // This should unblock all senders
+ CloseChannel(ch);
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
+
+ // Verify that all threads got unblocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], true);
+ }
+
+ // Verify that only 1 send was successful
+ ct = 0;
+ for (size_t i = 0; i < num_threads; i++) {
+ if (send_success[i]) ct++;
+ }
+ // Only 1 send must be successful
+ EXPECT_EQ(ct, 1);
+
+ for (size_t i = 0; i < num_threads; i++) t[i].join();
+ delete ch;
+}
+
+// This tests that closing an unbuffered channel also unblocks
+// any receivers waiting for senders
+TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
+ auto ch = MakeChannel<int>(0);
+ size_t num_threads = 5;
+ std::thread t[num_threads];
+ bool thread_ended[num_threads];
+
+ // Launches threads that try to read and are blocked because of no writers
+ for (size_t i = 0; i < num_threads; i++) {
+ thread_ended[i] = false;
+ t[i] = std::thread(
+ [&](bool *p) {
+ int data;
+ EXPECT_EQ(ch->Receive(&data), false);
+ *p = true;
+ },
+ &thread_ended[i]);
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
+
+ // Verify that all the threads are blocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], false);
+ }
+
+ // Explicitly close the channel
+ // This should unblock all receivers
+ CloseChannel(ch);
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
+
+ // Verify that all threads got unblocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], true);
+ }
+
+ for (size_t i = 0; i < num_threads; i++) t[i].join();
+ delete ch;
+}
+
+// This tests that closing an unbuffered channel also unblocks
+// any senders waiting for receivers
+TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
+ auto ch = MakeChannel<int>(0);
+ size_t num_threads = 5;
+ std::thread t[num_threads];
+ bool thread_ended[num_threads];
+
+ // Launches threads that try to write and are blocked because of no readers
+ for (size_t i = 0; i < num_threads; i++) {
+ thread_ended[i] = false;
+ t[i] = std::thread(
+ [&](bool *p) {
+ int data = 10;
+ EXPECT_EQ(ch->Send(&data), false);
+ *p = true;
+ },
+ &thread_ended[i]);
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
+
+ // Verify that all the threads are blocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], false);
+ }
+
+ // Explicitly close the channel
+ // This should unblock all senders
+ CloseChannel(ch);
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
+
+ // Verify that all threads got unblocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], true);
+ }
+
+ for (size_t i = 0; i < num_threads; i++) t[i].join();
+ delete ch;
+}
+
+TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
+ auto ch = MakeChannel<int>(0);
+ unsigned sum_send = 0;
+ // Send should block after three iterations
+ // since we only have three receivers.
+ std::thread t([&]() {
+ // Try to send more number of times
+ // than receivers
+ for (int i = 0; i < 4; i++) {
+ ch->Send(&i);
+ sum_send += i;
+ }
+ });
+ for (int i = 0; i < 3; i++) {
+ int recv;
+ ch->Receive(&recv);
+ EXPECT_EQ(recv, i);
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
+ EXPECT_EQ(sum_send, 3U);
+
+ CloseChannel(ch);
+ t.join();
+ delete ch;
+}
+
+TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
+ auto ch = MakeChannel<int>(0);
+ unsigned sum_send = 0;
+ unsigned sum_receive = 0;
+ // The receiver should block after 5
+ // iterations, since there are only 5 senders.
+ std::thread t([&]() {
+ for (int i = 0; i < 8; i++) {
+ int recv;
+ ch->Receive(&recv); // should block after the fifth iteration.
+ EXPECT_EQ(recv, i);
+ sum_receive += i;
+ }
+ });
+ for (int i = 0; i < 5; i++) {
+ ch->Send(&i);
+ sum_send += i;
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
+ EXPECT_EQ(sum_send, 10U);
+ EXPECT_EQ(sum_receive, 10U);
+ // send three more elements
+ for (int i = 5; i < 8; i++) {
+ ch->Send(&i);
+ sum_send += i;
+ }
+
+ CloseChannel(ch);
+ t.join();
+ EXPECT_EQ(sum_send, 28U);
+ EXPECT_EQ(sum_receive, 28U);
+ delete ch;
+}
diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h
index b093e15892..00b63da4da 100644
--- a/paddle/framework/details/buffered_channel.h
+++ b/paddle/framework/details/buffered_channel.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
+#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>
@@ -30,8 +31,8 @@ class Buffered : public paddle::framework::Channel<T> {
friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public:
- virtual void Send(T*);
- virtual void Receive(T*);
+ virtual bool Send(T*);
+ virtual bool Receive(T*);
virtual size_t Cap() { return cap_; }
virtual void Close();
virtual ~Buffered();
@@ -42,17 +43,21 @@ class Buffered : public paddle::framework::Channel<T> {
std::condition_variable empty_cond_var_;
std::condition_variable full_cond_var_;
std::deque<T> channel_;
- bool closed_;
+ std::atomic<bool> closed_{false};
Buffered(size_t cap) : cap_(cap), closed_(false) {
PADDLE_ENFORCE_GT(cap, 0);
}
- void NotifyAllSenders(std::unique_lock<std::mutex>*);
+ void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};
template <typename T>
-void Buffered<T>::Send(T* item) {
+bool Buffered<T>::Send(T* item) {
+ bool ret = false;
+ if (closed_) {
+ return ret;
+ }
std::unique_lock<std::mutex> lock(mu_);
full_cond_var_.wait(lock,
[this]() { return channel_.size() < cap_ || closed_; });
@@ -60,27 +65,33 @@ void Buffered<T>::Send(T* item) {
channel_.push_back(std::move(*item));
lock.unlock();
empty_cond_var_.notify_one();
+ ret = true;
}
+ return ret;
}
template <typename T>
-void Buffered<T>::Receive(T* item) {
+bool Buffered<T>::Receive(T* item) {
std::unique_lock<std::mutex> lock(mu_);
empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
- if (!closed_) {
+ bool ret = false;
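+ // Drain any residual buffered items even after Close(); Receive only
+ // fails once the buffer is empty on a closed channel.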
+ if (!channel_.empty()) {
*item = std::move(channel_.front());
channel_.pop_front();
- NotifyAllSenders(&lock);
- } else {
- item = nullptr;
+ full_cond_var_.notify_one();
+ ret = true;
}
+ return ret;
}
template <typename T>
void Buffered<T>::Close() {
+ if (closed_) {
+ return;
+ }
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
- NotifyAllSenders(&lock);
+ NotifyAllParticipants(&lock);
}
template <typename T>
@@ -88,13 +99,14 @@ Buffered<T>::~Buffered() {
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
channel_.clear();
- NotifyAllSenders(&lock);
+ NotifyAllParticipants(&lock);
}
template <typename T>
-void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
+void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock();
full_cond_var_.notify_all();
+ empty_cond_var_.notify_all();
}
} // namespace details
diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h
index cc2d2e587e..815cebad2d 100644
--- a/paddle/framework/details/unbuffered_channel.h
+++ b/paddle/framework/details/unbuffered_channel.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
+#include <atomic>
#include <condition_variable>
-#include <deque>
#include <mutex>
#include "paddle/framework/channel.h"
@@ -29,27 +29,123 @@ class UnBuffered : public paddle::framework::Channel<T> {
friend void paddle::framework::CloseChannel(Channel<T>*);
public:
- virtual void Send(T*);
- virtual void Receive(T*);
+ virtual bool Send(T*);
+ virtual bool Receive(T*);
virtual size_t Cap() { return 0; }
virtual void Close();
virtual ~UnBuffered();
private:
- UnBuffered() {}
+ std::mutex mu_ch_;
+ // Mutex for readers and writers who are waiting for other reader
+ // and writer to complete execution
+ std::recursive_mutex mu_read_, mu_write_;
+ // reader_found_ is set true when a reader is ready to accept data
+ // writer_found_ is set true when a writer is ready to send data
+ // A transaction occurs only when both are true
+ std::atomic<bool> reader_found_{false}, writer_found_{false};
+ std::condition_variable cv_channel_;
+ std::condition_variable_any cv_reader_, cv_writer_;
+ T* item{nullptr};
+ std::atomic<bool> closed_{false};
+
+ UnBuffered() : closed_(false) {}
+
+ void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};
+// This function implements the concept of how data should
+// be sent from a writer to a reader.
+template <typename T>
+bool UnBuffered<T>::Send(T* data) {
+ bool ret = false;
+ if (closed_) {
+ return ret;
+ }
+ // Prevent other writers from entering
+ std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
+ writer_found_ = true;
+ std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
+ // If writer comes first, it should wait till a reader arrives
+ cv_writer_.wait(cv_lock,
+ [this]() { return reader_found_ == true || closed_; });
+ cv_reader_.notify_one();
+ if (!closed_) {
+ std::unique_lock<std::mutex> channel_lock(mu_ch_);
+ item = data;
+ channel_lock.unlock();
+ cv_channel_.notify_one();
+ channel_lock.lock();
+ cv_channel_.wait(channel_lock,
+ [this]() { return item == nullptr || closed_; });
+ ret = true;
+ }
+ writer_found_ = false;
+ return ret;
+}
+
+// This function implements the concept of how
+// data that was sent by a writer is read from a reader.
template <typename T>
-void UnBuffered<T>::Send(T* channel_element) {}
+bool UnBuffered<T>::Receive(T* data) {
+ // Prevent other readers from entering
+ std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
+ reader_found_ = true;
+ std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
+ // If reader comes first, it should wait till a writer arrives
+ cv_reader_.wait(cv_lock,
+ [this]() { return writer_found_ == true || closed_; });
+ cv_writer_.notify_one();
+ bool ret = false;
+ if (!closed_) {
+ std::unique_lock<std::mutex> lock_ch{mu_ch_};
+ // Reader should wait for the writer to first write its data
+ cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
+ if (!closed_) {
+ *data = std::move(*item);
+ item = nullptr;
+ lock_ch.unlock();
+ ret = true;
+ }
+ cv_channel_.notify_one();
+ }
+ reader_found_ = false;
+ return ret;
+}
+// This function implements the sequence of events
+// that take place once the channel is closed.
template <typename T>
-void UnBuffered<T>::Receive(T*) {}
+void UnBuffered<T>::Close() {
+ if (closed_) {
+ return;
+ }
+ std::unique_lock<std::mutex> lock(mu_ch_);
+ item = nullptr;
+ closed_ = true;
+ NotifyAllParticipants(&lock);
+}
+// This function implements the sequence of events
+// that are executed once the object of an UnBuffered
+// channel is destroyed.
template <typename T>
-void UnBuffered<T>::Close() {}
+UnBuffered<T>::~UnBuffered() {
+ std::unique_lock<std::mutex> lock(mu_ch_);
+ item = nullptr;
+ closed_ = true;
+ NotifyAllParticipants(&lock);
+}
+// This function notifies all the readers, writers and
+// the channel condition variables.
template <typename T>
-UnBuffered<T>::~UnBuffered() {}
+void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
+ lock->unlock();
+ cv_writer_.notify_all();
+ cv_channel_.notify_all();
+ cv_reader_.notify_all();
+}
} // namespace details
} // namespace framework
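The comments above describe a rendezvous protocol: reader_found_ and writer_found_ pair one sender with one receiver, and the item pointer hands over exactly one value per pairing. A sketch of that pairing, not part of the patch (the payload is illustrative):

#include <thread>
#include "paddle/framework/channel.h"

void RendezvousSketch() {
  auto* ch = paddle::framework::MakeChannel<int>(0);  // capacity 0: unbuffered
  std::thread receiver([&] {
    int got = 0;
    bool ok = ch->Receive(&got);  // blocks until a sender arrives
    // ok == true and got == 42 if the handshake completed before Close()
  });
  int payload = 42;
  ch->Send(&payload);  // blocks until the receiver is ready, then hands over
  receiver.join();
  paddle::framework::CloseChannel(ch);
  delete ch;
}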
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 9a232b0843..2a88e5a929 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
+#include "paddle/framework/reader.h"
#include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
@@ -52,11 +53,13 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarDesc::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
+ } else if (var_type == proto::VarDesc::READER) {
+ var->GetMutable<ReaderHolder>();
} else {
PADDLE_THROW(
"Variable type %d is not in "
- "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE,"
- " PLACE_LIST]",
+ "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
+ "LOD_RANK_TABLE, PLACE_LIST, READER]",
var_type);
}
}
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 5b6ef03f61..d7be1a7352 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -116,6 +116,8 @@ message LoDTensorArrayDesc {
optional int32 lod_level = 2 [ default = 0 ];
}
+message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; }
+
message VarDesc {
enum VarType {
LOD_TENSOR = 1;
@@ -126,13 +128,15 @@ message VarDesc {
LOD_RANK_TABLE = 6;
LOD_TENSOR_ARRAY = 7;
PLACE_LIST = 8;
+ READER = 9;
}
required string name = 1;
required VarType type = 2;
- optional LoDTensorDesc lod_tensor = 3;
- optional TensorDesc selected_rows = 4;
+ optional bool persistable = 3 [ default = false ];
+ optional LoDTensorDesc lod_tensor = 4;
+ optional TensorDesc selected_rows = 5;
optional LoDTensorArrayDesc tensor_array = 6;
- optional bool persistable = 5 [ default = false ];
+ optional ReaderDesc reader = 7;
}
message BlockDesc {
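A READER variable thus carries one LoDTensorDesc per output of the reader. A sketch of filling the new message through the generated protobuf API (assuming standard protobuf code generation; the name, dims, and data types are illustrative):

#include "paddle/framework/framework.pb.h"

paddle::framework::proto::VarDesc MakeReaderVarDesc() {
  namespace proto = paddle::framework::proto;
  proto::VarDesc var;
  var.set_name("data_reader");
  var.set_type(proto::VarDesc::READER);
  proto::LoDTensorDesc* image = var.mutable_reader()->add_lod_tensor();
  image->mutable_tensor()->set_data_type(proto::FP32);
  image->mutable_tensor()->add_dims(-1);  // batch dimension
  image->mutable_tensor()->add_dims(784);
  image->set_lod_level(0);
  proto::LoDTensorDesc* label = var.mutable_reader()->add_lod_tensor();
  label->mutable_tensor()->set_data_type(proto::INT64);
  label->mutable_tensor()->add_dims(-1);
  label->mutable_tensor()->add_dims(1);
  label->set_lod_level(0);
  return var;
}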
diff --git a/paddle/framework/mixed_vector.h b/paddle/framework/mixed_vector.h
index 85caac8dcd..422fbbac48 100644
--- a/paddle/framework/mixed_vector.h
+++ b/paddle/framework/mixed_vector.h
@@ -60,6 +60,14 @@ class Vector : public std::vector<T> {
T *data() { return std::vector<T>::data(); }
const T *data() const { return std::vector<T>::data(); }
+ T *data(const platform::Place &place) {
+ if (platform::is_cpu_place(place)) {
+ return data();
+ } else {
+ return cuda_data();
+ }
+ }
+
/* Synchronize host vector to device vector */
void CopyToCUDA();
/* Synchronize device vector to host vector */
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 46c50d9250..b51afe499b 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
bool HasOutputs(const std::string &name) const override;
- DDim GetInputDim(const std::string &name) const override;
-
- void SetOutputDim(const std::string &name, const DDim &dim) override;
-
AttrReader Attrs() const override;
const std::vector<std::string> &Inputs(
@@ -76,6 +72,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
void SetDim(const std::string &name, const DDim &dim) override;
+ std::vector<DDim> GetRepeatedDims(const std::string &name) const override;
+
+ void SetRepeatedDims(const std::string &name,
+ const std::vector<DDim> &dims) override;
+
const OpDesc &op_;
const BlockDesc &block_;
};
@@ -124,7 +125,7 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
// restore attrs_
for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
std::string attr_name = attr.name();
- // The sub_block referred to by the BLOCK attr hasn't be added
+ // The sub_block referred to by the BLOCK attr hasn't been added
// to ProgramDesc class yet, we skip setting BLOCK attr here.
if (attr.type() != proto::AttrType::BLOCK) {
attrs_[attr_name] = GetAttrValue(attr);
@@ -443,21 +444,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
return true;
}
-DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const {
- std::vector<DDim> ddims = GetInputsDim(name);
- auto length = ddims.size();
- PADDLE_ENFORCE_EQ(length, 1UL,
- "Input(%s) should have 1 value, "
- "but it has %d now",
- name, length);
- return ddims[0];
-}
-
-void CompileTimeInferShapeContext::SetOutputDim(const std::string &name,
- const DDim &dim) {
- SetOutputsDim(name, {dim});
-}
-
AttrReader CompileTimeInferShapeContext::Attrs() const {
return AttrReader(op_.GetAttrMap());
}
@@ -475,23 +461,48 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+ DDim res;
try {
- auto shape = var->Shape();
- if (shape.empty()) {
- return framework::make_ddim({0UL});
- } else {
- return framework::make_ddim(var->Shape());
- }
+ auto shape = var->GetShape();
+ res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
} catch (...) {
VLOG(5) << "GetDim of variable " << name << " error";
std::rethrow_exception(std::current_exception());
}
+ return res;
+}
+
+std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
+ const std::string &name) const {
+ auto var = block_.FindVarRecursive(name);
+ PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+ std::vector<DDim> res;
+ try {
+ auto shapes = var->GetShapes();
+ for (const auto &s : shapes) {
+ res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s));
+ }
+ } catch (...) {
+ VLOG(5) << "GetRepeatedDim of variable " << name << " error.";
+ std::rethrow_exception(std::current_exception());
+ }
+ return res;
}
void CompileTimeInferShapeContext::SetDim(const std::string &name,
const DDim &dim) {
- block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
+ block_.FindVarRecursive(name)->SetShape(vectorize(dim));
+}
+
+void CompileTimeInferShapeContext::SetRepeatedDims(
+ const std::string &name, const std::vector<DDim> &dims) {
+ auto var = block_.FindVarRecursive(name);
+ PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+ std::vector<std::vector<int64_t>> dim_vec(dims.size());
+ std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize);
+ var->SetShapes(dim_vec);
}
+
bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 4e854f54dd..52387aabd9 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -320,8 +320,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
if (length == 0) {
return false;
}
- PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs",
- name);
+ PADDLE_ENFORCE_EQ(length, 1UL,
+ "Input %s should not have more than one inputs", name);
auto ipt = ins[0];
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr;
@@ -333,8 +333,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
if (length == 0) {
return false;
}
- PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs",
- name);
+ PADDLE_ENFORCE_EQ(length, 1UL,
+ "Output %s should not have more than one inputs", name);
auto ipt = outs[0];
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr;
@@ -366,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
return true;
}
- DDim GetInputDim(const std::string& name) const override {
- return GetDim(op_.Input(name));
- }
-
- void SetOutputDim(const std::string& name, const DDim& dim) override {
- SetDim(op_.Output(name), dim);
- }
-
AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
const std::vector<std::string>& Inputs(
@@ -429,8 +421,22 @@ class RuntimeInferShapeContext : public InferShapeContext {
} else if (var->IsType<SelectedRows>()) {
return var->Get<SelectedRows>().GetCompleteDims();
} else {
- PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
- name, var->Type().name());
+ PADDLE_THROW(
+ "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
+ "type_id is %s.",
+ name, var->Type().name());
+ }
+ }
+
+ std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
+ Variable* var = scope_.FindVar(name);
+ if (var->IsType<ReaderHolder>()) {
+ return var->Get<ReaderHolder>().shapes();
+ } else {
+ PADDLE_THROW(
+ "Only ReaderHolder support 'GetRepeatedDims', but Variable %s's "
+ "type_id is %s.",
+ name, var->Type().name());
}
}
@@ -446,6 +452,19 @@ class RuntimeInferShapeContext : public InferShapeContext {
}
}
+ void SetRepeatedDims(const std::string& name,
+ const std::vector<DDim>& dims) override {
+ Variable* var = scope_.FindVar(name);
+ if (var->IsType<ReaderHolder>()) {
+ var->GetMutable<ReaderHolder>()->set_shapes(dims);
+ } else {
+ PADDLE_THROW(
+ "Only ReaderHolder support 'SetRepeatedDims', but Variable %s's "
+ "type_id is %s.",
+ name, var->Type().name());
+ }
+ }
+
proto::VarDesc::VarType GetVarType(const std::string& name) const override {
auto* var = scope_.FindVar(name);
return ToVarType(var->Type());
diff --git a/paddle/framework/program_desc_test.cc b/paddle/framework/program_desc_test.cc
index 59947c9f21..9945aee31b 100644
--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
@@ -53,7 +53,7 @@ TEST(ProgramDesc, copy_ctor) {
ASSERT_NE(copy, var_before);
ASSERT_EQ(copy->Name(), var_before->Name());
ASSERT_EQ(copy->GetType(), var_before->GetType());
- ASSERT_EQ(copy->Shape(), var_before->Shape());
+ ASSERT_EQ(copy->GetShape(), var_before->GetShape());
ASSERT_EQ(copy->Proto()->SerializeAsString(),
var_before->Proto()->SerializeAsString());
};
@@ -117,7 +117,7 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
ASSERT_NE(restored, var_before);
ASSERT_EQ(restored->Name(), var_before->Name());
ASSERT_EQ(restored->GetType(), var_before->GetType());
- ASSERT_EQ(restored->Shape(), var_before->Shape());
+ ASSERT_EQ(restored->GetShape(), var_before->GetShape());
ASSERT_EQ(restored->Proto()->SerializeAsString(),
var_before->Proto()->SerializeAsString());
};
diff --git a/paddle/framework/reader.cc b/paddle/framework/reader.cc
new file mode 100644
index 0000000000..928b661aaa
--- /dev/null
+++ b/paddle/framework/reader.cc
@@ -0,0 +1,122 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/reader.h"
+
+namespace paddle {
+namespace framework {
+
+DDim ReaderBase::shape(size_t idx) const {
+ PADDLE_ENFORCE_LT(
+ idx, shapes_.size(),
+ "Cannot get the %d'th shape, 'shapes_' only has %d elements.", idx,
+ shapes_.size());
+ return shapes_[idx];
+}
+
+void ShuffleReader::ReadNext(std::vector<LoDTensor>* out) {
+ if (iteration_pos_ >= buffer_.size()) {
+ // Reload buffer with new data
+ buffer_.clear();
+ buffer_.reserve(buffer_size_);
+ for (int i = 0; i < buffer_size_; ++i) {
+ if (reader_->HasNext()) {
+ buffer_.push_back(std::vector<LoDTensor>());
+ reader_->ReadNext(&buffer_.back());
+ } else {
+ break;
+ }
+ }
+ // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
+ // optimized.
+ std::random_shuffle(buffer_.begin(), buffer_.end());
+ iteration_pos_ = 0;
+ }
+ out->clear();
+ if (!buffer_.empty()) {
+ std::swap(*out, buffer_[iteration_pos_++]);
+ }
+ // If buffer_ is empty, 'out' will be returned as an empty vector.
+}
+
+void BatchReader::ReadNext(std::vector<LoDTensor>* out) {
+ buffer_.clear();
+ buffer_.reserve(batch_size_);
+ for (int i = 0; i < batch_size_; ++i) {
+ if (reader_->HasNext()) {
+ buffer_.push_back(std::vector<LoDTensor>());
+ reader_->ReadNext(&buffer_.back());
+ } else {
+ break;
+ }
+ }
+ // Concat instances
+ out->clear();
+ if (buffer_.empty()) {
+ // If buffer_ is empty, 'out' will be returned as an empty vector.
+ return;
+ }
+ int out_num = buffer_[0].size();
+ out->reserve(out_num);
+ for (int j = 0; j < out_num; ++j) {
+ // Merge shape and check data type
+ std::type_index batch_type = buffer_[0][j].type();
+ DDim batch_shape = buffer_[0][j].dims();
+ for (size_t i = 1; i < buffer_.size(); ++i) {
+ std::type_index ins_type = buffer_[i][j].type();
+ DDim ins_shape = buffer_[i][j].dims();
+ PADDLE_ENFORCE_EQ(batch_type, ins_type);
+ PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
+ slice_ddim(ins_shape, 1, ins_shape.size()));
+ PADDLE_ENFORCE_GT(ins_shape[0], 0);
+ batch_shape[0] += ins_shape[0];
+ }
+
+ LoDTensor out_tensor;
+ out_tensor.Resize(batch_shape);
+ out_tensor.mutable_data(platform::CPUPlace(), batch_type);
+ int64_t dst_offset = 0;
+
+ // Merge lod and data
+ LoD batch_lod;
+ std::vector<size_t> top_level_lod({0});
+ for (size_t i = 0; i < buffer_.size(); ++i) {
+ DDim ins_shape = buffer_[i][j].dims();
+ LoD ins_lod = buffer_[i][j].lod();
+ if (i == 0) {
+ batch_lod = ins_lod;
+ } else {
+ PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
+ for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
+ auto& lod_level = batch_lod[level_idx];
+ for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
+ lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
+ }
+ }
+ }
+ top_level_lod.push_back(
+ top_level_lod.back() +
+ (ins_lod.empty() ? ins_shape[0] : (ins_lod[0].size() - 1)));
+
+ Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
+ Copy(buffer_[i][j], platform::CPUPlace(), &dst);
+ dst_offset += ins_shape[0];
+ }
+ batch_lod.insert(batch_lod.begin(), top_level_lod);
+ out_tensor.set_lod(batch_lod);
+ out->push_back(out_tensor);
+ }
+}
+} // namespace framework
+} // namespace paddle
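A worked example of the LoD merging in BatchReader::ReadNext above: suppose batch_size_ = 2 and the two instances of output j each carry one LoD level, [0, 2, 5] and [0, 3]. The merged level appends the second instance's offsets shifted by the running end 5, giving [0, 2, 5, 8], while top_level_lod accumulates the sequence count of each instance ([0, 2] after the first, [0, 2, 3] after the second). The final LoD is [[0, 2, 3], [0, 2, 5, 8]] and the batch tensor's 0-th dimension is 5 + 3 = 8.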
diff --git a/paddle/framework/reader.h b/paddle/framework/reader.h
new file mode 100644
index 0000000000..534894cfbd
--- /dev/null
+++ b/paddle/framework/reader.h
@@ -0,0 +1,161 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/lod_tensor_array.h"
+
+namespace paddle {
+namespace framework {
+
+class ReaderBase {
+ public:
+ explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
+ PADDLE_ENFORCE(!shapes_.empty());
+ }
+ virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+ virtual bool HasNext() const = 0;
+
+ virtual void ReInit() = 0;
+
+ DDim shape(size_t idx) const;
+ std::vector<DDim> shapes() const { return shapes_; }
+ void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
+
+ virtual ~ReaderBase() {}
+
+ protected:
+ std::vector<DDim> shapes_;
+};
+
+class FileReader : public ReaderBase {
+ public:
+ explicit FileReader(const std::vector<DDim>& shapes) : ReaderBase(shapes) {}
+};
+
+class DecoratedReader : public ReaderBase {
+ public:
+ explicit DecoratedReader(ReaderBase* reader)
+ : ReaderBase(reader->shapes()), reader_(reader) {
+ PADDLE_ENFORCE_NOT_NULL(reader_);
+ }
+
+ bool HasNext() const override { return reader_->HasNext(); }
+
+ void ReInit() override { reader_->ReInit(); }
+
+ protected:
+ ReaderBase* reader_;
+};
+
+// file readers
+
+template <typename T>
+class RandomDataGenerator : public FileReader {
+ public:
+ RandomDataGenerator(const std::vector<DDim>& shapes, float min, float max)
+ : FileReader(shapes), min_(min), max_(max) {
+ PADDLE_ENFORCE_LE(
+ min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
+ unsigned int seed = std::random_device()();
+ engine_.seed(seed);
+ dist_ = std::uniform_real_distribution<float>(min_, max_);
+ }
+
+ void ReadNext(std::vector<LoDTensor>* out) override {
+ out->clear();
+ out->reserve(shapes_.size());
+ for (const DDim& shape : shapes_) {
+ PADDLE_ENFORCE_GE(
+ shape.size(), 2,
+ "The rank of reader's output data should be 2 at least.(Now it's %d)",
+ shape.size());
+ LoDTensor out_tensor;
+ out_tensor.Resize(shape);
+ T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
+ int64_t numel = product(shape);
+ for (int64_t i = 0; i < numel; ++i) {
+ data[i] = dist_(engine_);
+ }
+ out->push_back(out_tensor);
+ }
+ }
+
+ bool HasNext() const override { return true; }
+
+ void ReInit() override { return; }
+
+ private:
+ float min_;
+ float max_;
+ std::minstd_rand engine_;
+ std::uniform_real_distribution<float> dist_;
+};
+
+// decorated readers
+
+class ShuffleReader : public DecoratedReader {
+ public:
+ ShuffleReader(ReaderBase* reader, int buffer_size)
+ : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
+ buffer_.reserve(buffer_size);
+ }
+
+ void ReadNext(std::vector<LoDTensor>* out) override;
+
+ private:
+ int buffer_size_;
+ std::vector<std::vector<LoDTensor>> buffer_;
+ size_t iteration_pos_;
+};
+
+class BatchReader : public DecoratedReader {
+ public:
+ BatchReader(ReaderBase* reader, int batch_size)
+ : DecoratedReader(reader), batch_size_(batch_size) {
+ buffer_.reserve(batch_size_);
+ }
+
+ void ReadNext(std::vector<LoDTensor>* out) override;
+
+ private:
+ int batch_size_;
+ std::vector<std::vector<LoDTensor>> buffer_;
+};
+
+// The ReaderHolder is used as readers' unified wrapper,
+// making it easier to access different type readers in Variables.
+class ReaderHolder {
+ public:
+ void Reset(ReaderBase* reader) { reader_.reset(reader); }
+
+ ReaderBase* Get() const { return reader_.get(); }
+
+ void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); }
+ bool HasNext() const { return reader_->HasNext(); }
+ void ReInit() { reader_->ReInit(); }
+
+ DDim shape(size_t idx) const { return reader_->shape(idx); }
+ std::vector<DDim> shapes() const { return reader_->shapes(); }
+ void set_shapes(const std::vector<DDim>& shapes) {
+ reader_->set_shapes(shapes);
+ }
+
+ private:
+ std::unique_ptr<ReaderBase> reader_;
+};
+
+} // namespace framework
+} // namespace paddle
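Decorators wrap an inner reader, and ReaderHolder is what a READER variable actually stores. A sketch of composing the classes declared above, not from the patch (shapes, sizes, and ownership handling are illustrative; the raw inner pointers are deliberately left unmanaged here):

#include "paddle/framework/reader.h"

void ReaderPipelineSketch() {
  using namespace paddle::framework;
  std::vector<DDim> shapes = {make_ddim({32, 784}), make_ddim({32, 1})};
  ReaderBase* gen = new RandomDataGenerator<float>(shapes, 0.0f, 1.0f);
  ReaderBase* shuffled = new ShuffleReader(gen, /*buffer_size=*/64);
  ReaderBase* batched = new BatchReader(shuffled, /*batch_size=*/8);
  ReaderHolder holder;
  holder.Reset(batched);  // the holder owns only the outermost reader
  std::vector<LoDTensor> batch;
  if (holder.HasNext()) {     // always true for RandomDataGenerator
    holder.ReadNext(&batch);  // one LoDTensor per reader output
  }
}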
diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc
index e53cc0cdab..2f4d450577 100644
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -18,10 +18,28 @@ limitations under the License. */
namespace paddle {
namespace framework {
-std::vector<framework::DDim> InferShapeContext::GetInputsDim(
+DDim InferShapeContext::GetInputDim(const std::string &name) const {
+ const std::vector<std::string> &arg_names = Inputs(name);
+ PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+ "Input(%s) should hold one element, but now it holds %d",
+ name, arg_names.size());
+ return this->GetDim(arg_names[0]);
+}
+
+std::vector<DDim> InferShapeContext::GetInputsDim(
const std::string &name) const {
- const std::vector<std::string> &names = Inputs(name);
- return GetDims(names);
+ const std::vector<std::string> &arg_names = Inputs(name);
+ return GetDims(arg_names);
+}
+
+std::vector<DDim> InferShapeContext::GetReaderDims(
+ const std::string &name) const {
+ const std::vector<std::string> &arg_names = Inputs(name);
+ PADDLE_ENFORCE_EQ(
+ arg_names.size(), 1UL,
+ "Reader input '%s' should hold one element, but now it holds %d", name,
+ arg_names.size());
+ return this->GetRepeatedDims(arg_names[0]);
}
DDim InferShapeContext::GetInputsElementDim(const std::string &name,
@@ -30,15 +48,33 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name,
return this->GetDim(names[idx]);
}
-void InferShapeContext::SetOutputsDim(
- const std::string &name, const std::vector<framework::DDim> &dims) {
+void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
+ auto &arg_names = Outputs(name);
+ PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+ "Output(%s) should hold one element, but now it holds %d",
+ name, arg_names.size());
+ SetDim(arg_names[0], dim);
+}
+
+void InferShapeContext::SetOutputsDim(const std::string &name,
+ const std::vector<DDim> &dims) {
auto &names = Outputs(name);
SetDims(names, dims);
}
-std::vector<framework::DDim> InferShapeContext::GetDims(
+void InferShapeContext::SetReaderDims(const std::string &name,
+ const std::vector<DDim> &dims) {
+ const std::vector<std::string> &arg_names = Outputs(name);
+ PADDLE_ENFORCE_EQ(
+ arg_names.size(), 1UL,
+ "Reader output '%s' should hold one element, but now it holds %d", name,
+ arg_names.size());
+ return this->SetRepeatedDims(arg_names[0], dims);
+}
+
+std::vector<DDim> InferShapeContext::GetDims(
const std::vector<std::string> &names) const {
- std::vector<framework::DDim> ret;
+ std::vector<DDim> ret;
ret.reserve(names.size());
std::transform(
names.begin(), names.end(), std::back_inserter(ret),
@@ -47,7 +83,7 @@ std::vector<framework::DDim> InferShapeContext::GetDims(
}
void InferShapeContext::SetDims(const std::vector<std::string> &names,
- const std::vector<framework::DDim> &dims) {
+ const std::vector<DDim> &dims) {
size_t length = names.size();
PADDLE_ENFORCE_EQ(length, dims.size());
for (size_t i = 0; i < length; ++i) {
@@ -57,14 +93,17 @@ void InferShapeContext::SetDims(const std::vector &names,
SetDim(names[i], dims[i]);
}
}
+
std::vector<proto::VarDesc::VarType> InferShapeContext::GetInputsVarType(
const std::string &name) const {
return GetVarTypes(Inputs(name));
}
+
std::vector<proto::VarDesc::VarType> InferShapeContext::GetOutputsVarType(
const std::string &name) const {
return GetVarTypes(Outputs(name));
}
+
std::vector<proto::VarDesc::VarType> InferShapeContext::GetVarTypes(
const std::vector<std::string> &names) const {
std::vector<proto::VarDesc::VarType> retv;
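Reader-producing operators can now propagate all per-output shapes through one pair of calls. A sketch of an InferShape body written against the new interface (the op and its slot names are hypothetical, not from the patch):

void PassThroughReaderInferShape(paddle::framework::InferShapeContext* ctx) {
  // Reader slots hold exactly one variable; GetReaderDims enforces that and
  // returns one DDim per sub-tensor of the reader.
  std::vector<paddle::framework::DDim> dims =
      ctx->GetReaderDims("UnderlyingReader");
  // ... adjust dims here if the op transforms them (e.g. batching) ...
  ctx->SetReaderDims("Out", dims);
}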
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
index f93319d8f2..7bee869852 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -35,14 +35,14 @@ class InferShapeContext {
virtual bool HasInputs(const std::string &name) const = 0;
virtual bool HasOutputs(const std::string &name) const = 0;
- virtual framework::DDim GetInputDim(const std::string &name) const = 0;
-
- std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
+ DDim GetInputDim(const std::string &name) const;
+ std::vector<DDim> GetInputsDim(const std::string &name) const;
+ std::vector<DDim> GetReaderDims(const std::string &name) const;
DDim GetInputsElementDim(const std::string &name, int idx) const;
- virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
- void SetOutputsDim(const std::string &name,
- const std::vector<framework::DDim> &dims);
+ void SetOutputDim(const std::string &name, const DDim &dim);
+ void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);
+ void SetReaderDims(const std::string &name, const std::vector<DDim> &dims);
virtual AttrReader Attrs() const = 0;
virtual const std::vector &Inputs(
@@ -57,15 +57,16 @@ class InferShapeContext {
// Note: In while op, we need this to be public
void SetDims(const std::vector<std::string> &names,
- const std::vector<framework::DDim> &dims);
+ const std::vector<DDim> &dims);
protected:
- virtual framework::DDim GetDim(const std::string &name) const = 0;
- virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
-
- std::vector<framework::DDim> GetDims(
- const std::vector<std::string> &names) const;
+ virtual DDim GetDim(const std::string &name) const = 0;
+ virtual void SetDim(const std::string &name, const DDim &dim) = 0;
+ virtual std::vector<DDim> GetRepeatedDims(const std::string &name) const = 0;
+ virtual void SetRepeatedDims(const std::string &name,
+ const std::vector<DDim> &dims) = 0;
+ std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
std::vector<proto::VarDesc::VarType> GetVarTypes(
const std::vector<std::string> &names) const;
diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h
index 4e9b58679d..77d31a1176 100644
--- a/paddle/framework/threadpool.h
+++ b/paddle/framework/threadpool.h
@@ -21,7 +21,8 @@ limitations under the License. */
#include <mutex>
#include <queue>
#include <thread>
-
+#include "glog/logging.h"
+#include "paddle/platform/enforce.h"
#include "paddle/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
@@ -31,7 +32,7 @@ namespace framework {
// number of threads.
class ThreadPool {
public:
- typedef std::packaged_task<void()> Task;
+ using Task = std::packaged_task<std::unique_ptr<platform::EnforceNotMet>()>;
// Returns the singleton of ThreadPool.
static ThreadPool* GetInstance();
@@ -52,9 +53,28 @@ class ThreadPool {
// std::future::wait().
template <typename Callback>
std::future<void> Run(Callback fn) {
+ auto f = this->RunAndGetException(fn);
+ return std::async(std::launch::deferred, ExceptionHandler(std::move(f)));
+ }
+
+ template <typename Callback>
+ std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
+ Callback fn) {
std::unique_lock<std::mutex> lock(mutex_);
- Task task(std::bind(fn));
- std::future<void> f = task.get_future();
+ Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
+ try {
+ fn();
+ return nullptr;
+ } catch (platform::EnforceNotMet ex) {
+ return std::unique_ptr<platform::EnforceNotMet>(
+ new platform::EnforceNotMet(ex));
+ } catch (...) {
+ LOG(FATAL)
+ << "Unexpected exception is caught in thread pool. All "
+ "throwable exceptions in Fluid should be EnforceNotMet.";
+ }
+ });
+ std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
tasks_.push(std::move(task));
lock.unlock();
scheduled_.notify_one();
@@ -65,6 +85,22 @@ class ThreadPool {
void Wait();
private:
+ struct ExceptionHandler {
+ mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
+ explicit ExceptionHandler(
+ std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
+ : future_(std::move(f)) {}
+ void operator()() const {
+ auto ex = this->future_.get();
+ if (ex != nullptr) {
+ LOG(FATAL) << "The exception is thrown inside the thread pool. You "
+ "should use RunAndGetException to handle the exception.\n"
+ "The default exception handler is LOG(FATAL)."
+ << ex->what();
+ }
+ }
+ };
+
DISABLE_COPY_AND_ASSIGN(ThreadPool);
explicit ThreadPool(int num_threads);
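RunAndGetException() moves failure handling to the caller: the worker packages an EnforceNotMet instead of crashing inside the pool. A minimal sketch of both paths, not part of the patch (the failing task is illustrative):

#include "paddle/framework/threadpool.h"

void ThreadPoolErrorSketch() {
  auto* pool = paddle::framework::ThreadPool::GetInstance();
  auto fut = pool->RunAndGetException(
      [] { PADDLE_THROW("always fails"); });  // PADDLE_THROW -> EnforceNotMet
  std::unique_ptr<paddle::platform::EnforceNotMet> err = fut.get();
  if (err != nullptr) {
    LOG(ERROR) << err->what();  // handled here, no LOG(FATAL)
  }
  // Plain Run() keeps fire-and-forget semantics; its deferred future runs
  // ExceptionHandler (and may LOG(FATAL)) only when waited on:
  pool->Run([] { /* ... */ }).wait();
}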
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index 62ab6593ef..11a4daf2c9 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -26,18 +26,98 @@ void VarDesc::SetShape(const std::vector<int64_t> &dims) {
VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
}
+void VarDesc::SetTensorDescNum(size_t num) {
+ switch (desc_.type()) {
+ case proto::VarDesc::READER: {
+ auto *lod_tensors_ptr = desc_.mutable_reader()->mutable_lod_tensor();
+ lod_tensors_ptr->Clear();
+ for (size_t i = 0; i < num; ++i) {
+ lod_tensors_ptr->Add();
+ }
+ return;
+ } break;
+ default:
+ PADDLE_THROW(
+ "Setting 'sub_tensor_number' is not supported by the type of var %s.",
+ this->Name());
+ }
+}
+
+size_t VarDesc::GetTensorDescNum() const {
+ switch (desc_.type()) {
+ case proto::VarDesc::READER:
+ return desc_.reader().lod_tensor_size();
+ break;
+ default:
+ PADDLE_THROW(
+ "Getting 'sub_tensor_number' is not supported by the type of var %s.",
+ this->Name());
+ }
+}
+
+void VarDesc::SetShapes(
+ const std::vector<std::vector<int64_t>> &multiple_dims) {
+ if (multiple_dims.size() != GetTensorDescNum()) {
+ VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size()
+ << ") doesn't match the existing tensor number("
+ << GetTensorDescNum()
+ << "). The Reader is going to be reinitialized.";
+ SetTensorDescNum(multiple_dims.size());
+ }
+ std::vector<proto::TensorDesc *> tensors = mutable_tensor_descs();
+ for (size_t i = 0; i < multiple_dims.size(); ++i) {
+ VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims());
+ }
+}
+
+std::vector<int64_t> VarDesc::GetShape() const {
+ return RepeatedToVector(tensor_desc().dims());
+}
+
+std::vector<std::vector<int64_t>> VarDesc::GetShapes() const {
+ std::vector<proto::TensorDesc> descs = tensor_descs();
+ std::vector<std::vector<int64_t>> res;
+ res.reserve(descs.size());
+ for (const auto &tensor_desc : descs) {
+ res.push_back(RepeatedToVector(tensor_desc.dims()));
+ }
+ return res;
+}
+
void VarDesc::SetDataType(proto::DataType data_type) {
mutable_tensor_desc()->set_data_type(data_type);
}
-std::vector<int64_t> VarDesc::Shape() const {
- return RepeatedToVector(tensor_desc().dims());
+void VarDesc::SetDataTypes(
+ const std::vector<proto::DataType> &multiple_data_type) {
+ if (multiple_data_type.size() != GetTensorDescNum()) {
+ VLOG(3) << "WARNING: The number of given data types("
+ << multiple_data_type.size()
+ << ") doesn't match the existing tensor number("
+ << GetTensorDescNum()
+ << "). The Reader is going to be reinitialized.";
+ SetTensorDescNum(multiple_data_type.size());
+ }
+ std::vector<proto::TensorDesc *> tensor_descs = mutable_tensor_descs();
+ for (size_t i = 0; i < multiple_data_type.size(); ++i) {
+ tensor_descs[i]->set_data_type(multiple_data_type[i]);
+ }
}
proto::DataType VarDesc::GetDataType() const {
return tensor_desc().data_type();
}
+std::vector<proto::DataType> VarDesc::GetDataTypes() const {
+ std::vector<proto::TensorDesc> descs = tensor_descs();
+ std::vector<proto::DataType> res;
+ res.reserve(descs.size());
+ for (const auto &tensor_desc : descs) {
+ res.push_back(tensor_desc.data_type());
+ }
+ return res;
+}
+
void VarDesc::SetLoDLevel(int32_t lod_level) {
switch (desc_.type()) {
case proto::VarDesc::LOD_TENSOR:
@@ -47,8 +127,32 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
desc_.mutable_tensor_array()->set_lod_level(lod_level);
break;
default:
- PADDLE_THROW("Tensor type=%d does not support LoDLevel",
- desc_.tensor_array().lod_level());
+ PADDLE_THROW(
+ "Setting 'lod_level' is not supported by the type of var %s.",
+ this->Name());
+ }
+}
+
+void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
+ if (multiple_lod_level.size() != GetTensorDescNum()) {
+ VLOG(3) << "WARNING: The number of given lod_levels("
+ << multiple_lod_level.size()
+ << ") doesn't match the existing tensor number("
+ << GetTensorDescNum()
+ << "). The Reader is going to be reinitialized.";
+ SetTensorDescNum(multiple_lod_level.size());
+ }
+ switch (desc_.type()) {
+ case proto::VarDesc::READER: {
+ size_t i = 0;
+ for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) {
+ lod_tensor.set_lod_level(multiple_lod_level[i++]);
+ }
+ } break;
+ default:
+ PADDLE_THROW(
+ "Setting 'lod_levels' is not supported by the type of var %s.",
+ this->Name());
}
}
@@ -59,13 +163,31 @@ int32_t VarDesc::GetLoDLevel() const {
case proto::VarDesc::LOD_TENSOR_ARRAY:
return desc_.tensor_array().lod_level();
default:
- PADDLE_THROW("Tensor type=%d does not support LoDLevel",
- desc_.tensor_array().lod_level());
+ PADDLE_THROW(
+ "Getting 'lod_level' is not supported by the type of var %s.",
+ this->Name());
+ }
+}
+
+std::vector<int32_t> VarDesc::GetLoDLevels() const {
+ std::vector<int32_t> res;
+ switch (desc_.type()) {
+ case proto::VarDesc::READER:
+ res.reserve(desc_.reader().lod_tensor_size());
+ for (auto &lod_tensor : desc_.reader().lod_tensor()) {
+ res.push_back(lod_tensor.lod_level());
+ }
+ return res;
+ break;
+ default:
+ PADDLE_THROW(
+ "Getting 'lod_levels' is not supported by the type of var %s.",
+ this->Name());
}
}
const proto::TensorDesc &VarDesc::tensor_desc() const {
- PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type");
+ PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
switch (desc_.type()) {
case proto::VarDesc::SELECTED_ROWS:
return desc_.selected_rows();
@@ -74,13 +196,32 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
case proto::VarDesc::LOD_TENSOR_ARRAY:
return desc_.tensor_array().tensor();
default:
- PADDLE_THROW("The type of var %s is unsupported.", this->Name());
+ PADDLE_THROW(
+ "Getting 'tensor_desc' is not supported by the type of var %s.",
+ this->Name());
+ }
+}
+
+std::vector<proto::TensorDesc> VarDesc::tensor_descs() const {
+ PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+ std::vector<proto::TensorDesc> res;
+ res.reserve(GetTensorDescNum());
+ switch (desc_.type()) {
+ case proto::VarDesc::READER:
+ for (const auto &lod_tensor : desc_.reader().lod_tensor()) {
+ res.push_back(lod_tensor.tensor());
+ }
+ return res;
+ default:
+ PADDLE_THROW(
+ "Getting 'tensor_descs' is not supported by the type of var "
+ "%s.",
+ this->Name());
}
}
proto::TensorDesc *VarDesc::mutable_tensor_desc() {
- PADDLE_ENFORCE(desc_.has_type(),
- "invoke MutableTensorDesc must after set type");
+ PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
switch (desc_.type()) {
case proto::VarDesc::SELECTED_ROWS:
return desc_.mutable_selected_rows();
@@ -89,8 +230,30 @@ proto::TensorDesc *VarDesc::mutable_tensor_desc() {
case proto::VarDesc::LOD_TENSOR_ARRAY:
return desc_.mutable_tensor_array()->mutable_tensor();
default:
- PADDLE_THROW("Unexpected branch.");
+ PADDLE_THROW(
+ "Getting 'mutable_tensor_desc' is not supported by the type of var "
+ "%s.",
+ this->Name());
}
}
+
+std::vector<proto::TensorDesc *> VarDesc::mutable_tensor_descs() {
+ PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+ std::vector<proto::TensorDesc *> res;
+ res.reserve(GetTensorDescNum());
+ switch (desc_.type()) {
+ case proto::VarDesc::READER:
+ for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) {
+ res.push_back(lod_tensor.mutable_tensor());
+ }
+ return res;
+ default:
+ PADDLE_THROW(
+ "Getting 'tensor_descs' is not supported by the type of var "
+ "%s.",
+ this->Name());
+ }
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h
index 9316b14bb6..72da2fbb0a 100644
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@@ -68,18 +68,34 @@ class VarDesc {
void SetName(std::string name) { desc_.set_name(name); }
+ void SetTensorDescNum(size_t num);
+
+ size_t GetTensorDescNum() const;
+
void SetShape(const std::vector<int64_t> &dims);
+ void SetShapes(const std::vector<std::vector<int64_t>> &multiple_dims);
+
+ std::vector<int64_t> GetShape() const;
+
+ std::vector<std::vector<int64_t>> GetShapes() const;
+
void SetDataType(proto::DataType data_type);
- std::vector<int64_t> Shape() const;
+ void SetDataTypes(const std::vector<proto::DataType> &multiple_data_type);
proto::DataType GetDataType() const;
+ std::vector<proto::DataType> GetDataTypes() const;
+
void SetLoDLevel(int32_t lod_level);
+ void SetLoDLevels(const std::vector<int32_t> &multiple_lod_level);
+
int32_t GetLoDLevel() const;
+ std::vector<int32_t> GetLoDLevels() const;
+
proto::VarDesc::VarType GetType() const;
void SetType(proto::VarDesc::VarType type);
@@ -90,7 +106,9 @@ class VarDesc {
private:
const proto::TensorDesc &tensor_desc() const;
+ std::vector<proto::TensorDesc> tensor_descs() const;
proto::TensorDesc *mutable_tensor_desc();
+ std::vector<proto::TensorDesc *> mutable_tensor_descs();
proto::VarDesc desc_;
};
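For READER variables each of the new setters above fans out over the sub-tensor descriptors. A sketch describing a two-output reader, not from the patch (the name-taking constructor is assumed; shapes, types, and LoD levels are illustrative):

#include "paddle/framework/var_desc.h"

void DescribeReaderVarSketch() {
  using namespace paddle::framework;
  VarDesc var("data_reader");  // assumed constructor taking the var name
  var.SetType(proto::VarDesc::READER);
  var.SetTensorDescNum(2);  // one TensorDesc per reader output
  var.SetShapes({{-1, 784}, {-1, 1}});
  var.SetDataTypes({proto::FP32, proto::INT64});
  var.SetLoDLevels({0, 0});
}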
diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h
index 5b7a08a087..599d451490 100644
--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/reader.h"
#include "paddle/framework/selected_rows.h"
#include "paddle/framework/variable.h"
@@ -31,6 +32,8 @@ inline proto::VarDesc::VarType ToVarType(std::type_index type) {
return proto::VarDesc_VarType_LOD_TENSOR_ARRAY;
} else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
return proto::VarDesc_VarType_SELECTED_ROWS;
+ } else if (type.hash_code() == typeid(ReaderHolder).hash_code()) {
+ return proto::VarDesc_VarType_READER;
} else {
PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
}
@@ -40,7 +43,7 @@ template <typename Visitor>
inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
switch (ToVarType(var.Type())) {
case proto::VarDesc_VarType_LOD_TENSOR:
- visitor(var.Get<framework::LoDTensor>());
+ visitor(var.Get<LoDTensor>());
return;
case proto::VarDesc_VarType_LOD_RANK_TABLE:
visitor(var.Get<LoDRankTable>());
@@ -51,6 +54,9 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
case proto::VarDesc_VarType_SELECTED_ROWS:
visitor(var.Get<SelectedRows>());
return;
+ case proto::VarDesc_VarType_READER:
+ visitor(var.Get<ReaderHolder>());
+ return;
default:
PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
}
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index cbdbf5335d..a9876cec2a 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -178,19 +178,22 @@ public:
real* inputData = inputs[0].data<real>();
real* filterData = inputs[1].data<real>();
real* outputData = outputs[0].data<real>();
+ real* colData = NULL;
bool needIm2col = isNeedIm2col(filter);
TensorShape imShape =
TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
TensorShape colShape;
- real* colData = NULL;
- size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
- size_t colWidth = outputHeight * outputWidth;
- // Max col matrix height 256, Max col matrix width 1024
- size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
- size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
+ // Max col matrix width 4096, Max col matrix size 4M.
+ size_t outputHeightSteps =
+ std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
+ size_t maxColWidth = outputHeightSteps * outputWidth;
+ size_t channelSteps =
+ std::min(std::max((1048576 / maxColWidth) / (filterHeight * filterWidth),
+ (size_t)1),
+ inputChannels / groups_);
+ size_t maxColHeight = channelSteps * filterHeight * filterWidth;
if (needIm2col) {
colShape = TensorShape({inputChannels / groups_,
@@ -199,7 +202,7 @@ public:
outputHeight,
outputWidth});
- resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
+ resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
colData = reinterpret_cast<real*>(memory_->getBuf());
}
@@ -209,20 +212,24 @@ public:
(outputChannels / groups_) * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_;
- int nStride = colWidth;
- int kStride = colHeight;
+ int nStride = outputHeight * outputWidth;
+ int kStride = inputChannels / groups_ * filterHeight * filterWidth;
for (size_t i = 0; i < batchSize; i++) {
+ filterData = inputs[1].data<real>();
for (size_t g = 0; g < groups_; g++) {
if (needIm2col) {
real beta_ = beta;
- for (size_t colHeightStart = 0; colHeightStart < colHeight;
- colHeightStart += stepColHeight) {
- for (size_t colWidthStart = 0; colWidthStart < colWidth;
- colWidthStart += stepColWidth) {
- int N = std::min(colWidth - colWidthStart, stepColWidth);
- int K = std::min(colHeight - colHeightStart, stepColHeight);
+ for (size_t ic = 0; ic < inputChannels / groups_;
+ ic += channelSteps) {
+ int channels = std::min(inputChannels / groups_ - ic, channelSteps);
+ for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
+ int height = std::min(outputHeight - oh, outputHeightSteps);
+
+ int M = outputChannels / groups_;
+ int N = height * outputWidth;
+ int K = channels * filterHeight * filterWidth;
// im2col
- im2col(inputData + g * inputOffset,
+ im2col(inputData,
imShape,
colData,
colShape,
@@ -232,13 +239,12 @@ public:
paddingW(),
dilationH(),
dilationW(),
- colHeightStart,
- K,
- colWidthStart,
+ channels,
+ oh,
+ height,
N);
// gemm
- int M = outputChannels / groups_;
BlasGemm<Device, real>::compute(
false,
false,
@@ -246,12 +252,12 @@ public:
N,
K,
1.0f,
- filterData + g * filterOffset + colHeightStart,
+ filterData + ic * filterHeight * filterWidth,
kStride,
colData,
N,
beta_,
- outputData + g * outputOffset + colWidthStart,
+ outputData + oh * outputWidth,
nStride);
}
beta_ = 1.0;
@@ -266,17 +272,18 @@ public:
N,
K,
1.0f,
- filterData + g * filterOffset,
+ filterData,
K,
- inputData + g * inputOffset,
+ inputData,
N,
beta,
- outputData + g * outputOffset,
+ outputData,
N);
}
+ inputData += inputOffset;
+ outputData += outputOffset;
+ filterData += filterOffset;
}
- inputData += inputChannels * inputHeight * inputWidth;
- outputData += outputChannels * outputHeight * outputWidth;
}
memory_.reset();
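Worked numbers for the tiling above (using the parenthesized divisor, so the buffer respects the stated 4M cap): with a 3x3 filter, 64 input channels, one group, and 32x32 output, outputHeightSteps = min(max(4096 / 32, 1), 32) = 32, so maxColWidth = 32 * 32 = 1024; channelSteps = min(max((1048576 / 1024) / (3 * 3), 1), 64) = min(113, 64) = 64, so maxColHeight = 64 * 9 = 576 and the col buffer holds 576 * 1024 floats, about 2.25 MB, within the 4 MB budget.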
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 36a9bcf84e..915119e291 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -111,39 +111,42 @@ public:
int paddingWidth,
int dilationHeight,
int dilationWidth,
- int colHeightStart,
- int colHeightSize,
- int colWidthStart,
- int colWidthSize) {
+ int inputChannels,
+ int colOffset,
+ int colOutputHeight,
+ int colWidth) {
int inputHeight = imShape[1];
int inputWidth = imShape[2];
int filterHeight = colShape[1];
int filterWidth = colShape[2];
int outputWidth = colShape[4];
- for (int colh = 0; colh < colHeightSize; colh++) {
- int wOffset = (colHeightStart + colh) % filterWidth;
- int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
- int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
-
- for (int colw = 0; colw < colWidthSize; colw++) {
- int h = (colWidthStart + colw) / outputWidth;
- int w = (colWidthStart + colw) % outputWidth;
-
- int imRowIdx = h * strideHeight + hOffset * dilationHeight;
- int imColIdx = w * strideWidth + wOffset * dilationWidth;
- if ((imRowIdx - paddingHeight) < 0 ||
- (imRowIdx - paddingHeight) >= inputHeight ||
- (imColIdx - paddingWidth) < 0 ||
- (imColIdx - paddingWidth) >= inputWidth) {
- colData[colh * colWidthSize + colw] = static_cast<T>(0);
- } else {
- imRowIdx += c_im * inputHeight - paddingHeight;
- imColIdx -= paddingWidth;
- colData[colh * colWidthSize + colw] =
- imData[imRowIdx * inputWidth + imColIdx];
+ for (int ic = 0; ic < inputChannels; ic++) {
+ for (int oh = 0; oh < colOutputHeight; oh++) {
+ T* dstData = colData + oh * outputWidth;
+ for (int fh = 0; fh < filterHeight; fh++) {
+ for (int fw = 0; fw < filterWidth; fw++) {
+ int imRowIdx = (oh + colOffset) * strideHeight +
+ fh * dilationHeight - paddingHeight;
+ if (imRowIdx < 0 || imRowIdx >= inputHeight) {
+ memset(dstData, 0, outputWidth * sizeof(T));
+ } else {
+ for (int ow = 0; ow < outputWidth; ow++) {
+ int imColIdx =
+ ow * strideWidth + fw * dilationWidth - paddingWidth;
+ if (imColIdx < 0 || imColIdx >= inputWidth) {
+ dstData[ow] = T(0);
+ } else {
+ dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
+ }
+ }
+ }
+ dstData += colWidth;
+ }
}
}
+ colData += filterHeight * filterWidth * colWidth;
+ imData += inputHeight * inputWidth;
}
}
};
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index 3ba866dcdd..fe44a8bf79 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() {
padding,
dilation,
dilation,
+ channels,
0,
- height,
- 0,
- width);
+ outputHeight,
+ outputHeight * outputWidth);
autotest::TensorCheckEqual(*output1, *output2);
}
diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt
index 2289ddc139..654a6119bd 100644
--- a/paddle/inference/CMakeLists.txt
+++ b/paddle/inference/CMakeLists.txt
@@ -13,17 +13,11 @@ add_library(paddle_fluid_shared SHARED io.cc)
target_circle_link_libraries(paddle_fluid_shared
ARCHIVE_START
${GLOB_OP_LIB}
- ARCHIVE_END
- ${FLUID_CORE_MODULES})
+ ${FLUID_CORE_MODULES}
+ ARCHIVE_END)
SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
-# install library & headers
-if(NOT WITH_C_API AND WITH_FLUID)
- install(FILES io.h DESTINATION include/paddle/inference)
- install(TARGETS paddle_fluid_shared DESTINATION lib)
-endif()
-
if(WITH_TESTING)
add_subdirectory(tests/book)
endif()
diff --git a/paddle/inference/io.cc b/paddle/inference/io.cc
index 60ad7af1c0..1ed14b69c8 100644
--- a/paddle/inference/io.cc
+++ b/paddle/inference/io.cc
@@ -55,7 +55,7 @@ void LoadPersistables(framework::Executor& executor,
VLOG(3) << "parameter's name: " << var->Name();
framework::VarDesc* new_var = load_block->Var(var->Name());
- new_var->SetShape(var->Shape());
+ new_var->SetShape(var->GetShape());
new_var->SetDataType(var->GetDataType());
new_var->SetType(var->GetType());
new_var->SetLoDLevel(var->GetLoDLevel());
diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt
index d3798fb8fd..078d72fd99 100644
--- a/paddle/inference/tests/book/CMakeLists.txt
+++ b/paddle/inference/tests/book/CMakeLists.txt
@@ -3,5 +3,29 @@ cc_test(test_inference_recognize_digits_mlp
SRCS test_inference_recognize_digits.cc
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
+cc_test(test_inference_image_classification_vgg
+ SRCS test_inference_image_classification.cc
+ DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+ ARGS --dirname=${PYTHON_TESTS_DIR}/book/image_classification_vgg.inference.model)
+cc_test(test_inference_image_classification_resnet
+ SRCS test_inference_image_classification.cc
+ DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+ ARGS --dirname=${PYTHON_TESTS_DIR}/book/image_classification_resnet.inference.model)
+cc_test(test_inference_label_semantic_roles
+ SRCS test_inference_label_semantic_roles.cc
+ DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+ ARGS --dirname=${PYTHON_TESTS_DIR}/book/label_semantic_roles.inference.model)
+cc_test(test_inference_rnn_encoder_decoder
+ SRCS test_inference_rnn_encoder_decoder.cc
+ DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+ ARGS --dirname=${PYTHON_TESTS_DIR}/book/rnn_encoder_decoder.inference.model)
set_tests_properties(test_inference_recognize_digits_mlp
- PROPERTIES DEPENDS test_recognize_digits_mlp_cpu)
+ PROPERTIES DEPENDS test_recognize_digits)
+set_tests_properties(test_inference_image_classification_vgg
+ PROPERTIES DEPENDS test_image_classification_train)
+set_tests_properties(test_inference_image_classification_resnet
+ PROPERTIES DEPENDS test_image_classification_train)
+set_tests_properties(test_inference_label_semantic_roles
+ PROPERTIES DEPENDS test_label_semantic_roles)
+set_tests_properties(test_inference_rnn_encoder_decoder
+ PROPERTIES DEPENDS test_rnn_encoder_decoder)
diff --git a/paddle/inference/tests/book/test_helper.h b/paddle/inference/tests/book/test_helper.h
new file mode 100644
index 0000000000..17c3d58de6
--- /dev/null
+++ b/paddle/inference/tests/book/test_helper.h
@@ -0,0 +1,104 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/inference/io.h"
+
+template <typename T>