Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into no_counter_on_pserver

fix_conll05_bug
typhoonzero 7 years ago
commit 007e6c2518

@ -137,7 +137,7 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc
include(external/boost) # download, build, install boost
include(external/boost) # download boost
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11

@ -0,0 +1,18 @@
#FROM python:2.7.14
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
# The `python` apt package on Ubuntu 16.04 does not provide `pip`, so install
# python-pip explicitly; every `pip` step below depends on it.
RUN apt-get update && apt-get install -y python python-pip
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
# The released wheel is installed only temporarily so that the CIFAR reader can
# be invoked at build time (presumably to cache the dataset in an image layer);
# it is removed again before the locally built wheel is installed.
RUN pip install paddlepaddle
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle
# below lines may change a lot for debugging
# A trailing slash on the destination makes ADD keep the file name from the URL
# and place the file inside the directory.
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin/
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root/
# Install the wheel(s) copied next to this Dockerfile (built with distributed support).
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && \
chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
# Benchmark entry scripts referenced by the ENTRY variable of the job manifests.
ADD vgg16_fluid.py vgg16_v2.py /workspace/

@ -0,0 +1,76 @@
# Performance for Distributed vgg16
## Test Result
### Hardware Information
- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz : 2101.000
- cache size : 20480 KB
### Single Node Single Thread
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
| TensorFlow | - | - | - | - |
### Different Batch Size
- PServer Count: 10
- Trainer Count: 20
- Per trainer CPU Core: 1
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
| TensorFlow | - | - | - | - |
### Accelerate Rate
- Pserver Count: 20
- Batch Size: 128
- Metrics: samples / sec
| Trainer Count | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
| TensorFlow | - | - | - | - |
### Different Pserver Count
- Trainer Count: 60
- Batch Size: 128
- Metrics: samples / sec
| PServer Count | 3 | 6 |10 | 20 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
| TensorFlow | - | - | - | - |
*The performance gap between Fluid and v2 comes from the network interference.*
## Steps to Run the Performance Test
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so that Kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
Check the logs for the distributed training progress and analyze the performance.
## Enable Verbose Logs
Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.

@ -0,0 +1,72 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: MKL_NUM_THREADS
value: "1"
- name: TRAINING_ROLE
value: "PSERVER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
command: ["paddle_k8s", "start_fluid"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4

@ -0,0 +1,69 @@
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_fluid"]
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: TRAINING_ROLE
value: "TRAINER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never

@ -0,0 +1,64 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16v2job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16v2job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "python train.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
command: ["paddle_k8s", "start_pserver"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4

@ -0,0 +1,65 @@
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16v2job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16v2job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"]
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: BATCH_SIZE
value: "256"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "2"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never

File diff suppressed because it is too large Load Diff

@ -0,0 +1,154 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import gzip
import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import time
import os
# Dimension of the dense "image" input vector declared in main().
DATA_DIM = 3 * 32 * 32
# Number of label values; also the size of the final softmax layer.
CLASS_DIM = 10
# Batch size comes from the BATCH_SIZE environment variable when it is set to
# a non-empty value, otherwise it falls back to 128.
BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
BATCH_SIZE = int(BATCH_SIZE)
else:
BATCH_SIZE = 128
print "batch_size", BATCH_SIZE
# Trainer count read from the TRAINERS environment variable. The variable is
# mandatory: int(None) raises when it is unset.
NODE_COUNT = int(os.getenv("TRAINERS"))
# Start time of the current iteration; updated by the event handler in main().
ts = 0
def vgg(input, nums, class_dim):
    """Assemble a VGG-style classifier: five conv groups, then three fc layers.

    Args:
        input: layer that feeds the first convolution group.
        nums: sequence of exactly five ints; nums[i] is the number of
            convolutions stacked inside the i-th group.
        class_dim: size of the final softmax layer.

    Returns:
        The softmax output layer.
    """
    assert len(nums) == 5

    def conv_block(input, num_filter, groups, num_channels=None):
        # `groups` stacked convolutions (filter size 3, ReLU) with
        # `num_filter` filters each, followed by a max pool of size 2, stride 2.
        filters_per_conv = [num_filter] * groups
        return paddle.networks.img_conv_group(
            input=input,
            num_channels=num_channels,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=filters_per_conv,
            conv_filter_size=3,
            conv_act=paddle.activation.Relu(),
            pool_type=paddle.pooling.Max())

    # Filter count of each of the five convolution groups, in order.
    stage_filters = (64, 128, 256, 512, 512)
    feature = input
    for stage, num_filter in enumerate(stage_filters):
        if stage == 0:
            # the channel of input feature is 3
            feature = conv_block(feature, num_filter, nums[stage], 3)
        else:
            feature = conv_block(feature, num_filter, nums[stage])

    # Two hidden fully connected layers with ReLU and 50% dropout.
    fc_dim = 512
    hidden = feature
    for _ in range(2):
        hidden = paddle.layer.fc(
            input=hidden,
            size=fc_dim,
            act=paddle.activation.Relu(),
            layer_attr=paddle.attr.Extra(drop_rate=0.5))

    # Classification head.
    return paddle.layer.fc(
        input=hidden, size=class_dim, act=paddle.activation.Softmax())
def vgg13(input, class_dim):
    """Build the VGG variant with two convolutions in each of the five groups."""
    return vgg(input, [2, 2, 2, 2, 2], class_dim)
def vgg16(input, class_dim):
    """Build the VGG variant whose five groups hold 2, 2, 3, 3 and 3 convolutions."""
    return vgg(input, [2, 2, 3, 3, 3], class_dim)
def vgg19(input, class_dim):
    """Build the VGG variant whose five groups hold 2, 2, 4, 4 and 4 convolutions."""
    return vgg(input, [2, 2, 4, 4, 4], class_dim)
def main():
# Build the VGG16 network with the paddle.v2 API, train it on cifar.train10()
# in non-local mode (is_local=False), and print per-batch and per-pass timings.
global ts
paddle.init(use_gpu=False)
# Input layers: a dense image vector of DATA_DIM values and an integer label
# with CLASS_DIM possible values.
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
extra_layers = None
# NOTE: for v2 distributed training need averaging updates.
# The base rate 1e-3 is divided by the trainer count here and by BATCH_SIZE
# when it is handed to the optimizer below.
learning_rate = 1e-3 / NODE_COUNT
out = vgg16(image, class_dim=CLASS_DIM)
cost = paddle.layer.classification_cost(input=out, label=lbl)
# Create parameters
parameters = paddle.parameters.create(cost)
# Create optimizer
# The L2 rate is multiplied by BATCH_SIZE while the learning rate is divided
# by it.
optimizer = paddle.optimizer.Momentum(
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
BATCH_SIZE),
learning_rate=learning_rate / BATCH_SIZE,
learning_rate_decay_a=0.1,
learning_rate_decay_b=128000 * 35,
learning_rate_schedule="discexp", )
# Training data is shuffled through a 1000-sample buffer before batching.
train_reader = paddle.batch(
paddle.reader.shuffle(
cifar.train10(),
# To use other data, replace the above line with:
# reader.train_reader('train.list'),
buf_size=1000),
batch_size=BATCH_SIZE)
# Test data is batched without shuffling.
test_reader = paddle.batch(
cifar.test10(),
# To use other data, replace the above line with:
# reader.test_reader('val.list'),
batch_size=BATCH_SIZE)
# Create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=extra_layers,
is_local=False)
# End batch and end pass event handler
def event_handler(event):
# ts is the module-level iteration timestamp; ts_pass becomes a module
# global the first time a BeginPass event assigns it.
global ts, ts_pass
if isinstance(event, paddle.event.BeginPass):
ts_pass = time.time()
if isinstance(event, paddle.event.BeginIteration):
ts = time.time()
if isinstance(event, paddle.event.EndIteration):
# `% 1` is always 0, so every batch is logged; raise the modulus to log
# less often.
if event.batch_id % 1 == 0:
print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
time.time() - ts)
if isinstance(event, paddle.event.EndPass):
print "Pass %d end, spent: %f" % (event.pass_id,
time.time() - ts_pass)
# Evaluate on the test reader at the end of every pass.
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
# NOTE(review): num_passes is hard-coded to 200; this file does not read any
# environment variable for the pass count - confirm that this is intended.
trainer.train(
reader=train_reader, num_passes=200, event_handler=event_handler)
# Run the benchmark only when this file is executed as a script.
if __name__ == '__main__':
main()

@ -21,6 +21,7 @@ set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOO
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
include_directories(${BOOST_INCLUDE_DIR})

@ -186,6 +186,11 @@ function(cc_library TARGET_NAME)
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
endif()
if (cc_library_DEPS)
# Don't need link libwarpctc.so
if ("${cc_library_DEPS};" MATCHES "warpctc;")
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif()
@ -465,10 +470,10 @@ function(py_test TARGET_NAME)
if(WITH_TESTING)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()

@ -47,3 +47,5 @@ sphinx_add_target(paddle_docs_cn
${SPHINX_CACHE_DIR_CN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
add_subdirectory(api)

@ -0,0 +1,20 @@
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output directory
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
@ONLY)
sphinx_add_target(paddle_api_docs
html
${BINARY_BUILD_DIR_EN}
${SPHINX_CACHE_DIR_EN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})

@ -87,6 +87,11 @@ roi_pool
.. autoclass:: paddle.v2.layer.roi_pool
:noindex:
pad
----
.. autoclass:: paddle.v2.layer.pad
:noindex:
Norm Layer
==========
@ -133,6 +138,11 @@ grumemory
.. autoclass:: paddle.v2.layer.grumemory
:noindex:
gated_unit
-----------
.. autoclass:: paddle.v2.layer.gated_unit
:noindex:
Recurrent Layer Group
=====================
@ -340,6 +350,11 @@ bilinear_interp
.. autoclass:: paddle.v2.layer.bilinear_interp
:noindex:
dropout
--------
.. autoclass:: paddle.v2.layer.dropout
:noindex:
dot_prod
---------
.. autoclass:: paddle.v2.layer.dot_prod
@ -402,6 +417,11 @@ scale_shift
.. autoclass:: paddle.v2.layer.scale_shift
:noindex:
factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
:noindex:
Sampling Layers
===============
@ -420,22 +440,6 @@ multiplex
.. autoclass:: paddle.v2.layer.multiplex
:noindex:
Factorization Machine Layer
============================
factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
:noindex:
Slicing and Joining Layers
==========================
pad
----
.. autoclass:: paddle.v2.layer.pad
:noindex:
.. _api_v2.layer_costs:
Cost Layers
@ -526,6 +530,11 @@ multibox_loss
.. autoclass:: paddle.v2.layer.multibox_loss
:noindex:
detection_output
----------------
.. autoclass:: paddle.v2.layer.detection_output
:noindex:
Check Layer
============
@ -534,31 +543,10 @@ eos
.. autoclass:: paddle.v2.layer.eos
:noindex:
Miscs
=====
dropout
--------
.. autoclass:: paddle.v2.layer.dropout
:noindex:
Activation with learnable parameter
===================================
Activation
==========
prelu
--------
.. autoclass:: paddle.v2.layer.prelu
:noindex:
gated_unit
-----------
.. autoclass:: paddle.v2.layer.gated_unit
:noindex:
Detection output Layer
======================
detection_output
----------------
.. autoclass:: paddle.v2.layer.detection_output
:noindex:

@ -73,3 +73,10 @@ wmt14
.. automodule:: paddle.v2.dataset.wmt14
:members:
:noindex:
wmt16
+++++
.. automodule:: paddle.v2.dataset.wmt16
:members:
:noindex:

@ -42,7 +42,7 @@ The type *channel* is conceptually the blocking queue. In Go, its implemented i
The `select` operation has been in OS kernels long before Go language. All Unix kernels implement system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll.
It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.
### Type Channel
@ -71,14 +71,14 @@ ch1 := make(chan int, 100) // a channel that can buffer 100 ints.
In Fluid, we should be able to do the same:
```python
ch = fluid.make_chan(dtype=INT)
ch1 = fluid.make_chan(dtype=INT, 100)
ch = fluid.make_channel(dtype=INT)
ch1 = fluid.make_channel(dtype=INT, 100)
```
In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
```python
ch = fluid.make_chan(dtype=Tensor, etype=float16)
ch = fluid.make_channel(dtype=Tensor, etype=float16)
```
or Tensors of Tensors of float16 etc.
@ -87,8 +87,135 @@ The point here is that we need a consistent way to compose types, like in C++ we
### Send and Recv
Go's CSP implementation depends on data type *channel*. There are two types of channels:
1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. The sending to buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty.
1. The blocked channel, or unbuffered channel, is a blocking queue with no buffer. Both sending and receiving block with unbuffered channels.
There are four types of actions with a channel:
1. Create a channel
```go
ch := make(chan int) // this is an unbuffered channel
ch := make(chan int, 100) // this is a buffered channel of 100 ints.
```
1. Send
```go
ch <- 111
```
1. Recv
```go
y, ok := <-ch
```
1. Close
```go
close(ch)
```
Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
1. A send to a nil channel blocks forever
1. A receive from a nil channel blocks forever
1. A send to a closed channel panics
1. A receive from a closed channel returns the residual values and then zeros.
In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h)
The following program illustrates the Python syntax for accessing Fluid buffered channels.
```python
import fluid
buffer_size = 10
ch = fluid.make_channel(dtype=INT, buffer_size)
# Now write buffer_size elements to the channel
with fluid.while(steps=buffer_size):
fluid.send(ch, step)
fluid.close_channel(ch)
with fluid.while(steps=buffer_size):
fluid.print(fluid.recv(ch))
```
The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
```python
import fluid
ch = fluid.make_channel(dtype=INT)
with fluid.go():
fluid.send(ch)
y = fluid.recv(ch)
fluid.close_channel(ch)
```
### Select
In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
```go
ch1 := make(chan int)
ch2 := make(chan int, 100)
x := 0
for {
select {
case ch1 <- x:
x := x + 1
case y <- ch2:
fmt.Println("Received on channel")
default:
fmt.Println("Default")
}
}
```
In Fluid, we should be able to do the same:
```python
ch1 = fluid.make_channel(dtype=INT)
ch2 = fluid.make_channel(dtype=INT, 100)
sel = fluid.select()
with sel.case(ch1, 'w', X):
fluid.layers.increment(X)
with sel.case(ch2, 'r', Y):
fluid.print("Received on Channel")
with sel.default():
fluid.print("Default")
```
In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to write syntax in Python I/O.
- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to read syntax in Python I/O.
- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
## Example Programs
### 1. RPC between Trainers and Parameter Servers

@ -0,0 +1,32 @@
### Design Doc: Switch
### Background
Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
The following example shows the usage of `fluid.switch`.
```python
a = fluid.Var(10)
b = fluid.Var(0)
switch = fluid.switch()
with switch.block():
with switch.case(fluid.less_equal(a, 10)):
fluid.print("Case 1")
with switch.case(fluid.larger(a, 0)):
fluid.print("Case 2")
with switch.default():
fluid.print("Case 3")
```
### The Semantics
1. A `switch` control-flow checks cases one-by-one.
1. The condition of each case is a boolean value, which is a scalar. This differs from the `fluid.if_else` control-flow, whose condition could be a vector of boolean values.
1. It runs the first matched case; if no case matches, it runs the default case if there is one.
1. Once it matches a case, it runs the corresponding branch and only that branch. It's like there is a C's `break` keyword at the end of each case.
The above program should print and print only "Case 1".
The implementation of the backward pass of the `switch` control-flow is easier than that of `if_else`, because `switch` runs at most one branch, whereas `if_else` could run more than one branch.

@ -92,11 +92,11 @@ paddle.init(
参数说明
- use_gpu **可选默认False**是否启用GPU训练
- trainer_count**必选默认1**,当前训练任务trainer总个数
- trainer_count**必选默认1**,当前trainer的线程数目
- port**必选默认7164**连接到pserver的端口
- ports_num**必选默认1**连接到pserver的端口个数
- ports_num_for_sparse**必选默认0**和pserver之间用于稀疏类型参数通信的端口个数
- num_gradient_servers**必选默认1**,当前训练任务pserver总数
- num_gradient_servers**必选默认1**,当前训练任务trainer总数
- trainer_id**必选默认0**每个trainer的唯一ID从0开始的整数
- pservers**必选默认127.0.0.1**当前训练任务启动的pserver的IP列表多个IP使用“,”隔开

@ -95,11 +95,11 @@ paddle.init(
Parameter Description
- use_gpu: **optional, default False**, set to "True" to enable GPU training.
- trainer_count: **required, default 1**, total count of trainers in the training job.
- trainer_count: **required, default 1**, number of threads in current trainer.
- port: **required, default 7164**, port to connect to parameter server.
- ports_num: **required, default 1**, number of ports for communication.
- ports_num_for_sparse: **required, default 0**, number of ports for sparse type calculation.
- num_gradient_servers: **required, default 1**, total number of gradient server.
- num_gradient_servers: **required, default 1**, number of trainers in current job.
- trainer_id: **required, default 0**, ID for every trainer, start from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".

@ -8,4 +8,3 @@ PaddlePaddle 文档
howto/index_cn.rst
api/index_cn.rst
faq/index_cn.rst
mobile/index_cn.rst

@ -7,4 +7,3 @@ PaddlePaddle Documentation
getstarted/index_en.rst
howto/index_en.rst
api/index_en.rst
mobile/index_en.rst

@ -1,9 +0,0 @@
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_cn.md
cross_compiling_for_ios_cn.md
cross_compiling_for_raspberry_cn.md

@ -1,9 +0,0 @@
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_en.md
cross_compiling_for_ios_en.md
cross_compiling_for_raspberry_en.md

@ -534,7 +534,7 @@ ParamGradInfoMap AppendBackward(
auto root_block = program_desc.MutableBlock(root_block_idx);
std::string fill_one_op_out = GradVarName(target.Name());
bool is_scalar = target.Shape() == std::vector<int64_t>{1};
bool is_scalar = target.GetShape() == std::vector<int64_t>{1};
PADDLE_ENFORCE(is_scalar, "target should be scalar");
VLOG(3) << "backward from loss=" << target.Name()
<< " data_type=" << target.GetDataType();
@ -565,7 +565,7 @@ ParamGradInfoMap AppendBackward(
auto var = root_block->Var(fill_one_op_out);
var->SetDataType(target.GetDataType());
var->SetShape(target.Shape());
var->SetShape(target.GetShape());
auto& target_grad = retv[target.Name()];
target_grad.name_ = fill_one_op_out;
target_grad.block_idx_ = root_block_idx;

@ -23,8 +23,8 @@ namespace framework {
template <typename T>
class Channel {
public:
virtual void Send(T*) = 0;
virtual void Receive(T*) = 0;
virtual bool Send(T*) = 0;
virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0;
virtual void Close() = 0;
virtual ~Channel() {}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save