Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dist_train_benchmark_vgg16

emailweixu-patch-1
typhoonzero 7 years ago
commit 8d9c3fc65c

@ -9,7 +9,7 @@ import subprocess
import platform import platform
COPYRIGHT = ''' COPYRIGHT = '''
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.

@ -31,9 +31,6 @@ if(NOT CMAKE_CROSSCOMPILING)
endif(NOT CMAKE_CROSSCOMPILING) endif(NOT CMAKE_CROSSCOMPILING)
find_package(Git REQUIRED) find_package(Git REQUIRED)
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
if(NOT ANDROID AND NOT IOS)
find_package(Boost QUIET)
endif()
include(simd) include(simd)
@ -42,7 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON) option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
@ -140,6 +137,7 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc include(external/warpctc) # download, build, install warpctc
include(external/boost) # download, build, install boost
include(external/any) # download libn::any include(external/any) # download libn::any
include(external/eigen) # download eigen3 include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11 include(external/pybind11) # download pybind11
@ -164,7 +162,6 @@ include_directories("${PADDLE_SOURCE_DIR}")
include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include") include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
include_directories(${Boost_INCLUDE_DIRS})
set(EXTERNAL_LIBS set(EXTERNAL_LIBS
${GFLAGS_LIBRARIES} ${GFLAGS_LIBRARIES}

@ -27,7 +27,7 @@ RUN apt-get update && \
curl sed grep graphviz libjpeg-dev zlib1g-dev \ curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \ python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format swig doxygen cmake \ automake locales clang-format swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \ liblapack-dev liblapacke-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools libtool && \ net-tools libtool && \
apt-get clean -y apt-get clean -y

@ -0,0 +1,51 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Downloads and unpacks the Boost sources as an external project and exposes
# the resulting directory as an include path. Nothing is configured, built or
# installed: the CONFIGURE/BUILD/INSTALL/UPDATE commands below are all empty.
include(ExternalProject)

set(BOOST_PROJECT       "extern_boost")
set(BOOST_VER           "1.41.0")
set(BOOST_TAR           "boost_1_41_0")
set(BOOST_URL           "http://sourceforge.net/projects/boost/files/boost/${BOOST_VER}/${BOOST_TAR}.tar.gz")
set(BOOST_SOURCES_DIR   ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
# Presumably the directory that `tar zxf` creates inside the download
# directory -- TODO confirm the archive's top-level folder name matches
# BOOST_TAR.
set(BOOST_INCLUDE_DIR   "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)

include_directories(${BOOST_INCLUDE_DIR})

# NOTE(review): the archive is fetched with certificate verification disabled
# (--no-check-certificate) and without any checksum, so its integrity is not
# verified. Consider pinning a hash before relying on this in release builds.
ExternalProject_Add(
    ${BOOST_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DOWNLOAD_DIR          ${BOOST_DOWNLOAD_DIR}
    DOWNLOAD_COMMAND      wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
                          && tar zxf ${BOOST_TAR}.tar.gz
    DOWNLOAD_NO_PROGRESS  1
    PREFIX                ${BOOST_SOURCES_DIR}
    CONFIGURE_COMMAND     ""
    BUILD_COMMAND         ""
    INSTALL_COMMAND       ""
    UPDATE_COMMAND        ""
)

# `boost` is a placeholder target whose only job is to depend on the download
# step, so other targets can order themselves after it. CMake releases before
# 3.3 do not allow add_dependencies() on INTERFACE libraries, so those get a
# static library built from a generated one-line C file instead.
if(CMAKE_VERSION VERSION_LESS "3.3.0")
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
    add_library(boost STATIC ${dummyfile})
else()
    add_library(boost INTERFACE)
endif()

add_dependencies(boost ${BOOST_PROJECT})
list(APPEND external_project_dependencies boost)

# Mirror the path under the FindBoost-style variable name as well.
set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})

@ -224,12 +224,18 @@ function(cc_test TARGET_NAME)
if(WITH_TESTING) if(WITH_TESTING)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS}) add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif() endif()
endfunction(cc_test) endfunction(cc_test)
@ -457,7 +463,7 @@ endfunction()
function(py_test TARGET_NAME) function(py_test TARGET_NAME)
if(WITH_TESTING) if(WITH_TESTING)
set(options STATIC static SHARED shared) set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS) set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

@ -1,9 +1,14 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========== ===========
DataFeeder data_feeder
=========== ===========
DataFeeder DataFeeder
----------- ----------
.. automodule:: paddle.v2.fluid.data_feeder
:members: DataFeeder .. autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
:members:
:noindex: :noindex:

@ -1,9 +1,21 @@
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Evaluator !DO NOT EDIT THIS FILE MANUALLY!
===========
=========
Evaluator evaluator
----------- =========
.. automodule:: paddle.v2.fluid.evaluator
:members: Evaluator Accuracy
--------
.. autoclass:: paddle.v2.fluid.evaluator.Accuracy
:members:
:noindex: :noindex:
ChunkEvaluator
--------------
.. autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
:members:
:noindex:

@ -1,9 +1,32 @@
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Executor !DO NOT EDIT THIS FILE MANUALLY!
===========
========
executor
========
Executor Executor
--------
.. autoclass:: paddle.v2.fluid.executor.Executor
:members:
:noindex:
global_scope
------------
.. autofunction:: paddle.v2.fluid.executor.global_scope
:noindex:
scope_guard
----------- -----------
.. automodule:: paddle.v2.fluid.executor
:members: Executor .. autofunction:: paddle.v2.fluid.executor.scope_guard
:noindex:
switch_scope
------------
.. autofunction:: paddle.v2.fluid.executor.switch_scope
:noindex: :noindex:

@ -0,0 +1,109 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import sys
import types
import paddle.v2.fluid as fluid
def parse_arg():
    """Parse the command line of the documentation generator.

    Returns:
        argparse.Namespace with two attributes:
        ``module`` -- the positional module name (a string), and
        ``submodules`` -- a list of names given after ``--submodules``,
        or None when the option is absent.
    """
    arg_parser = argparse.ArgumentParser()
    # Optional: zero or more submodule names.
    arg_parser.add_argument('--submodules', nargs="*")
    # Required positional: the module to document.
    arg_parser.add_argument(
        'module', type=str, help='Generate the documentation of which module')
    parsed = arg_parser.parse_args()
    return parsed
class DocGenerator(object):
    """Emit Sphinx reStructuredText API stubs for one ``paddle.v2.fluid`` module.

    On construction the generator resolves ``fluid.<module_name>``, then writes
    a "generated file" banner and the page title to ``stream``. The
    ``print_*`` methods append one section per documented item.

    Args:
        module_name: attribute name of the module on ``paddle.v2.fluid``.
        stream: writable text stream that receives the output
            (defaults to ``sys.stdout``).

    Raises:
        ValueError: if ``fluid`` has no attribute called ``module_name``.
    """

    def __init__(self, module_name, stream=sys.stdout):
        self.stream = stream
        self.module_name = module_name
        if not hasattr(fluid, module_name):
            raise ValueError("Cannot find fluid.{0}".format(module_name))
        else:
            self.module = getattr(fluid, module_name)
        # An RST comment block: the continuation line must be indented to stay
        # part of the comment, and the blank line terminates it.
        self.stream.write(
            ".. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`\n"
            "    !DO NOT EDIT THIS FILE MANUALLY!\n"
            "\n")
        self._print_header_(module_name, dot='=', is_title=True)

    def print_submodule(self, submodule_name):
        """Write a section for ``submodule_name`` and every name in its ``__all__``.

        Raises:
            ValueError: if the module has no such submodule.
        """
        # Default to None so a missing submodule reaches the ValueError below
        # instead of escaping as an AttributeError from getattr().
        submodule = getattr(self.module, submodule_name, None)
        if submodule is None:
            raise ValueError("Cannot find submodule {0}".format(submodule_name))
        self.print_section(submodule_name)

        for item in submodule.__all__:
            self.print_item(item)

    def print_current_module(self):
        """Write an entry for every name listed in the module's ``__all__``."""
        for item in self.module.__all__:
            self.print_item(item)

    def print_section(self, name):
        """Write a section heading (underlined with ``=``, no overline)."""
        self._print_header_(name, dot='=', is_title=False)

    def print_item(self, name):
        """Write the entry for ``name``, dispatching on whether it is a class or function.

        The object is always looked up on the top-level module, including when
        the name came from a submodule's ``__all__``.

        Raises:
            RuntimeError: if the object is neither a class nor a plain function.
        """
        item = getattr(self.module, name)
        # `type` is the same object as Python 2's types.TypeType and, unlike
        # it, also exists on Python 3.
        if isinstance(item, type):
            self.print_class(name)
        elif isinstance(item, types.FunctionType):
            self.print_method(name)
        else:
            raise RuntimeError("Unsupported item {0}".format(name))

    def print_class(self, name):
        """Write a heading plus an ``autoclass`` directive for ``name``."""
        self._print_header_(name, dot='-', is_title=False)
        # Directive options must be indented under the directive, and a blank
        # line must separate the directive from the heading that follows.
        template = (".. autoclass:: paddle.v2.fluid.{0}.{1}\n"
                    "    :members:\n"
                    "    :noindex:\n"
                    "\n")
        self.stream.write(template.format(self.module_name, name))

    def print_method(self, name):
        """Write a heading plus an ``autofunction`` directive for ``name``."""
        self._print_header_(name, dot='-', is_title=False)
        template = (".. autofunction:: paddle.v2.fluid.{0}.{1}\n"
                    "    :noindex:\n"
                    "\n")
        self.stream.write(template.format(self.module_name, name))

    def _print_header_(self, name, dot, is_title):
        """Write ``name`` underlined with ``dot`` characters, then a blank line.

        When ``is_title`` is true the same line of ``dot`` characters is also
        written above the name (overline + underline).
        """
        dot_line = dot * len(name)
        if is_title:
            self.stream.write(dot_line)
            self.stream.write('\n')
        self.stream.write(name)
        self.stream.write('\n')
        self.stream.write(dot_line)
        self.stream.write('\n')
        self.stream.write('\n')
def main():
    """Entry point: document one fluid module, writing RST to standard output.

    With ``--submodules`` one section is written per listed submodule;
    without it, the names in the module's own ``__all__`` are documented.
    """
    options = parse_arg()
    generator = DocGenerator(options.module)

    if options.submodules is not None:
        for name in options.submodules:
            generator.print_submodule(name)
    else:
        generator.print_current_module()


if __name__ == '__main__':
    main()

@ -0,0 +1,7 @@
#!/bin/bash
# Regenerates the Fluid API reference pages by running gen_doc.py and
# redirecting its output into one .rst file per module, in the current
# directory.

# `layers` is documented per submodule, one section each.
python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst

# Every other module is documented from its own __all__ list.
# (`io` used to appear twice in this list, so io.rst was generated twice.)
for module in data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
do
  python gen_doc.py "${module}" > "${module}.rst"
done

@ -1,50 +1,35 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========== ===========
Initializer initializer
=========== ===========
Constant
--------
.. autoclass:: paddle.v2.fluid.initializer.Constant
Initializer :members:
-----------
.. automodule:: paddle.v2.fluid.initializer
:members: Initializer
:noindex:
ConstantInitializer
-------------------
.. automodule:: paddle.v2.fluid.initializer
:members: ConstantInitializer
:noindex: :noindex:
Uniform
-------
.. autoclass:: paddle.v2.fluid.initializer.Uniform
UniformInitializer :members:
------------------
.. automodule:: paddle.v2.fluid.initializer
:members: UniformInitializer
:noindex:
NormalInitializer
-----------------
.. automodule:: paddle.v2.fluid.initializer
:members: NormalInitializer
:noindex: :noindex:
Normal
------
XavierInitializer .. autoclass:: paddle.v2.fluid.initializer.Normal
----------------- :members:
.. automodule:: paddle.v2.fluid.initializer
:members: XavierInitializer
:noindex: :noindex:
Xavier
------
MSRAInitializer .. autoclass:: paddle.v2.fluid.initializer.Xavier
--------------- :members:
.. automodule:: paddle.v2.fluid.initializer
:members: MSRAInitializer
:noindex: :noindex:

@ -1,10 +1,61 @@
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
IO !DO NOT EDIT THIS FILE MANUALLY!
===========
==
io
==
save_vars
---------
is_parameter .. autofunction:: paddle.v2.fluid.io.save_vars
:noindex:
save_params
----------- -----------
.. autofunction:: paddle.v2.fluid.io.is_parameter
.. autofunction:: paddle.v2.fluid.io.save_params
:noindex:
save_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.save_persistables
:noindex:
load_vars
---------
.. autofunction:: paddle.v2.fluid.io.load_vars
:noindex:
load_params
-----------
.. autofunction:: paddle.v2.fluid.io.load_params
:noindex: :noindex:
load_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.load_persistables
:noindex:
save_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.save_inference_model
:noindex:
load_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.load_inference_model
:noindex:
get_inference_program
---------------------
.. autofunction:: paddle.v2.fluid.io.get_inference_program
:noindex:

File diff suppressed because it is too large Load Diff

@ -1,33 +1,31 @@
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Nets !DO NOT EDIT THIS FILE MANUALLY!
===========
====
nets
====
simple_img_conv_pool simple_img_conv_pool
-------------------- --------------------
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
:noindex:
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
img_conv_group
---------------
.. autofunction:: paddle.v2.fluid.nets.img_conv_group
:noindex: :noindex:
sequence_conv_pool sequence_conv_pool
------------------ ------------------
.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool .. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
:noindex: :noindex:
glu glu
--- ---
.. autofunction:: paddle.v2.fluid.nets.glu .. autofunction:: paddle.v2.fluid.nets.glu
:noindex: :noindex:
scaled_dot_product_attention
----------------------------
dot_product_attention .. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
---------------------
.. autofunction:: paddle.v2.fluid.nets.dot_product_attention
:noindex: :noindex:

@ -1,54 +1,49 @@
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Optimizer !DO NOT EDIT THIS FILE MANUALLY!
===========
Optimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: Optimizer
:noindex:
=========
optimizer
=========
SGDOptimizer SGD
----------- ---
.. automodule:: paddle.v2.fluid.optimizer
:members: SGDOptimizer
:noindex:
.. autoclass:: paddle.v2.fluid.optimizer.SGD
:members:
:noindex:
Momentum
--------
MomentumOptimizer .. autoclass:: paddle.v2.fluid.optimizer.Momentum
----------------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: MomentumOptimizer
:noindex: :noindex:
Adagrad
-------
.. autoclass:: paddle.v2.fluid.optimizer.Adagrad
AdagradOptimizer :members:
----------------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdagradOptimizer
:noindex: :noindex:
Adam
----
AdamOptimizer .. autoclass:: paddle.v2.fluid.optimizer.Adam
------------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamOptimizer
:noindex: :noindex:
Adamax
------
AdamaxOptimizer .. autoclass:: paddle.v2.fluid.optimizer.Adamax
----------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamaxOptimizer
:noindex: :noindex:
DecayedAdagrad
--------------
DecayedAdagradOptimizer .. autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
----------------------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: DecayedAdagradOptimizer
:noindex: :noindex:

@ -1,11 +1,21 @@
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==========
param_attr
==========
ParamAttr ParamAttr
=========== ---------
.. autoclass:: paddle.v2.fluid.param_attr.ParamAttr
:members:
:noindex:
WeightNormParamAttr
-------------------
ParamAttr .. autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
----------- :members:
.. automodule:: paddle.v2.fluid.param_attr
:members: ParamAttr
:noindex: :noindex:

@ -1,10 +1,25 @@
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Profiler !DO NOT EDIT THIS FILE MANUALLY!
===========
========
profiler
========
cuda_profiler
-------------
Profiler
-----------
.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler .. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
:noindex: :noindex:
reset_profiler
--------------
.. autofunction:: paddle.v2.fluid.profiler.reset_profiler
:noindex:
profiler
--------
.. autofunction:: paddle.v2.fluid.profiler.profiler
:noindex:

@ -1,25 +1,27 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========== ===========
Regularizer regularizer
=========== ===========
WeightDecayRegularizer append_regularization_ops
---------------------- -------------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: WeightDecayRegularizer
:noindex:
L2DecayRegularizer .. autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: L2DecayRegularizer
:noindex: :noindex:
L1Decay
-------
.. autoclass:: paddle.v2.fluid.regularizer.L1Decay
:members:
:noindex:
L1DecayRegularizer L2Decay
------------------- -------
.. automodule:: paddle.v2.fluid.regularizer
:members: L1DecayRegularizer
.. autoclass:: paddle.v2.fluid.regularizer.L2Decay
:members:
:noindex:

@ -0,0 +1,96 @@
# Design Doc: CSP in PaddlePaddle Fluid
## Motivation
Concurrent programming is important for deep learning. A few example applications are:
1. The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
2. The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
## Concurrent Programming Models
There are many concurrent programming models, implemented in various forms:
| concurrent programming model | implementation |
|-----|-----|
| mutex | types and functions in standard libraries |
| semaphore | types and functions in standard libraries |
| communicating sequential processes (CSP) | Go programming language |
| actor model | Erlang programming language |
| message passing | MPI |
| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
### CSP vs. Actor Model
A well-known implementation of Actor Model is the Erlang programming language. In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs. We can find the three ingredients, process with ID, send, and recv, in MPI too. Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code. Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv.
## CSP in Fluid
Fluid has two fundamental control-flows: *if-else* and *while*. If we are to implement CSP, we need the following:
1. a new data type: *channel* and operators *send* and *recv*,
1. *goroutine* or thread, and
1. a new control-flow: select.
We also need Python wrappers for the above components.
The type *channel* is conceptually a blocking queue. In Go, it is implemented as a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
The `select` operation has been in OS kernels long before Go language. All Unix kernels implement system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll.
It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
### Type Channel
Fluid supports many data types:
1. Tensor,
1. Row-sparse Tensor
1. LoD Tensor,
1. Tensor array, etc
Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value. To add a new type channel, we need to add a new type enum.
To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file. [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example of how we expose the C++ class LoDTensor.
## Syntax Design
### Create Channel
In Go, we create a channel by specifying the element type and buffer size:
```go
ch := make(chan int) // a channel without buffer
ch1 := make(chan int, 100) // a channel that can buffer 100 ints.
```
In Fluid, we should be able to do the same:
```python
ch = fluid.make_chan(dtype=INT)
ch1 = fluid.make_chan(dtype=INT, 100)
```
In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
```python
ch = fluid.make_chan(dtype=Tensor, etype=float16)
```
or Tensors of Tensors of float16 etc.
The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor<Tensor<...<float16>...> >`.
### Send and Recv
### Select
## Example Programs
### 1. RPC between Trainers and Parameter Servers
### 2. Concurrent Minibatch Loading

@ -152,12 +152,12 @@ for data in train_reader():
`JobDesc` object describe the distributed job resource specification to run on `JobDesc` object describe the distributed job resource specification to run on
Cluster environment. Cluster environment.
<img src="src/remote_executor.png"/> <img src="src/remote_executor.png" width="500" align="center" />
`RemoteExecutor.run` sends the `ProgramDesc` and `RemoteExecutor.run` sends the `ProgramDesc` and
[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource) [TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
to start the final Kubernetes Jobs to run the different role of `ProgramDesc`. to start the final Kubernetes Jobs to run the different role of `ProgramDesc` from `ConfigMap`.
### Placement Algorithm ### Placement Algorithm

@ -9,16 +9,16 @@ different purposes.
## Background ## Background
The previous implementations of the parameter server does not run a The previous implementations of the parameter server do not run a
fluid sub-program. Parameter initialization, optimizer computation, network fluid sub-program. Parameter initialization, optimizer computation, network
communication and checkpointing are implemented twice on both the communication and checkpointing are implemented twice on both the
trainer and the parameter server. trainer as well as the parameter server.
It would be great if we can write code once and use them on both the It would be great if we can write code once and use them on both: the
trainer and the parameter server: reduces code duplication and trainer and the parameter server, since this reduces code duplication and
improves extensibility. Given that after the current refactor, we are improves extensibility. Given that after the current refactoring, we are
representing everything as a computing graph on the representing everything as a computation graph on the
trainer. Representing everything as a computing graph on the parameter trainer. Representing everything as a computation graph on the parameter
server becomes a natural extension. server becomes a natural extension.
## Design ## Design
@ -30,9 +30,9 @@ into sub-programs to be scheduled on different nodes with the following
steps: steps:
1. OP placement: the OPs will be placed on different nodes according 1. OP placement: the OPs will be placed on different nodes according
to heuristic that minimizes estimated total computation to a heuristic that minimizes the estimated total computation
time. Currently we will use a simple heuristic that puts parameter time. Currently we will use a simple heuristic that puts parameter
varable on parameter server workers and everything else on trainer variable on parameter server workers and everything else on trainer
workers. workers.
1. Add communication OPs to enable the communication between nodes. 1. Add communication OPs to enable the communication between nodes.
@ -47,22 +47,22 @@ After converting:
<img src="src/dist-graph.png" width="700"/> <img src="src/dist-graph.png" width="700"/>
1. The parameter variable W and it's optimizer program are placed on the parameter server. 1. The parameter variable W and its optimizer program are placed on the parameter server.
1. Operators are added to the program. 1. Operators are added to the program.
- *Send* sends data to the connected *Recv* operator. The - *Send* sends data to the connected *Recv* operator. The
scheduler on the receive node will only schedule *Recv* operator scheduler on the receive node will only schedule *Recv* operator
to run when the *Send* operator has ran (the *Send* OP will mark to run when the *Send* operator has ran (the *Send* OP will mark
the *Recv* OP runnable automatically). the *Recv* OP runnable automatically).
- *Enueue* enqueues the input variable, it can block until space - *Enqueue* enqueues the input variable, it can block until space
become available in the queue. become available in the queue.
- *Dequeue* outputs configurable numbers of tensors from the - *Dequeue* outputs configurable numbers of tensors from the
queue. It will block until the queue have the required number of queue. It will block until the queue has the required number of
tensors. tensors.
### Benefits ### Benefits
- Model parallelism become easier to implement: it's an extension to - Model parallelism becomes easier to implement: it is an extension to
the trainer - parameter server approach. We can have several "Transpilers" the trainer - parameter server approach. We can have several "Transpilers"
to achieve different goals. to achieve different goals.
- User-defined optimizer is easier to add - user can now express it as - User-defined optimizer is easier to add - user can now express it as
@ -72,22 +72,22 @@ After converting:
### Challenges ### Challenges
- It's important to balance the parameter shards of on multiple - It is important to balance the parameter shards on multiple
parameter server. If a single parameter is very big (some parameter servers. If a single parameter is very big (for example: some
word-embedding, fully connected, softmax layer), we need to word-embedding, fully connected, softmax layer), we need to
automatically partition the single parameter onto different automatically partition the single parameter onto different
parameter servers when possible (only element-wise optimizer depends parameter servers when possible (only element-wise optimizer depends
on the parameter variable). on the parameter variable).
- In the "Aync SGD" figure, the "W" variable on the parameter server - In the "Async SGD" figure, the "W" variable on the parameter server
could be read and wrote concurrently. See could be read and written concurrently. See
[here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
details about concurrent program in fluid. details about concurrent program in Fluid.
### Discussion ### Discussion
- Can the Enqueue OP be implemented under our current tensor design - Can the Enqueue OP be implemented under our current tensor design
(puts the input tensor into the queue tensor)? (put the input tensor into the queue tensor)?
- *Dequeue* OP will have variable numbers of output (depends on the - *Dequeue* OP will have variable numbers of output (depending on the
`min_count` attribute), does our current design support it? (similar `min_count` attribute), does our current design support it? (similar
question for the *Add* OP) question for the *Add* OP)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 134 KiB

After

Width:  |  Height:  |  Size: 118 KiB

@ -22,7 +22,7 @@ The current `LoDTensor` is designed to store levels of variable-length sequences
The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
let's call this format the **absolute-offset LoD** for clarity. let's call this format the **absolute-offset LoD** for clarity.
The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
```python ```python
[[0, 3, 9] [[0, 3, 9]
[0, 2, 3, 3, 3, 9]] [0, 2, 3, 3, 3, 9]]
@ -119,7 +119,7 @@ def generate():
encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word) encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
decoder_input = pd.fc( decoder_input = pd.fc(
act=pd.activation.Linear(), act=pd.activation.Linear(),
input=[target_word, encoder_ctx], input=[target_word, encoder_ctx_expanded],
size=3 * decoder_dim) size=3 * decoder_dim)
gru_out, cur_mem = pd.gru_step( gru_out, cur_mem = pd.gru_step(
decoder_input, mem=decoder_mem, size=decoder_dim) decoder_input, mem=decoder_mem, size=decoder_dim)

@ -140,7 +140,19 @@ TODO by Assignees
### Beam Search with CTC and LM ### Beam Search with CTC and LM
TODO by Assignees <div align="center">
<img src="image/beam_search.png" width=600><br/>
Figure 2. Algorithm for CTC Beam Search Decoder.
</div>
- The **Beam Search Decoder** for the DS2 CTC-trained network follows an approach similar to that in \[[3](#references)\], as shown in Figure 2, with two important modifications for the ambiguous parts:
- 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, because one prefix may come from different paths;
- 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding, whenever a white space is appended in English decoding or any character is appended in Mandarin decoding.
- Such external scorer consists of language model, word count or any other custom scorers.
- The **language model** is built from Task 5, and its parameters should be carefully tuned to achieve minimum WER/CER (cf. Task 7).
- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
## Future Work ## Future Work
@ -153,3 +165,4 @@ TODO by Assignees
1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016. 1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595. 2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595.
3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save