Merge branch 'develop' into core_add_sequence_softmax_op

tonyyang-svail-feed-op-desgin
Liu Yiqun 7 years ago
commit c8fc6037fd

@ -22,5 +22,5 @@ def initHook(settings, height, width, color, num_class, **kwargs):
def process(settings, file_list):
for i in xrange(1024):
img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
lab = random.randint(0, settings.num_class)
lab = random.randint(0, settings.num_class - 1)
yield img.astype('float32'), int(lab)

@ -0,0 +1,48 @@
set -e
unset OMP_NUM_THREADS MKL_NUM_THREADS
export OMP_DYNAMIC="FALSE"
export KMP_AFFINITY="granularity=fine,compact,0,0"
function train() {
topology=$1
bs=$2
use_mkldnn=$3
if [ $3 == "True" ]; then
thread=1
log="logs/${topology}-mkldnn-${bs}.log"
elif [ $3 == "False" ]; then
thread=`nproc`
log="logs/${topology}-${thread}mklml-${bs}.log"
else
echo "Wrong input $3, use True or False."
fi
args="batch_size=${bs}"
config="${topology}.py"
paddle train --job=time \
--config=$config \
--use_mkldnn=$use_mkldnn \
--use_gpu=False \
--trainer_count=$thread \
--log_period=10 \
--test_period=100 \
--config_args=$args \
2>&1 | tee ${log}
}
if [ ! -d "train.list" ]; then
echo " " > train.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
#========== mkldnn ==========#
train vgg 64 True
train vgg 128 True
train vgg 256 True
#========== mklml ===========#
train vgg 64 False
train vgg 128 False
train vgg 256 False

@ -0,0 +1,103 @@
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg('layer_num', int, 19)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
img = data_layer(name='image', size=height * width * 3)
def vgg_network(vgg_num=3):
tmp = img_conv_group(
input=img,
num_channels=3,
conv_padding=1,
conv_num_filter=[64, 64],
conv_filter_size=3,
conv_act=ReluActivation(),
pool_size=2,
pool_stride=2,
pool_type=MaxPooling())
tmp = img_conv_group(
input=tmp,
conv_num_filter=[128, 128],
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
channels = []
for i in range(vgg_num):
channels.append(256)
tmp = img_conv_group(
input=tmp,
conv_num_filter=channels,
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
channels = []
for i in range(vgg_num):
channels.append(512)
tmp = img_conv_group(
input=tmp,
conv_num_filter=channels,
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
tmp = img_conv_group(
input=tmp,
conv_num_filter=channels,
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
tmp = fc_layer(
input=tmp,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
tmp = fc_layer(
input=tmp,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
if layer_num == 16:
vgg = vgg_network(3)
elif layer_num == 19:
vgg = vgg_network(4)
else:
print("Wrong layer number.")
lab = data_layer('label', num_class)
loss = cross_entropy(input=vgg, label=lab)
outputs(loss)

@ -253,7 +253,7 @@ function(nv_library TARGET_NAME)
foreach(source_file ${nv_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})

@ -97,6 +97,10 @@ function(link_paddle_exe TARGET_NAME)
target_link_libraries(${TARGET_NAME} log)
endif(ANDROID)
if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
endif()
add_dependencies(${TARGET_NAME} ${external_project_dependencies})
endfunction()

@ -0,0 +1,222 @@
# Design Doc: Distributed Training Architecture
## Abstract
PaddlePaddle v0.10.0 uses the "trainer-parameter server"
architecture. We run multiple replicated instances of trainers (runs
the same code written by the user) and parameter servers for
distributed training. This architecture served us well, but has some
limitations:
1. Need to write special code to handle tasks which should only be run
by a single trainer. E.g., initializing model and saving model.
2. Model parallelism is hard: need to write if-else branches conditioned
on the trainer ID to partition model onto each trainer, and manually
write the inter-model-shard communication code.
3. The user can not directly specify the parameter update rule: need
to modify the parameter server C++ code and compile a new
binary. This adds complication for researchers: A lot of extra
effort is required. Besides, the training job submission program
may not allow running arbitrary binaries.
This design doc discusses PaddlePaddle's new distributed training
architecture that addresses the above limitations.
## Analysis
We will assume the user writes the trainer program by Python, the same
analysis holds if the trainer program is written in C++.
### Limitation 1
If we look at the Python code that the user writes, there are two
kinds of functionalities:
- The training logic such as load / save model and print log.
- The neural network definition such as the definition of the data
layer, the fully connected layer, the cost function and the
optimizer.
When we training with PaddlePaddle v0.10.0 distributedly, multiple
replicated Python instances are running on different nodes: both the
training logic and the neural network computation is replicated.
The tasks that should only run once all belong to the training logic,
if we only replicate the neural network computation, but do **not**
replicate the training logic, the limitation could be solved.
### Limitation 2
Model parallelism means running a single model on multiple nodes by
partitioning the model onto different nodes and managing the
inter-model-shard communications.
PaddlePaddle should be able to modify the nerual network computation
definition to support model parallelism automatically. However, the
computation is only specified in Python code, and PaddlePaddle can not
modify Python code.
Just like compiler uses a intermediate representation (IR) so that
programmer does not need to manually optimize their code in most of
the cases - the compiler will optimize the IR:
<img src="src/compiler.png"/>
We can have our own IR too: PaddlePaddle can support model parallel by
converting the IR so the user no longer need to manually do it in
Python:
<img src="src/paddle-compile.png"/>
The IR for PaddlePaddle after refactor is called `Block`, it specifies
the computation dependency graph and the variables used in the
computation.
### Limitation 3
The user can not directly specify the parameter update rule for the
parameter server because the parameter server does not use the same
computation definition as the trainer. Instead, the update rule is
baked in the parameter server. The user can not specify the update
rule in the same way of specifying the trainer computation.
This could be fixed by making the parameter server run the same
computation definition as the trainer. For a detailed explanation,
please
see
[Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
## Distributed Training Architecture
The new distributed training architecture can address the above
limitations. Below is the illustration:
<img src="src/distributed_architecture.png"/>
The architecture includes major components: *PaddlePaddle Python*,
*PaddlePaddle converter* and *PaddlePaddle runtime*:
### PaddlePaddle Python
PaddlePaddle Python is the Python library that user's Python trainer
invoke to build the neural network topology, start training, etc.
```Python
paddle.init()
input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster
img, label = input[0], input[1]
hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh())
prediction = paddle.layer.fc(input=img, size=10, act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(input=prediction, label=label)
optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01)
session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1)
for i in range(1000):
_, cost_val = session.eval(targets=[cost, optimizer])
print cost_val
```
The code above is a typical Python trainer code, the neural network
topology is built using helper functions such as
`paddle.layer.fc`. The training is done by calling `session.eval`
iteratively.
#### session.eval
As shown in the graph, `session.eval` sends the IR and the evaluation
inputs/targets to the PaddlePaddle cluster for evaluation. The
targets can be any variable in the computation graph. When the target
is the `optimizer` variable, the neural network will be optimized
once. When the target is the `cost` variable, `session.eval` returns
the cost value.
The Python `session` is a wrapper of the C++ `Session` class. For more
information about `Session`, please
see [Design Doc: Session](./session.md).
### PaddlePaddle Converter
PaddlePaddle converter automatically converts the IR in the request
(IR and evaluation inputs/targets) from PaddlePaddle Python to new
partitioned IRs and dispatch the new IRs and evaluation inputs/targets
to different PaddlePaddle runtimes. Below are the steps:
1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
fetches the eval targets to the IR.
1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
the boundary. The runtime does not need to run the OP that is not
dependent by the `fetch` OP.
1. Optimizes the computation graph.
1. Place the OPs in the graph onto different devices on different
PaddlePaddle runtime according to a placement algorithm and device
constraint specified by the user.
1. Partition the graph according to runtime boundaries and add `send` /
`recv` OP pair on the runtime boundaries.
1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
1. PaddlePaddle runtimes with the `fetch` OP reports evaluation
results back to the converter, the convert reports the evaluation
results back to the PaddlePaddle Python.
The output IRs will be cached to optimize the conversion latency.
#### Placement Algorithm
Our first implementation will only support "trainer-parameter server"
placement: the parameters, initializers, and optimizers are placed on
the PaddlePaddle runtimes with the parameter server role. And
everything else will be placed on the PaddlePaddle runtimes with the
trainer role. This has the same functionality of our
"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
is more general and flexible.
In the future, we will implement the general placement algorithm,
which makes placements according to the input IR, and a model of
device computation time and device communication time. Model
parallelism requires the general placement algorithm.
### PaddlePaddle Runtime
The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
runs the IR. The runtime does not need to do OP placement since it's
already done by the converter.
### Local Training Architecture
The local training architecture will be the same as the distributed
training architecture, the differences are everything runs locally,
and there is just one PaddlePaddle runtime:
<img src="src/local_architecture.png"/>
### Training Data
In PaddlePaddle v0.10.0, training data is typically read
with [data reader](../reader/README.md) from Python. This approach is
no longer efficient when training distributedly since the Python
process no longer runs on the same node with the trainer processes,
the Python reader will need to read from the distributed filesystem
(assuming it has the access) and send to the trainers, doubling the
network traffic.
When doing distributed training, the user can still use Python data
reader: the training data are sent with `session.eval`. However should
be used for debugging purpose only. The users are encouraged to use
the read data OPs.
## References:
[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf)

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Before

Width:  |  Height:  |  Size: 222 KiB

After

Width:  |  Height:  |  Size: 222 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

Before

Width:  |  Height:  |  Size: 28 KiB

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

@ -247,11 +247,11 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID相同名字
CMake Warning at cmake/version.cmake:20 (message):
Cannot add paddle version from git tag
那么用户需要拉取所有的远程分支到本机,命令为 :code:`git fetch upstream`然后重新cmake即可。
12. A protocol message was rejected because it was too big
----------------------------------------------------------
------------------------------------------------------------
如果在训练NLP相关模型时出现以下错误
@ -316,10 +316,42 @@ Paddle二进制在运行时捕获了浮点数异常只要出现浮点数异
* 模型一直不收敛,发散到了一个数值特别大的地方。
* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。
主要的解决办法是减小学习率或者对数据进行归一化处理。
这里有两种有效的解决方法:
1. 设置 :code:`gradient_clipping_threshold` 参数,示例代码如下:
.. code-block:: python
optimizer = paddle.optimizer.RMSProp(
learning_rate=1e-3,
gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
具体可以参考 `nmt_without_attention <https://github.com/PaddlePaddle/models/blob/develop/nmt_without_attention/train.py#L35>`_ 示例。
2. 设置 :code:`error_clipping_threshold` 参数,示例代码如下:
.. code-block:: python
decoder_inputs = paddle.layer.fc(
act=paddle.activation.Linear(),
size=decoder_size * 3,
bias_attr=False,
input=[context, current_word],
layer_attr=paddle.attr.ExtraLayerAttribute(
error_clipping_threshold=100.0))
完整代码可以参考示例 `machine translation <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py#L66>`_
两种方法的区别:
1. 两者都是对梯度的截断,但截断时机不同,前者在 :code:`optimzier` 更新网络参数时应用;后者在激活函数反向计算时被调用;
2. 截断对象不同:前者截断可学习参数的梯度,后者截断回传给前层的梯度;
除此之外,还可以通过减小学习律或者对数据进行归一化处理来解决这类问题。
15. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2
------------------------------------------------------------------------
------------------------------------------------------------------------------------------
先查看一下是否曾经安装过paddle v1版本有的话需要先卸载
pip uninstall py_paddle paddle
@ -329,7 +361,7 @@ pip uninstall py_paddle paddle
pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
16. PaddlePaddle存储的参数格式是什么如何和明文进行相互转化
---------------------------------------------------------
---------------------------------------------------------------------
PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数两部分组成。头信息中1~4字节表示PaddlePaddle版本信息请直接填充05~8字节表示每个参数占用的字节数当保存的网络参数为float类型时为4double类型时为89~16字节表示保存的参数总个数。
@ -381,7 +413,7 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数
parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
18. 集群多节点训练,日志中保存均为网络通信类错误
------------------------------
-----------------------------------------------------------
集群多节点训练,日志报错为网络通信类错误,比如 :code:`Connection reset by peer` 等。
此类报错通常是由于某一个节点的错误导致这个节点的训练进程退出,从而引发其他节点无法连接导致,可以参考下面的步骤排查:
@ -390,4 +422,171 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数
* 如果发现最早的报错就是网络通信的问题很有可能是非独占方式执行导致的端口冲突可以联系OP看当前MPI集群是否支持resource=full参数提交如果支持增加此参数提交并更换job 端口。
* 如果当前MPI集群并不支持任务独占模式可以联系OP是否可以更换集群或升级当前集群。
* 如果当前MPI集群并不支持任务独占模式可以联系OP是否可以更换集群或升级当前集群。
19. 如何调用 infer 接口输出多个layer的预测结果
-----------------------------------------------------------
* 将需要输出的层作为 :code:`paddle.inference.Inference()` 接口的 :code:`output_layer` 参数输入,代码如下:
.. code-block:: python
inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters)
* 指定要输出的字段进行输出。以输出 :code:`value` 字段为例,代码如下:
.. code-block:: python
out = inferer.infer(input=data_batch, field=["value"])
需要注意的是:
* 如果指定了2个layer作为输出层实际上需要的输出结果是两个矩阵
* 假设第一个layer的输出A是一个 N1 * M1 的矩阵,第二个 Layer 的输出B是一个 N2 * M2 的矩阵;
* paddle.v2 默认会将A和B 横向拼接当N1 和 N2 大小不一样时,会报如下的错误:
.. code-block:: python
ValueError: all the input array dimensions except for the concatenation axis must match exactly
多个层的输出矩阵的高度不一致导致拼接失败,这种情况常常发生在:
* 同时输出序列层和非序列层;
* 多个输出层处理多个不同长度的序列;
此时可以在调用infer接口时通过设置 :code:`flatten_result=False` , 跳过“拼接”步骤来解决上面的问题。这时infer接口的返回值是一个python list:
* list 中元素的个数等于网络中输出层的个数;
* list 中每个元素是一个layer的输出结果矩阵类型是numpy的ndarray
* 每一个layer输出矩阵的高度在非序列输入时等于样本数序列输入时等于输入序列中元素的总数宽度等于配置中layer的size
20. :code:`paddle.layer.memory` 的参数 :code:`name` 如何使用
-------------------------------------------------------------
* :code:`paddle.layer.memory` 用于获取特定layer上一时间步的输出该layer是通过参数 :code:`name` 指定,即,:code:`paddle.layer.memory` 会关联参数 :code:`name` 取值相同的layer并将该layer上一时间步的输出作为自身当前时间步的输出。
* PaddlePaddle的所有layer都有唯一的name用户通过参数 :code:`name` 设定当用户没有显式设定时PaddlePaddle会自动设定。而 :code:`paddle.layer.memory` 不是真正的layer其name由参数 :code:`memory_name` 设定当用户没有显式设定时PaddlePaddle会自动设定。:code:`paddle.layer.memory` 的参数 :code:`name` 用于指定其要关联的layer需要用户显式设定。
21. 两种使用 drop_out 的方法有何区别?
-----------------------------------------------------
* 在PaddlePaddle中使用dropout有两种方式
* 在相应layer的 :code:`layer_atter` 设置 :code:`drop_rate`,以 :code:`paddle.layer.fc` 为例,代码如下:
.. code-block:: python
fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
* 使用 :code:`paddle.layer.dropout`,以 :code:`paddle.layer.fc` 为例,代码如下:
.. code-block:: python
fc = paddle.layer.fc(input=input)
drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
* :code:`paddle.layer.dropout` 实际上使用了 :code:`paddle.layer.add_to`并在该layer里采用第一种方式设置 :code:`drop_rate` 来使用dropout的。这种方式对内存消耗较大。
* PaddlePaddle在激活函数里实现dropout而不是在layer里实现。
* :code:`paddle.layer.lstmemory`:code:`paddle.layer.grumemory`:code:`paddle.layer.recurrent` 不是通过一般的方式来实现对输出的激活所以不能采用第一种方式在这几个layer里设置 :code:`drop_rate` 来使用dropout。若要对这几个layer使用dropout可采用第二种方式即使用 :code:`paddle.layer.dropout`
22. 如何设置学习率退火learning rate annealing
------------------------------------------------
在相应的优化算法里设置learning_rate_schedule及相关参数以使用Adam算法为例代码如下
.. code-block:: python
optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
learning_rate_decay_a=0.5,
learning_rate_decay_b=0.75,
learning_rate_schedule="poly",)
PaddlePaddle目前支持8种learning_rate_schedule这8种learning_rate_schedule及其对应学习率计算方式如下
* "constant"
lr = learning_rate
* "poly"
lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
其中num_samples_processed为已训练样本数下同。
* "caffe_poly"
lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
* "exp"
lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
* "discexp"
lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
* "linear"
lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
* "manual"
这是一种按已训练样本数分段取值的学习率退火方法。使用该learning_rate_schedule时用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例代码如下
.. code-block:: python
optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
learning_rate_schedule="manual",
learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
在该示例中当已训练样本数小于等于1000时学习率为 :code:`1e-3 * 1.0`当已训练样本数大于1000小于等于2000时学习率为 :code:`1e-3 * 0.9`当已训练样本数大于2000时学习率为 :code:`1e-3 * 0.8`
* "pass_manual"
这是一种按已训练pass数分段取值的学习率退火方法。使用该learning_rate_schedule时用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例代码如下
.. code-block:: python
optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
learning_rate_schedule="manual",
learning_rate_args="1:1.0,2:0.9,3:0.8",)
在该示例中当已训练pass数小于等于1时学习率为 :code:`1e-3 * 1.0`当已训练pass数大于1小于等于2时学习率为 :code:`1e-3 * 0.9`当已训练pass数大于2时学习率为 :code:`1e-3 * 0.8`
23. 出现 :code:`Duplicated layer name` 错误怎么办
--------------------------------------------------
出现该错误的原因一般是用户对不同layer的参数 :code:`name` 设置了相同的取值。遇到该错误时,先找出参数 :code:`name` 取值相同的layer然后将这些layer的参数 :code:`name` 设置为不同的值。
24. PaddlePaddle 中不同的 recurrent layer 的区别
--------------------------------------------------
以LSTM为例在PaddlePaddle中包含以下 recurrent layer
* :code:`paddle.layer.lstmemory`
* :code:`paddle.networks.simple_lstm`
* :code:`paddle.networks.lstmemory_group`
* :code:`paddle.networks.bidirectional_lstm`
按照具体实现方式可以归纳为2类
1. 由 recurrent_group 实现的 recurrent layer
* 用户在使用这一类recurrent layer时可以访问由recurrent unit在一个时间步内计算得到的中间值例如hidden states, memory cells等
* 上述的 :code:`paddle.networks.lstmemory_group` 是这一类的 recurrent layer
2. 将recurrent layer作为一个整体来实现
* 用户在使用这一类recurrent layer只能访问它们的输出值
* 上述的 :code:`paddle.networks.lstmemory_group`:code:`paddle.networks.simple_lstm`:code:`paddle.networks.bidirectional_lstm` 属于这一类的实现;
将recurrent layer作为一个整体来实现 能够针对CPU和GPU的计算做更多优化 所以相比于recurrent group的实现方式 第二类 recurrent layer 计算效率更高。 在实际应用中如果用户不需要访问LSTM的中间变量而只需要获得recurrent layer计算的输出我们建议使用第二类实现。
此外关于LSTM, PaddlePaddle中还包含 :code:`paddle.networks.lstmemory_unit` 这一计算单元:
* 不同于上述介绍的recurrent layer , :code:`paddle.networks.lstmemory_unit` 定义了LSTM单元在一个时间步内的计算过程它并不是一个完整的recurrent layer也不能接收序列数据作为输入
* :code:`paddle.networks.lstmemory_unit` 只能在recurrent_group中作为step function使用

@ -20,7 +20,7 @@ Docker使用入门
docker pull paddlepaddle/paddle:0.10.0
来下载Docker镜像paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的推荐国内用户使用ocker.paddlepaddle.org/paddle下载。
来下载Docker镜像paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的推荐国内用户使用docker.paddlepaddle.org/paddle下载。
- *容器* 如果说一个Docker镜像就是一个程序那容器就是这个程序运行时产生的“进程”。
实际上,一个容器就是一个操作系统的进程,但是是运行在独立的进程空间,文件系统以及网络之上。

File diff suppressed because it is too large Load Diff

@ -26,7 +26,7 @@ cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder op_proto_maker)
cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder op_proto_maker op_info)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)

@ -28,47 +28,6 @@ ProgramDesc& GetProgramDesc() {
return *g_program_desc;
}
template <>
AttrType AttrTypeID<bool>() {
return BOOLEAN;
}
template <>
AttrType AttrTypeID<int>() {
return INT;
}
template <>
AttrType AttrTypeID<float>() {
return FLOAT;
}
template <>
AttrType AttrTypeID<std::string>() {
return STRING;
}
template <>
AttrType AttrTypeID<std::vector<bool>>() {
return BOOLEANS;
}
template <>
AttrType AttrTypeID<std::vector<int>>() {
return INTS;
}
template <>
AttrType AttrTypeID<std::vector<float>>() {
return FLOATS;
}
template <>
AttrType AttrTypeID<std::vector<std::string>>() {
return STRINGS;
}
template <>
AttrType AttrTypeID<std::vector<std::pair<int, int>>>() {
return INT_PAIRS;
}
template <>
AttrType AttrTypeID<BlockDesc>() {
return BLOCK;
}
Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
switch (attr_desc.type()) {
case framework::AttrType::BOOLEAN: {
@ -111,14 +70,6 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
}
return val;
}
case framework::AttrType::INT_PAIRS: {
std::vector<std::pair<int, int>> val(attr_desc.int_pairs_size());
for (int i = 0; i < attr_desc.int_pairs_size(); ++i) {
val[i].first = attr_desc.int_pairs(i).first();
val[i].second = attr_desc.int_pairs(i).second();
}
return val;
}
case framework::AttrType::BLOCK: {
return GetProgramDesc().mutable_blocks(attr_desc.block_idx());
}

@ -27,10 +27,10 @@ limitations under the License. */
namespace paddle {
namespace framework {
typedef boost::variant<boost::blank, bool, int, float, std::string,
std::vector<bool>, std::vector<int>, std::vector<float>,
std::vector<std::string>,
std::vector<std::pair<int, int>>, BlockDesc*>
// The order should be as same as framework.proto
typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>, bool,
std::vector<bool>, BlockDesc*>
Attribute;
typedef std::unordered_map<std::string, Attribute> AttributeMap;
@ -38,10 +38,28 @@ typedef std::unordered_map<std::string, Attribute> AttributeMap;
ProgramDesc& GetProgramDesc();
template <typename T>
AttrType AttrTypeID();
inline AttrType AttrTypeID() {
Attribute tmp = T();
return static_cast<AttrType>(tmp.which() - 1);
}
Attribute GetAttrValue(const OpDesc::Attr& attr_desc);
class AttrReader {
public:
explicit AttrReader(const AttributeMap& attrs) : attrs_(attrs) {}
template <typename T>
inline const T& Get(const std::string& name) const {
PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
name);
return boost::get<T>(attrs_.at(name));
}
private:
const AttributeMap& attrs_;
};
// check whether a value(attribute) fit a certain limit
template <typename T>
class GreaterThanChecker {

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save