Merge branch 'develop' into release/0.10.0

feature/design_of_v2_layer_converter
Luo Tao 8 years ago
commit c810e6a7c0

@ -0,0 +1,28 @@
| Github account | name |
|---|---|
| reyoung | Yang Yu |
| gangliao | Gang Liao |
| luotao01 | Tao Luo |
| jacquesqiao | Long-Fei Qiao |
| qingqing01 | Qing-Qing Dang |
| hedaoyuan | Dao-Yuan He |
| wangyang59 | Yang Wang |
| QiJune | Jun Qi |
| tianbingsz | Tian-Bing Xu |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| typhoonzero | Yi Wu |
| backyes | Yan-Fei Wang |
| pengli09 | Peng Li |
| livc | Zhao Li |
| Xreki | Yi-Qun Liu |
| Yancey1989 | Xu Yan |
| emailweixu | Wei Xu |
| wen-bo-yang | Wen-Bo Yang |
| helinwang | He-Lin Wang |
| lcy-seso | Ying Cao |
| Zrachel | Rui-Qing Zhang |
| Haichao-Zhang | Hai-Chao Zhang |
| gongweibao | Wei-Bao Gong |
| lzhao4ever | Liang Zhao |
| zhouxiao-coder | Xiao Zhou |
| lipeng-unisound | Peng Li |

@ -1,19 +1,19 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
include(system)
@ -50,6 +50,7 @@ option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
@ -75,6 +76,13 @@ endif(ANDROID)
set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
if (WITH_C_API AND WITH_PYTHON)
message(WARNING "It is suggested not to embed a Python interpreter in Paddle "
"when using the C-API. Using a Python interpreter different from the one "
"used at compile time can lead to unpredictable behavior.")
endif()
########################################################################################
include(external/zlib) # download, build, install zlib

@ -46,6 +46,11 @@ RUN pip install --upgrade pip && \
pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
RUN apt-get install -y libssl-dev libffi-dev
RUN pip install certifi urllib3[secure]
RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
cd .. && rm -rf cmake-3.4.1

@ -1,8 +1,6 @@
# Release v0.10.0
We are glad to release version 0.10.0. In this version, we are happy to
release the
new
We are glad to release version 0.10.0. In this version, we are happy to release the new
[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/).
- Our old Python API is kind of out of date. It's hard to learn and hard to

@ -1,56 +0,0 @@
Cao, Ying
Cheng, Yujuan
Dang, Qingqing
Dong, Tengfei
Du, Dalong
Feng, Shouqiang
Gao, Haoyuan
Han, Baochang
Han, Jinchen
Hao, Nanyu
He, Daoyuan
He, Zhengyan
Hou, Jue
Huang, Chang
Huang, Zhiheng
Hu, Na
Kong, Qi
Liao, Gang
Li, Bo
Li, Jiajie
Li, Jing
Li, Lei
Li, Peng
Liu, Sheng
Liu, Yuan
Li, Yuze
Luo, Heng
Luo, Tao
Lyu, Qin
Mao, Hongyue
Qian, Xiaojun
Qiao, Longfei
Qi, Jun
Qin, Duohao
Shen, Guolong
Shi, Guangchuan
Song, Xiang
Wang, Helin
Wang, Jiang
Wang, Yanfei
Wang, Yi
Wang, Yong
Weng, Renliang
Xu, Tianbing
Xu, Wei
Xu, Xingyu
Yan, Chong
Yan, Chunwei
Yang, Yi
Yu, Yang
Yu, Yinan
Zhang, Jian
Zhang, Ruiqing
Zhang, Weide
Zhao, Liang
Zhou, Jie

@ -5,7 +5,7 @@
# If any cblas implementation is found, the following variables will be set.
# CBLAS_PROVIDER # one of MKL, ATLAS, OPENBLAS, REFERENCE
# CBLAS_INC_DIR # the include directory for cblas.
# CBLAS_LIBS # a list of libraries that should be linked by paddle.
# # Each library should be the full path to an object file.
#
# User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT
@ -63,11 +63,11 @@ set(ATLAS_LIB_SEARCH_PATHS
/usr/lib/atlas
/usr/lib/atlas-base # special for ubuntu 14.04.
)
find_path(ATLAS_INC_DIR NAMES cblas.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
@ -76,11 +76,12 @@ if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND)
set(CBLAS_PROVIDER ATLAS)
set(CBLAS_INC_DIR ${ATLAS_INC_DIR})
set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
add_definitions(-DPADDLE_USE_ATLAS)
message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
set(CBLAS_FOUND ON)
if(ATLAS_CLAPACK_INC_DIR)
add_definitions(-DPADDLE_USE_LAPACK)
set(CBLAS_INC_DIR ${CBLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
endif()
return()
@ -124,7 +125,7 @@ endif()
## Then find the reference-cblas. www.netlib.org/blas/
set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
"Folder contains reference-cblas")
set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
${REFERENCE_CBLAS_ROOT}/include

@ -34,7 +34,7 @@ set(IGNORE_PATTERN
#
# first argument: target name to attach
# rest arguments: source list to check code style.
#
# NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing.
macro(add_style_check_target TARGET_NAME)
if(WITH_STYLE_CHECK)
@ -48,13 +48,17 @@ macro(add_style_check_target TARGET_NAME)
if(filename MATCHES ${pattern})
message(STATUS "DROP LINT ${filename}")
set(LINT OFF)
endif()
endforeach()
if(LINT MATCHES ON)
add_custom_command(TARGET ${TARGET_NAME}
get_filename_component(base_filename ${filename} NAME)
set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
add_custom_command(OUTPUT ${CUR_GEN}
PRE_BUILD
COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
"--filter=${STYLE_FILTER}" ${filename}
"--filter=${STYLE_FILTER}"
"--write-success=${CUR_GEN}" ${filename}
DEPENDS ${filename}
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
endif()
endforeach()

@ -1,11 +1,11 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -20,7 +20,7 @@ FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
IF(PROTOBUF_FOUND)
EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
IF (${PROTOBUF_VERSION} VERSION_LESS "3.1.0")
IF ("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0")
SET(PROTOBUF_FOUND OFF)
ENDIF()
ENDIF(PROTOBUF_FOUND)

@ -197,3 +197,4 @@ if(CUDA_ARCH)
endif()
set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})

@ -28,6 +28,11 @@ ELSE(WIN32)
STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}")
SET(MACOS_VERSION ${VERSION})
SET(HOST_SYSTEM "macosx")
IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
# Set cache variable - end user may change this during ccmake or cmake-gui configure.
SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
"Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
ENDIF()
ELSE(APPLE)
IF(EXISTS "/etc/issue")

@ -1,27 +1,40 @@
import gzip
import math
import paddle.v2 as paddle
dictsize = 1953
embsize = 32
hiddensize = 256
N = 5
def wordemb(inlayer):
wordemb = paddle.layer.table_projection(
wordemb = paddle.layer.embedding(
input=inlayer,
size=embsize,
param_attr=paddle.attr.Param(
name="_proj",
initial_std=0.001,
learning_rate=1,
l2_rate=0, ))
l2_rate=0,
sparse_update=True))
return wordemb
def main():
paddle.init(use_gpu=False, trainer_count=1)
# for local training
cluster_train = False
if not cluster_train:
paddle.init(use_gpu=False, trainer_count=1)
else:
paddle.init(
use_gpu=False,
trainer_count=2,
port=7164,
ports_num=1,
ports_num_for_sparse=1,
num_gradient_servers=1)
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
firstword = paddle.layer.data(
@ -57,6 +70,9 @@ def main():
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
'w') as f:
trainer.save_parameter_to_tar(f)
result = trainer.test(
paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), 32))
@ -65,11 +81,15 @@ def main():
result.metrics)
cost = paddle.layer.classification_cost(input=predictword, label=nextword)
parameters = paddle.parameters.create(cost)
adam_optimizer = paddle.optimizer.Adam(
adagrad = paddle.optimizer.AdaGrad(
learning_rate=3e-3,
regularization=paddle.optimizer.L2Regularization(8e-4))
trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
trainer = paddle.trainer.SGD(cost,
parameters,
adagrad,
is_local=not cluster_train)
trainer.train(
paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
num_passes=30,
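The event handler above now saves a parameter snapshot as `batch-<id>.tar.gz` every 100 batches. If the v2 API's `paddle.parameters.Parameters.from_tar` is available (as in other PaddlePaddle v2 examples), a later run could reload such a snapshot roughly like this hedged sketch (the file name is illustrative):
```python
import gzip
import paddle.v2 as paddle

# Reload a snapshot written by trainer.save_parameter_to_tar() above.
with gzip.open("batch-900.tar.gz", "r") as f:
    parameters = paddle.parameters.Parameters.from_tar(f)
```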

@ -498,6 +498,12 @@ hsigmoid
:members: hsigmoid
:noindex:
smooth_l1_cost
--------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: smooth_l1_cost
:noindex:
Check Layer
============

@ -419,6 +419,11 @@ hsigmoid
.. autoclass:: paddle.v2.layer.hsigmoid
:noindex:
smooth_l1_cost
--------------
.. autoclass:: paddle.v2.layer.smooth_l1_cost
:noindex:
Check Layer
============

@ -17,12 +17,16 @@ A training job will be created once user asks Paddle cloud to train a model. The
1. the *master process*, which dispatches tasks to
1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via
1. one or more *parameter server processes*, where each holds a shard of the global model.
1. one or more *parameter server processes*, where each holds a shard of the global model, receives the gradients uploaded by the *trainer processes*, and runs the optimization functions to update its shard of the parameters.
Their relation is illustrated in the following graph:
<img src="src/paddle-model-sharding.png"/>
By coordinating these processes, PaddlePaddle supports both Synchronous Stochastic Gradient Descent (sync SGD) and Asynchronous Stochastic Gradient Descent (async SGD) for training user-defined neural network topologies.
When training with sync SGD, the parameter servers wait for all trainers to finish sending their gradients and then send the updated parameters back; training cannot proceed until every trainer has received the updated parameters, which creates a synchronization point between trainers. When training with async SGD, each trainer uploads its gradients and downloads new parameters individually, without synchronizing with the other trainers. Async SGD is faster in terms of time per pass, but the gradients are noisier because trainers are likely to be working with a stale model.
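The contrast can be made concrete with a small, hypothetical sketch (plain NumPy, not the PaddlePaddle API): sync SGD applies one aggregated update per mini-batch after every trainer has reported, while async SGD applies each trainer's gradient the moment it arrives.
```python
import numpy as np

def sync_sgd_step(params, trainer_gradients, learning_rate=0.01):
    # Wait until every trainer has sent a gradient, then apply one aggregated update.
    aggregated = np.mean(trainer_gradients, axis=0)
    return params - learning_rate * aggregated

def async_sgd_step(params, gradient, learning_rate=0.01):
    # Apply an update as soon as a single trainer's gradient arrives.
    return params - learning_rate * gradient
```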
### Master Process
The master process will:
@ -31,7 +35,7 @@ The master process will:
- Keep track of training progress on the dataset with [task queue](#task-queue). A training job iterates over the dataset for a full pass before going into the next pass.
#### Task
A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size.
@ -78,7 +82,7 @@ The communication pattern between the trainers and the parameter servers depends
- Synchronous Stochastic Gradient Descent (sync-SGD)
The parameter server waits for all trainers to finish the n-th mini-batch calculation and send their gradients before broadcasting new parameters to every trainer. Every trainer waits for the new parameters before starting the (n+1)-th mini-batch.
- Asynchronous Stochastic Gradient Descent (async-SGD)
There is no synchronization between different trainers, and the parameter server updates its parameters as soon as it receives a new gradient:
@ -118,8 +122,6 @@ When the master is started by the Kubernetes, it executes the following steps at
1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update.
The master process will kill itself if its etcd lease expires.
When the master process is dead for any reason, Kubernetes will restart it. It will be online again with all its state recovered from etcd in a few minutes.
### Trainer Process
@ -132,6 +134,8 @@ When the trainer is started by the Kubernetes, it executes the following steps a
If the trainer's etcd lease expires, it will try to set the key `/trainer/<unique ID>` again so that the master process can discover the trainer again.
When a trainer fails, Kubernetes will try to restart it. The recovered trainer will fetch tasks from the TODO queue and continue training.
### Parameter Server Process
When the parameter server is started by Kubernetes, it executes the following steps at startup:
@ -140,11 +144,11 @@ When the parameter server is started by Kubernetes, it executes the following st
1. Search through etcd keys `/ps/<index>` (`/ps/0`, `/ps/1`, ...) to find the first non-existent key whose index is smaller than the total number of parameter servers. Set the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name (a sketch of this registration step appears after this list).
The desired number of parameter servers is 3:
<img src="src/paddle-ps-0.png"/>
The third parameter server joined:
<img src="src/paddle-ps-1.png"/>
1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index).
@ -153,6 +157,13 @@ When the parameter server is started by Kubernetes, it executes the following st
If the parameter server's etcd lease expires, the parameter server will kill itself.
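The registration step above (claiming the first free `/ps/<index>` key) can be sketched with a hypothetical helper built on the python-etcd3 client; the design does not prescribe a client library, and lease handling is omitted here.
```python
import etcd3

def claim_pserver_index(desired_pserver_count, pserver_addr):
    client = etcd3.client()
    for index in range(desired_pserver_count):
        key = "/ps/%d" % index
        # Transaction: write the key only if it does not exist yet, so two
        # parameter servers cannot claim the same index concurrently.
        claimed, _ = client.transaction(
            compare=[client.transactions.version(key) == 0],
            success=[client.transactions.put(key, pserver_addr)],
            failure=[])
        if claimed:
            return index  # the parameter server's index is inferred from the key name
    raise RuntimeError("all parameter server slots are taken")
```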
## Parameter Server Checkpointing
See [here](./checkpointing.md)
## Storing and dispatching training data
See [here](./data_dispatch.md)
## Dynamic Scaling
### Trainer Scaling

@ -0,0 +1,44 @@
## Model parameter checkpointing
Checkpointing the model parameters protects a training job against the failure of one or more parameter servers. A checkpoint periodically writes a complete image of the model data held in each parameter server's memory to disk, so that training can be restarted from an intermediate state. For a training job that cannot be interrupted and has no backup, fault tolerance is achieved by periodically saving a snapshot of each parameter server's data to a ***distributed storage service***, for example keeping the latest snapshot every 10 minutes and deleting earlier ones. When a single node fails, it is enough to restore that node, or migrate it to another node and restart it, to resume the training job.
<img src="src/checkpointing.png" width="500"/>
### Design of snapshot saving
Notes:
* After a parameter server starts in the cluster, it automatically mounts the distributed storage directory and saves its snapshots under that directory.
* ***Note: each parameter server saves its checkpoint independently. For now we do not attempt to have multiple parameter servers cooperatively save a global checkpoint at one specific point in time, because doing so still cannot eliminate randomness.***
The checkpoint-saving procedure is as follows (a sketch of this flow appears after the list):
1. When the "every 10 minutes" condition is met, the parameter server acquires the `read_lock` on the in-memory parameters and starts a new thread to save the checkpoint. If a checkpoint-saving thread is already running, the request is ignored. Because updating the parameters requires the `write_lock` on the parameter memory, the parameter server pauses parameter updates and waits while the snapshot is being written.
2. The parameter server generates a UUID and writes the snapshot data into a new file, named after this UUID, in the designated directory. After the snapshot is written, it computes the file's MD5 sum and then writes the JSON content `{"uuid": [UUID], "md5": "MD5 sum", "timestamp": xxxx}` to `/checkpoints/[pserver_id]` in etcd.
3. Delete the snapshot files in the on-disk directory whose names do not match the current UUID.
4. Release the lock on the parameter memory and stop the checkpoint-saving thread.
Note that in a real environment the training job may saturate the network bandwidth between trainers and parameter servers; if a parameter server also has to reach the distributed storage over the network to save a snapshot, the resulting congestion can make the job stall periodically.
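A minimal sketch of the snapshot-saving flow, in Python. The names `params_lock`, `serialize_params`, and the etcd client wrapper are hypothetical placeholders; the design only fixes the on-disk layout and the etcd key format.
```python
import hashlib
import json
import os
import time
import uuid

def save_checkpoint(pserver_id, params, params_lock, snapshot_dir, etcd_client):
    with params_lock.read_lock():             # parameter updates pause while this is held
        snapshot_uuid = str(uuid.uuid4())
        path = os.path.join(snapshot_dir, snapshot_uuid)
        blob = serialize_params(params)       # hypothetical serialization helper
        with open(path, "wb") as f:
            f.write(blob)
        md5 = hashlib.md5(blob).hexdigest()
        # Record the newest snapshot in etcd so that recovery knows which file to load.
        etcd_client.put("/checkpoints/%s" % pserver_id,
                        json.dumps({"uuid": snapshot_uuid,
                                    "md5": md5,
                                    "timestamp": int(time.time())}))
        # Keep only the snapshot that was just written.
        for name in os.listdir(snapshot_dir):
            if name != snapshot_uuid:
                os.remove(os.path.join(snapshot_dir, name))
```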
### Recovering from a snapshot
When a parameter server starts for the first time, or is restarted by Kubernetes after a failure, it needs to roll back to the last checkpoint (a sketch follows this list):
1. Read the node `/checkpoints/[pserver_id]` from etcd to obtain the UUID of the latest checkpoint file.
1. Load the checkpoint snapshot file named after that UUID from disk and load the parameters from it.
1. If either of the two steps above fails, initialize the parameters with the initialization method defined by the start-up arguments.
1. Start serving.
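A matching sketch of the recovery path; `deserialize_params` and `init_params_from_config` are hypothetical placeholders for the inverse of the serialization helper and for the initialization method given in the start-up arguments.
```python
import hashlib
import json
import os

def restore_checkpoint(pserver_id, snapshot_dir, etcd_client, init_params_from_config):
    try:
        value, _ = etcd_client.get("/checkpoints/%s" % pserver_id)
        meta = json.loads(value)
        path = os.path.join(snapshot_dir, meta["uuid"])
        with open(path, "rb") as f:
            blob = f.read()
        if hashlib.md5(blob).hexdigest() != meta["md5"]:
            raise IOError("checkpoint %s is corrupted" % path)
        return deserialize_params(blob)       # hypothetical, inverse of serialize_params
    except Exception:
        # Fall back to the initialization method defined by the start-up arguments.
        return init_params_from_config()
```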
## TODO List
### Speculative / accelerated execution (TODO)
In a heterogeneous cluster, some trainers may run too slowly and drag down the whole cluster (Trainer 1 in the figure). In that case the master is responsible for starting a new trainer (Accelerate Trainer 2) that trains on the same data block. Whichever trainer finishes the block first wins, and the slower one is killed.
### Dynamic scale-out / scale-in
For now we only consider dynamically scaling out the number of trainers, which keeps the system simpler.
## Glossary
* model: all the parameters obtained from deep learning training; with this neural network one can make predictions on new data.
* parameters: the parameters of a neural network, including the weights w and the biases b. A neural network model consists of a large number of parameters.
* shard: one of the pieces that a whole is split into.
* model shard: the parameters of a neural network split into multiple pieces, with each shard stored on one parameter server.
* parameter block: multiple parameter blocks make up one model shard.
* single point of failure: at any moment at most one server fails. Since the probability of two machines in the cluster failing at the same time is extremely low ((mean failure rate * mean time to repair)^2), tolerating two or more simultaneous failures is only considered for special online systems.

@ -0,0 +1,120 @@
## Storage and dispatching of training data
### Workflow overview
Training datasets in production environments are usually large and stored on distributed storage such as Hadoop HDFS, Ceph, or AWS S3. These storage services typically split the data into multiple shards and distribute them across many nodes, which makes it possible to run several kinds of data-oriented jobs in the cloud, including:
* data preprocessing jobs
* Paddle training jobs
* online model inference services
<img src="src/paddle-cloud-in-data-center.png" width="500"/>
The figure above shows the data flow of a real production application (face recognition). Production log data is stored either as a real-time stream (Kafka) or as offline data (HDFS). Multiple distributed data-processing jobs run in the cluster, such as streaming processing (online data process) and offline batch processing (offline data process), to preprocess the data and feed it to Paddle as training data. Users can also upload labeled data to the distributed storage to supplement the training data. The model produced by the deep learning training running on Paddle is then served to the online face recognition application.
### Storage of training data
We choose CephFS as the storage service for the training data.
Different computing frameworks running on Kubernetes can mount storage into each container through a Volume or PersistentVolume.
A public directory in the CephFS storage system holds some preloaded public datasets (e.g. MNIST, BOW, ImageNet) that submitted jobs can use directly.
### File preprocessing
Before a dataset can be used for training, its files must be converted into the internal storage format of the PaddlePaddle cluster (SSTable). We provide two ways to convert:
- a library for local conversion, so users can write a program to do the conversion themselves;
- users can upload their own dataset and run a MapReduce job on the cluster to do the conversion.
The generated file names have the following format:
```text
name_prefix-aaaaa-of-bbbbb
```
"aaaaa" and "bbbbb" are both five-digit numbers. Each file is one shard of the dataset; "aaaaa" is the index of the shard and "bbbbb" is the maximum shard index.
For example, the ImageNet dataset might be split into 1000 shards, with file names:
```text
imagenet-00000-of-00999
imagenet-00001-of-00999
...
imagenet-00999-of-00999
```
#### Conversion library
For both local and cloud conversion we provide a Python conversion library, whose interface is:
```python
def convert(output_path, reader, num_shards, name_prefix)
```
- `output_path`: directory in which output files will be saved.
- `reader`: a [data reader](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#data-reader-interface), from which the convert program will read data instances.
- `num_shards`: the number of shards that the dataset will be partitioned into.
- `name_prefix`: the name prefix of generated files.
`reader` yields one data instance at a time; an instance can be a single value, or multiple values represented as a tuple:
```python
yield 1 # a single value
yield numpy.random.uniform(-1, 1, size=28*28) # a single value
yield numpy.random.uniform(-1, 1, size=28*28), 0 # multiple values
```
Each value can be an integer, a float, a string, a list of these types, or a numpy.ndarray. Values of any other type are serialized into a string with Pickle.
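A minimal sketch of what such a converter could look like, assuming each shard is written as a plain file of pickled instances; the real converter writes the cluster's internal SSTable format instead.
```python
import pickle

def convert(output_path, reader, num_shards, name_prefix):
    max_index = num_shards - 1                   # the "bbbbb" part of the file name
    shards = [[] for _ in range(num_shards)]
    for i, instance in enumerate(reader()):
        shards[i % num_shards].append(instance)  # round-robin partitioning
    for index, instances in enumerate(shards):
        name = "%s/%s-%05d-of-%05d" % (output_path, name_prefix, index, max_index)
        with open(name, "wb") as f:
            pickle.dump(instances, f)
```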
### Example programs
#### Using the conversion library
The `reader` produced by the following `reader_creator` yields one data instance at a time; each instance contains two values, a numpy.ndarray and an integer:
```python
import numpy

def reader_creator():
    def reader():
        for i in range(1000):
            yield numpy.random.uniform(-1, 1, size=28*28), 0  # multiple values
    return reader
```
Passing the `reader` produced by `reader_creator` to the `convert` function completes the conversion:
```python
convert("./", reader_creator(), 100, "random_images")
```
The call above generates 100 files in the current directory:
```text
random_images-00000-of-00099
random_images-00001-of-00099
...
random_images-00099-of-00099
```
#### Training
PaddlePaddle provides a dedicated [data reader creator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#python-data-reader-design-doc) that produces the data reader for a given set of SSTable files. **The reader is used in exactly the same way locally and in the cloud:**
```python
# ...
reader = paddle.reader.creator.SSTable("/home/random_images-*-of-*")
batch_reader = paddle.batch(reader, 128)
trainer.train(batch_reader, ...)
```
The data instances produced by this reader are exactly the same as those produced by the reader used when the dataset was generated.
### Uploading training files
The following command uploads local data to the storage cluster:
```bash
paddle cp filenames pfs://home/folder/
```
For example, to upload the random_images dataset converted in the earlier example to `/home/` in the cloud:
```bash
paddle cp random_images-*-of-* pfs://home/
```
## TODO
### Support user-defined data preprocessing jobs
