diff --git a/.copyright.hook b/.copyright.hook
new file mode 100644
index 0000000000..09afff2072
--- /dev/null
+++ b/.copyright.hook
@@ -0,0 +1,121 @@
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import io, re
+import sys, os
+import subprocess
+import platform
+
+COPYRIGHT = '''
+ Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+LANG_COMMENT_MARK = None
+
+NEW_LINE_MARK = None
+
+COPYRIGHT_HEADER = None
+
+if platform.system() == "Windows":
+ NEW_LINE_MARK = "\r\n"
+else:
+ NEW_LINE_MARK = '\n'
+ COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
+    p = re.search(r'(\d{4})', COPYRIGHT_HEADER).group(0)
+ process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
+ date, err = process.communicate()
+ date = date.decode("utf-8").rstrip("\n")
+ COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
+
+
+def generate_copyright(template, lang='C'):
+ if lang == 'Python':
+ LANG_COMMENT_MARK = '#'
+ else:
+ LANG_COMMENT_MARK = "//"
+
+ lines = template.split(NEW_LINE_MARK)
+ BLANK = " "
+ ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
+ for lino, line in enumerate(lines):
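+        # Skip the leading blank line, the header line (already emitted above with the updated year), and the trailing blank line.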
+ if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
+ if len(line) == 0:
+ BLANK = ""
+ else:
+ BLANK = " "
+ ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
+
+ return ans + "\n"
+
+
+def lang_type(filename):
+ if filename.endswith(".py"):
+ return "Python"
+ elif filename.endswith(".h"):
+ return "C"
+ elif filename.endswith(".c"):
+ return "C"
+ elif filename.endswith(".hpp"):
+ return "C"
+ elif filename.endswith(".cc"):
+ return "C"
+ elif filename.endswith(".cpp"):
+ return "C"
+ elif filename.endswith(".cu"):
+ return "C"
+ elif filename.endswith(".cuh"):
+ return "C"
+ elif filename.endswith(".go"):
+ return "C"
+ elif filename.endswith(".proto"):
+ return "C"
+ else:
+        print("Unsupported filetype %s" % filename)
+ exit(0)
+
+
+PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
+
+
+def main(argv=None):
+ parser = argparse.ArgumentParser(
+ description='Checker for copyright declaration.')
+ parser.add_argument('filenames', nargs='*', help='Filenames to check')
+ args = parser.parse_args(argv)
+
+ retv = 0
+ for filename in args.filenames:
+ fd = io.open(filename, encoding="utf-8")
+ first_line = fd.readline()
+ second_line = fd.readline()
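+        # Skip files that already contain a copyright notice, or that start with a shebang or an encoding declaration.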
+ if "COPYRIGHT (C)" in first_line.upper(): continue
+ if first_line.startswith("#!") or PYTHON_ENCODE.match(
+ second_line) != None or PYTHON_ENCODE.match(first_line) != None:
+ continue
+ original_contents = io.open(filename, encoding="utf-8").read()
+ new_contents = generate_copyright(
+ COPYRIGHT, lang_type(filename)) + original_contents
+ print('Auto Insert Copyright Header {}'.format(filename))
+ retv = 1
+        with io.open(filename, 'w', encoding="utf-8") as output_file:
+ output_file.write(new_contents)
+
+ return retv
+
+
+if __name__ == '__main__':
+ exit(main())
diff --git a/.gitignore b/.gitignore
index ac56a3320e..2badc3bdaa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,9 @@
+paddle/operators/check_t.save
+paddle/operators/check_tensor.ls
+paddle/operators/tensor.save
+python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
+python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
+python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
*.DS_Store
build/
build_doc/
@@ -21,11 +27,10 @@ third_party/
cmake-build-*
# generated while compiling
-python/paddle/v2/fluid/core.so
paddle/pybind/pybind.h
CMakeFiles
cmake_install.cmake
paddle/.timestamp
python/paddlepaddle.egg-info/
-paddle/pybind/pybind.h
+paddle/fluid/pybind/pybind.h
python/paddle/version.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 59661c9c1d..89c620bb2f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -31,3 +31,11 @@
- id: go-fmt
types:
- go
+- repo: local
+ hooks:
+ - id: copyright_checker
+ name: copyright_checker
+ entry: python ./.copyright.hook
+ language: system
+ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
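+    # the exclude pattern below is intended to skip files under third_party and the book examples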
+ exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
diff --git a/.travis.yml b/.travis.yml
index e2d49daa19..bf6a41d13c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,13 +4,17 @@ cache:
- $HOME/.ccache
- $HOME/.cache/pip
- $TRAVIS_BUILD_DIR/build/third_party
+ - $TRAVIS_BUILD_DIR/build_android/third_party
sudo: required
dist: trusty
+services:
+ - docker
os:
- linux
env:
- JOB=build_doc
- JOB=check_style
+ - JOB=build_android
addons:
apt:
packages:
@@ -41,8 +45,10 @@ before_install:
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
- |
- timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
- RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
+ # 43min timeout
+ if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android;
+ else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi;
+ RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
- |
if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
@@ -50,7 +56,7 @@ script:
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
export DOCS_DIR=`pwd`
cd ..
- curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc
+ curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
notifications:
email:
on_success: change
diff --git a/AUTHORS.md b/AUTHORS.md
index 4db4a4a8e7..9c6821d9f8 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -1,8 +1,9 @@
| Github account | name |
|---|---|
+| abhinavarora | Abhinav Arora |
| backyes | Yan-Fei Wang |
| beckett1124 | Bin Qi |
-| Canpio | Jia-Yi Feng |
+| JiayiFeng | Jia-Yi Feng |
| chengxiaohua1105 | Xiao-Hua Cheng |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| cxysteven | Xing-Yi Cheng |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b309ff37e5..1e11f86d0e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -16,14 +16,14 @@ cmake_minimum_required(VERSION 3.0)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
-SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
include(system)
-project(paddle CXX C Go)
-message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION})
-message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION})
+project(paddle CXX C)
+message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
+ "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
+message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
+ "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
find_package(Sphinx)
if(NOT CMAKE_CROSSCOMPILING)
@@ -31,18 +31,16 @@ if(NOT CMAKE_CROSSCOMPILING)
endif(NOT CMAKE_CROSSCOMPILING)
find_package(Git REQUIRED)
find_package(Threads REQUIRED)
-if(NOT ANDROID AND NOT IOS)
- find_package(Boost QUIET)
-endif()
include(simd)
################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
+option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
+option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
@@ -55,12 +53,15 @@ option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
+# TODO: Compile only the PaddlePaddle fluid version when the WITH_FLUID option is set.
+option(WITH_FLUID "Compile PaddlePaddle fluid only (TODO)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
+option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
@@ -107,6 +108,10 @@ if (WITH_C_API AND WITH_PYTHON)
"different Python interpreter from compiling.")
endif()
+if (WITH_C_API)
+  set(WITH_FLUID OFF CACHE STRING "Disable installing fluid when compiling the C-API" FORCE)
+endif()
+
if(MOBILE_INFERENCE)
set(THIRD_PARTY_BUILD_TYPE MinSizeRel)
else()
@@ -134,14 +139,17 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc
+include(external/boost) # download boost
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
-include(external/nccl)
include(external/cares)
include(external/grpc)
+include(external/snappy) # download snappy
+include(external/snappystream)
include(cudnn) # set cudnn libraries, must before configure
+include(cupti)
include(configure) # add paddle env configuration
include(generic) # simplify cmake module
include(package) # set paddle packages
@@ -152,27 +160,32 @@ include(rdma) # set rdma libraries
include(flags) # set paddle compile flags
include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage
+include(inference_lib) # add paddle fluid inference libraries
include_directories("${PADDLE_SOURCE_DIR}")
include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
-include_directories(${Boost_INCLUDE_DIRS})
set(EXTERNAL_LIBS
- ${GFLAGS_LIBRARIES}
- ${GLOG_LIBRARIES}
+ gflags
+ glog
${CBLAS_LIBRARIES}
- ${PROTOBUF_LIBRARY}
- ${ZLIB_LIBRARIES}
+ protobuf
+ zlib
${PYTHON_LIBRARIES}
)
if(WITH_GPU)
- include(cuda)
+ include(cuda)
endif(WITH_GPU)
+if(WITH_AMD_GPU)
+ find_package(HIP)
+ include(hip)
+endif(WITH_AMD_GPU)
+
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
@@ -197,13 +210,18 @@ endif()
# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
# placed after this block, because they depends on it.
if(WITH_GOLANG)
+ enable_language(Go)
add_subdirectory(go)
endif(WITH_GOLANG)
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
+
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+
add_subdirectory(paddle)
if(WITH_PYTHON)
- add_subdirectory(python)
+ add_subdirectory(python)
endif()
if(WITH_DOC)
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000..54131b48ec
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,46 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at paddle-dev@baidu.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
+
+[homepage]: http://contributor-covenant.org
+[version]: http://contributor-covenant.org/version/1/4/
diff --git a/CODE_OF_CONDUCT_cn.md b/CODE_OF_CONDUCT_cn.md
new file mode 100644
index 0000000000..2be794f1f3
--- /dev/null
+++ b/CODE_OF_CONDUCT_cn.md
@@ -0,0 +1,50 @@
+# 参与者公约
+
+## 我们的保证
+
+为了促进一个开放透明且友好的环境,我们作为贡献者和维护者保证:无论年龄、种族、民族、性别认同和表达(方式)、体型、身体健全与否、经验水平、国籍、个人表现、宗教或性别取向,参与者在我们项目和社区中都免于骚扰。
+
+## 我们的标准
+
+有助于创造正面环境的行为包括但不限于:
+* 使用友好和包容性语言
+* 尊重不同的观点和经历
+* 耐心地接受建设性批评
+* 关注对社区最有利的事情
+* 友善对待其他社区成员
+
+身为参与者不能接受的行为包括但不限于:
+* 使用与性有关的言语或是图像,以及不受欢迎的性骚扰
+* 捣乱/煽动/造谣的行为或进行侮辱/贬损的评论,人身攻击及政治攻击
+* 公开或私下的骚扰
+* 未经许可地发布他人的个人资料,例如住址或是电子地址
+* 其他可以被合理地认定为不恰当或者违反职业操守的行为
+
+## 我们的责任
+
+项目维护者有责任为「可接受的行为」标准做出诠释,以及对已发生的不被接受的行为采取恰当且公平的纠正措施。
+
+项目维护者有权利及责任去删除、编辑、拒绝与本行为标准有所违背的评论(comments)、提交(commits)、代码、wiki 编辑、问题(issues)和其他贡献,以及项目维护者可暂时或永久性的禁止任何他们认为有不适当、威胁、冒犯、有害行为的贡献者。
+
+## 使用范围
+
+当一个人代表该项目或是其社区时,本行为标准适用于其项目平台和公共平台。
+
+代表项目或是社区的情况,举例来说包括使用官方项目的电子邮件地址、通过官方的社区媒体账号发布或线上或线下事件中担任指定代表。
+
+该项目的呈现方式可由其项目维护者进行进一步的定义及解释。
+
+## 强制执行
+
+可以通过paddle-dev@baidu.com,来联系项目团队来举报滥用、骚扰或其他不被接受的行为。
+
+任何维护团队认为有必要且适合的所有投诉都将进行审查及调查,并做出相对应的回应。项目小组有对事件回报者有保密的义务。具体执行的方针近一步细节可能会单独公布。
+
+没有切实地遵守或是执行本行为标准的项目维护人员,可能会因项目领导人或是其他成员的决定,暂时或是永久地取消其参与资格。
+
+## 来源
+
+本行为标准改编自[贡献者公约][主页],版本 1.4
+可在此观看https://www.contributor-covenant.org/zh-cn/version/1/4/code-of-conduct.html
+
+[主页]: https://www.contributor-covenant.org
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a60453ff4e..3c36cffcb4 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,5 +1,8 @@
# Contribute Code
+You are welcome to contribute to PaddlePaddle. To contribute, you have to agree with the
+[PaddlePaddle Contributor License Agreement](https://gist.github.com/wangkuiyi/0c22c7b1bd3bb7eb27d76f85c3a3e329).
+
We sincerely appreciate your contribution. This document explains our workflow and work style.
## Workflow
diff --git a/Dockerfile b/Dockerfile
index 857d3f3e5f..fbec88c796 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -22,12 +22,13 @@ COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \
apt-get install -y \
- git python-pip python-dev openssh-server bison libnccl-dev \
+ git python-pip python-dev openssh-server bison \
+ libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format swig doxygen cmake \
- liblapack-dev liblapacke-dev libboost-dev \
+ liblapack-dev liblapacke-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools libtool && \
apt-get clean -y
@@ -52,10 +53,14 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
# version util jupyter fixes this issue.
+
+# specify the sphinx version as 1.5.6 and remove the -U option from [pip install -U
+# sphinx-rtd-theme], since -U would update sphinx to the newest version
+# (1.7.1 for now), which breaks the documentation build.
RUN pip install --upgrade pip && \
pip install -U wheel && \
- pip install -U docopt PyYAML sphinx && \
- pip install -U sphinx-rtd-theme==0.1.9 recommonmark
+ pip install -U docopt PyYAML sphinx==1.5.6 && \
+ pip install sphinx-rtd-theme==0.1.9 recommonmark
RUN pip install pre-commit 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
diff --git a/Dockerfile.android b/Dockerfile.android
index 9d13a414f6..cc022d596b 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -21,16 +21,6 @@ RUN apt-get update && \
wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
apt-get clean -y
-# Install Go and glide
-RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
- tar -xz -C /usr/local && \
- mkdir /root/gopath && \
- mkdir /root/gopath/bin && \
- mkdir /root/gopath/src
-ENV GOROOT=/usr/local/go GOPATH=/root/gopath
-# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
-ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
-
# git credential to skip password typing
RUN git config --global credential.helper store
diff --git a/LICENSE b/LICENSE
index e77bd090ee..5fe86943b3 100644
--- a/LICENSE
+++ b/LICENSE
@@ -188,7 +188,7 @@ Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
same "printed page" as the copyright notice for easier
identification within third-party archives.
- Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
diff --git a/README.md b/README.md
index db0fbd88b2..d06375a444 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
[](https://travis-ci.org/PaddlePaddle/Paddle)
-[](http://doc.paddlepaddle.org/develop/doc/)
-[](http://doc.paddlepaddle.org/develop/doc_cn/)
+[](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
+[](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
[](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
[](https://github.com/PaddlePaddle/Paddle/releases)
[](LICENSE)
@@ -36,7 +36,8 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
examples:
- Optimized math operations through SSE/AVX intrinsics, BLAS libraries
- (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
+ (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels.
+ - Optimized CNN networks through MKL-DNN library.
- Highly optimized recurrent networks which can handle **variable-length**
sequence without padding.
- Optimized local and distributed training for models with high dimensional
@@ -61,32 +62,32 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
## Installation
It is recommended to check out the
-[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html)
before looking into the
-[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html).
+[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/build_from_source_en.html).
## Documentation
-We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
-[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation.
+We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and
+[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation.
-- [Deep Learning 101](http://book.paddlepaddle.org/index.html)
+- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
-- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html)
You can run distributed training jobs on MPI clusters.
-- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html)
You can also run distributed training jobs on Kubernetes clusters.
-- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html)
+- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html)
Our new API enables much shorter programs.
-- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html)
We appreciate your contributions!
diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
index 8ee7fd28c5..8b7dc5b7db 100644
--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -7,11 +7,11 @@ Machine:
System: CentOS release 6.3 (Final), Docker 1.12.1.
-PaddlePaddle: (TODO: will rerun after 0.11.0)
-- paddlepaddle/paddle:latest (for MKLML and MKL-DNN)
+PaddlePaddle:
+- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN)
- MKL-DNN tag v0.11
- MKLML 2018.0.1.20171007
-- paddlepaddle/paddle:latest-openblas (for OpenBLAS)
+- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS)
- OpenBLAS v0.2.20
On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
@@ -22,6 +22,7 @@ On each machine, we will test and compare the performance of training on single
#### Training
Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+Note that the speed below includes forward, backward and parameter update time, so it cannot be compared directly with the benchmark of the Caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which only contains forward and backward. The parameter update time becomes very heavy when the weights are large, especially for AlexNet.
Input image size - 3 * 224 * 224, Time: images/second
@@ -55,33 +56,57 @@ Input image size - 3 * 224 * 224, Time: images/second
+- AlexNet
+
+| BatchSize | 64 | 128 | 256 |
+|--------------|--------| ------ | -------|
+| OpenBLAS | 45.62 | 72.79 | 107.22 |
+| MKLML | 66.37 | 105.60 | 144.04 |
+| MKL-DNN | 399.00 | 498.94 | 626.53 |
+
+
+
#### Inference
Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
- VGG-19
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|-------|-------|-------|-------|-------|
-| OpenBLAS | 1.07 | 1.08 | 1.06 | 0.88 | 0.65 |
+| OpenBLAS | 1.10 | 1.96 | 3.62 | 3.63 | 2.25 |
| MKLML | 5.58 | 9.80 | 15.15 | 21.21 | 28.67 |
| MKL-DNN | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
+
+
- ResNet-50
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|-------|--------|--------|--------|--------|
-| OpenBLAS | 3.35 | 3.19 | 3.09 | 2.55 | 1.96 |
+| OpenBLAS | 3.31 | 6.72 | 11.59 | 13.17 | 9.27 |
| MKLML | 6.33 | 12.02 | 22.88 | 40.53 | 63.09 |
| MKL-DNN | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
+
- GoogLeNet
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|--------|--------|--------|--------|--------|
-| OpenBLAS | 12.04 | 11.31 | 10.00 | 9.07 | 4.34 |
+| OpenBLAS | 12.06 | 23.56 | 34.48 | 36.45 | 23.12 |
| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 |
| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
+
+
+- AlexNet
+
+| BatchSize | 1 | 2 | 4 | 8 | 16 |
+|-----------|--------|--------|--------|--------|--------|
+| OpenBLAS | 3.53 | 6.23 | 15.04 | 26.06 | 31.62 |
+| MKLML | 21.32 | 36.55 | 73.06 | 131.15 | 192.77 |
+| MKL-DNN | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
+
+
### Laptop
TBD
diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
new file mode 100644
index 0000000000..b619613ea7
--- /dev/null
+++ b/benchmark/cluster/README.md
@@ -0,0 +1,78 @@
+# Cluster Training Benchmark
+
+## Setup
+
+- Platform
+ - Kubernetes: v1.6.2
+ - Linux Kernel: v3.10.0
+
+- Resource
+ - CPU: 10 Cores per Pod
+ - Memory: 5GB per Pod
+
+- Docker Image
+
+  We use different base Docker images to run the benchmark on Kubernetes:
+ - PaddlePaddle v2: paddlepaddle/paddle:0.11.0
+ - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
+ - TensorFlow: tensorflow/tensorflow:1.5.0-rc0
+
+- Model
+ vgg16 is used in this benchmark.
+
+## Cases
+
+- Variable
+ - Batch Size of training data.
+ - PServer count of the training job.
+ - The number of trainers.
+
+- Invariant
+ - The resource of trainer/pserver Pod.
+
+### Measure the Performance for Different Batch Size
+
+- PServer Count: 40
+- Trainer Count: 100
+- Metrics: mini-batch / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - |
+| TensorFlow | - | - | - | - |
+
+### Measure the Performance for Different PServer Count
+
+- Trainer Count: 100
+- Batch Size: 64
+- Metrics: mini-batch / sec
+
+| PServer Count | 10 | 20 | 40 | 60 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - |
+| TensorFlow | - | - | - | - |
+
+### Measure Parallel Efficiency By Increasing Trainer Count
+
+- PServer Count: 20
+- Batch Size: 64
+- Metrics:
+
+$S = \frac{T_1}{T_N}$
+
+where $S$ is the speedup: the ratio of $T_1$, the training time with 1 trainer, to $T_N$, the training time with $N$ trainers (a small sketch of this computation is given after the table below).
+The parallel efficiency is:
+
+$E = \frac{S}{N}$
+
+| Trainer Count | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
+| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - |
+| TensorFlow | - | - | - | - | - | - | - | - | - | - | - |
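+
+As a rough illustration (a minimal sketch, not part of the benchmark tooling; the timings in the
+usage line below are made-up numbers), the speedup and parallel efficiency can be computed from
+measured training times like this:
+
+```python
+def speedup_and_efficiency(t1, tn, n):
+    """Return (S, E) given training time t1 with 1 trainer and tn with n trainers."""
+    s = t1 / tn  # speedup S = T1 / TN
+    e = s / n    # parallel efficiency E = S / N
+    return s, e
+
+
+# Hypothetical example: one pass takes 1000s with 1 trainer and 80s with 20 trainers.
+print(speedup_and_efficiency(1000.0, 80.0, 20))  # -> (12.5, 0.625)
+```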
+
+## Reproduce the benchmark
+
+TODO
diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
new file mode 100644
index 0000000000..13ad8e1b62
--- /dev/null
+++ b/benchmark/cluster/vgg16/Dockerfile
@@ -0,0 +1,35 @@
+FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
+
+# you can get mirror list here:
+# https://launchpad.net/ubuntu/+archivemirrors
+ARG UBUNTU_MIRROR
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
+RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
+RUN pip install -U kubernetes opencv-python
+
+RUN pip install paddlepaddle
+# if the network is slow, you may need to add a proxy here.
+# ENV https_proxy=
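+# Pre-download the cifar10 training data into the image so benchmark jobs do not need to fetch it at run time.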
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
+RUN pip uninstall -y paddlepaddle
+# unset the proxy if it was set.
+# ENV https_proxy=""
+
+# NOTE: CI-built wheel packages turn WITH_DISTRIBUTE=OFF by default,
+# so we must build one with distributed support to install in this image.
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl
+ENV LD_LIBRARY_PATH=/usr/local/lib
+
+# tf k8s
+RUN pip install tensorflow==1.4.0
+ADD tf_k8s /usr/bin
+RUN chmod +x /usr/bin/tf_k8s
+ADD vgg16_tf.py /workspace/
+
+# below lines may change a lot for debugging
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+RUN chmod +x /usr/bin/paddle_k8s
+ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
new file mode 100644
index 0000000000..cd681a1a28
--- /dev/null
+++ b/benchmark/cluster/vgg16/README.md
@@ -0,0 +1,77 @@
+# Performance for Distributed vgg16
+
+## Test Result
+
+### Hardware Information
+
+- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
+- cpu MHz : 2101.000
+- cache size : 20480 KB
+
+### BLAS Settings
+
+Set the environment variable `MKL_NUM_THREADS=1`.
+
+### Single Node Single Thread
+
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
+| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
+| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
+
+### Different Batch Size
+
+- PServer Count: 10
+- Trainer Count: 20
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
+| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
+| TensorFlow | - | - | - | - |
+
+
+### Accelerate Rate
+
+- PServer Count: 20
+- Batch Size: 128
+- Metrics: samples / sec
+
+| Trainer Count | 20 | 40 | 80 | 100 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
+| PaddlePaddle v2 (needs more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
+| TensorFlow | - | - | - | - |
+
+### Different PServer Count
+
+- Trainer Count: 60
+- Batch Size: 128
+- Metrics: samples/ sec
+
+| PServer Count | 3 | 6 |10 | 20 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid (to be fixed in the next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
+| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
+| TensorFlow | - | - | - | - |
+
+*The performance gap between Fluid and v2 comes from network interference.*
+
+
+## Steps to Run the Performance Test
+
+1. You must re-compile PaddlePaddle with `-DWITH_DISTRIBUTE=ON` to build it with distributed support.
+1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory.
+1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so Kubernetes can find it.
+1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your Kubernetes cluster (you must configure the `kubectl` client before this step).
+1. Run `kubectl get po` to list the running pods, and run `kubectl logs [podID]` to fetch the logs of the pservers and trainers.
+
+Check the logs for the distributed training progress and analyze the performance.
+
+## Enable Verbose Logs
+
+Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
new file mode 100644
index 0000000000..ee8b0763b6
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid_pserver.yaml
@@ -0,0 +1,72 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+ name: vgg16job-pserver
+spec:
+ replicas: 10
+ template:
+ metadata:
+ labels:
+ paddle-job-pserver: vgg16job
+ spec:
+ hostNetwork: true
+ imagePullSecrets:
+ - name: job-registry-secret
+ containers:
+ - name: pserver
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+ imagePullPolicy: Always
+ ports:
+ - name: jobport-30236
+ containerPort: 30236
+ env:
+ - name: PADDLE_JOB_NAME
+ value: vgg16job
+ - name: MKL_NUM_THREADS
+ value: "1"
+ - name: TRAINING_ROLE
+ value: "PSERVER"
+ - name: TRAINERS
+ value: "20"
+ - name: PSERVERS
+ value: "10"
+ - name: TOPOLOGY
+ value: ""
+ - name: ENTRY
+ value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: PADDLE_INIT_PORT
+ value: "30236"
+ - name: PADDLE_INIT_NICS
+ value: "xgbe0"
+ - name: PADDLE_INIT_TRAINER_COUNT
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+ value: "1"
+ - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+ value: "20"
+ - name: PADDLE_INIT_NUM_PASSES
+ value: "1"
+ - name: PADDLE_INIT_USE_GPU
+ value: "0"
+ - name: LD_LIBRARY_PATH
+ value: "/usr/local/lib:/usr/local/nvidia/lib64"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ - name: POD_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: "status.podIP"
+ command: ["paddle_k8s", "start_fluid"]
+ resources:
+ requests:
+ memory: 10Gi
+ cpu: 4
+ limits:
+ memory: 10Gi
+ cpu: 4
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
new file mode 100644
index 0000000000..3d56caac00
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@@ -0,0 +1,69 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: vgg16job-trainer
+spec:
+ parallelism: 20
+ completions: 20
+ template:
+ metadata:
+ labels:
+ paddle-job: vgg16job
+ spec:
+ imagePullSecrets:
+ - name: job-registry-secret
+ hostNetwork: true
+ containers:
+ - name: trainer
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+ imagePullPolicy: Always
+ command: ["paddle_k8s", "start_fluid"]
+ env:
+ - name: PADDLE_JOB_NAME
+ value: vgg16job
+ - name: TRAINING_ROLE
+ value: "TRAINER"
+ - name: TRAINERS
+ value: "20"
+ - name: PSERVERS
+ value: "10"
+ - name: TOPOLOGY
+ value: ""
+ - name: ENTRY
+ value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: PADDLE_INIT_PORT
+ value: "30236"
+ - name: PADDLE_INIT_NICS
+ value: "xgbe0"
+ - name: PADDLE_INIT_TRAINER_COUNT
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+ value: "1"
+ - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+ value: "20"
+ - name: PADDLE_INIT_NUM_PASSES
+ value: "1"
+ - name: PADDLE_INIT_USE_GPU
+ value: "0"
+ - name: LD_LIBRARY_PATH
+ value: "/usr/local/lib:/usr/local/nvidia/lib64"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ - name: POD_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: "status.podIP"
+ resources:
+ requests:
+ memory: 40Gi
+ cpu: 2
+ limits:
+ memory: 40Gi
+ cpu: 2
+ restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/tf_k8s b/benchmark/cluster/vgg16/tf_k8s
new file mode 100644
index 0000000000..4fc263d5f6
--- /dev/null
+++ b/benchmark/cluster/vgg16/tf_k8s
@@ -0,0 +1,82 @@
+#!/bin/bash
+check_trainer_ret() {
+ ret=$1
+ stdbuf -oL echo "job returned $ret...setting pod return message..."
+ stdbuf -oL echo "==============================="
+
+ if [ $ret -eq 136 ] ; then
+ echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
+ elif [ $ret -eq 139 ] ; then
+ echo "Segmentation Fault" > /dev/termination-log
+ elif [ $ret -eq 1 ] ; then
+ echo "General Error" > /dev/termination-log
+ elif [ $ret -eq 134 ] ; then
+ echo "Program Abort" > /dev/termination-log
+ fi
+  stdbuf -oL echo "termination log written..."
+ exit $ret
+}
+
+g_pservers=""
+g_trainers=""
+
+wait_running_pods(){
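+  # block until the expected numbers of pserver and trainer pods are running, then collect their endpoints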
+ pserver_label="tf-job-pserver=${JOB_NAME}"
+ trainer_label="tf-job-trainer=${JOB_NAME}"
+
+ stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
+ stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
+
+ g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
+ g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
+}
+
+start_tf_pserver(){
+ wait_running_pods
+
+ label="tf-job-pserver=${JOB_NAME}"
+ pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
+
+ cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
+ --job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
+
+ stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
+}
+
+start_tf_trainer(){
+ wait_running_pods
+
+ label="tf-job-trainer=${JOB_NAME}"
+ trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
+
+ cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
+ --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
+
+ stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
+ check_trainer_ret $?
+}
+
+start_tf(){
+ if [[ "${TF_JOB_NAME}" == "worker" ]]; then
+ start_tf_trainer
+ else
+ start_tf_pserver
+ fi
+}
+
+usage() {
+ echo "usage: tf_k8s []:"
+ echo " start_tf Start tensorflow jobs"
+}
+
+case "$1" in
+ start_tf)
+ start_tf
+ ;;
+ --help)
+ usage
+ ;;
+ *)
+ usage
+ ;;
+esac
diff --git a/benchmark/cluster/vgg16/tf_pserver.yaml b/benchmark/cluster/vgg16/tf_pserver.yaml
new file mode 100644
index 0000000000..5e37c70081
--- /dev/null
+++ b/benchmark/cluster/vgg16/tf_pserver.yaml
@@ -0,0 +1,56 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+ name: vgg16job-tf-pserver
+spec:
+ replicas: 10
+ template:
+ metadata:
+ labels:
+ tf-job-pserver: vgg16job-tf
+ spec:
+ hostNetwork: true
+ imagePullSecrets:
+ - name: job-registry-secret
+ containers:
+ - name: pserver
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
+ imagePullPolicy: Always
+ command: ["tf_k8s", "start_tf"]
+ ports:
+ - name: jobport-30236
+ containerPort: 30236
+ env:
+ - name: PORT
+ value: "32036"
+ - name: ENTRY
+ value: "python vgg16_tf.py"
+ - name: JOB_NAME
+ value: vgg16job-tf
+ - name: PSERVERS_NUM
+ value: "10"
+ - name: TF_JOB_NAME
+ value: "ps"
+ - name: TRAINERS_NUM
+ value: "20"
+ - name: BATCH_SIZE
+ value: "128"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: NUM_PASSES
+ value: "1"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ - name: POD_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: "status.podIP"
+ resources:
+ requests:
+ memory: 10Gi
+ cpu: 4
+ limits:
+ memory: 10Gi
+ cpu: 4
diff --git a/benchmark/cluster/vgg16/tf_trainer.yaml b/benchmark/cluster/vgg16/tf_trainer.yaml
new file mode 100644
index 0000000000..08795df3ad
--- /dev/null
+++ b/benchmark/cluster/vgg16/tf_trainer.yaml
@@ -0,0 +1,58 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: vgg16job-tf-trainer
+spec:
+ parallelism: 20
+ completions: 20
+ template:
+ metadata:
+ labels:
+ tf-job-trainer: vgg16job-tf
+ spec:
+ imagePullSecrets:
+ - name: job-registry-secret
+ hostNetwork: true
+ containers:
+ - name: trainer
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
+ imagePullPolicy: Always
+ command: ["tf_k8s", "start_tf"]
+ ports:
+ - name: jobport-30236
+ containerPort: 30236
+ env:
+ - name: PORT
+ value: "32036"
+ - name: JOB_NAME
+ value: vgg16job-tf
+ - name: TF_JOB_NAME
+ value: "worker"
+ - name: ENTRY
+ value: "python vgg16_tf.py"
+ - name: PSERVERS_NUM
+ value: "10"
+ - name: BATCH_SIZE
+ value: "128"
+ - name: TRAINERS_NUM
+ value: "20"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: NUM_PASSES
+ value: "1"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ - name: POD_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: "status.podIP"
+ resources:
+ requests:
+ memory: 40Gi
+ cpu: 2
+ limits:
+ memory: 40Gi
+ cpu: 2
+ restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
new file mode 100644
index 0000000000..dd1271e0cf
--- /dev/null
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
@@ -0,0 +1,64 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+ name: vgg16v2job-pserver
+spec:
+ replicas: 10
+ template:
+ metadata:
+ labels:
+ paddle-job-pserver: vgg16v2job
+ spec:
+ hostNetwork: true
+ imagePullSecrets:
+ - name: job-registry-secret
+ containers:
+ - name: pserver
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+ imagePullPolicy: Always
+ ports:
+ - name: jobport-30236
+ containerPort: 30236
+ env:
+ - name: PADDLE_JOB_NAME
+ value: vgg16v2job
+ - name: TRAINERS
+ value: "20"
+ - name: PSERVERS
+ value: "10"
+ - name: TOPOLOGY
+ value: ""
+ - name: ENTRY
+ value: "python train.py"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: PADDLE_INIT_PORT
+ value: "30236"
+ - name: PADDLE_INIT_NICS
+ value: "xgbe0"
+ - name: PADDLE_INIT_TRAINER_COUNT
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+ value: "1"
+ - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+ value: "20"
+ - name: PADDLE_INIT_NUM_PASSES
+ value: "1"
+ - name: PADDLE_INIT_USE_GPU
+ value: "0"
+ - name: LD_LIBRARY_PATH
+ value: "/usr/local/lib:/usr/local/nvidia/lib64"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ command: ["paddle_k8s", "start_pserver"]
+ resources:
+ requests:
+ memory: 10Gi
+ cpu: 4
+ limits:
+ memory: 10Gi
+ cpu: 4
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
new file mode 100644
index 0000000000..12c8964066
--- /dev/null
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@@ -0,0 +1,65 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: vgg16v2job-trainer
+spec:
+ parallelism: 20
+ completions: 20
+ template:
+ metadata:
+ labels:
+ paddle-job: vgg16v2job
+ spec:
+ imagePullSecrets:
+ - name: job-registry-secret
+ hostNetwork: true
+ containers:
+ - name: trainer
+ image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+ imagePullPolicy: Always
+ command: ["paddle_k8s", "start_trainer", "v2"]
+ env:
+ - name: PADDLE_JOB_NAME
+ value: vgg16v2job
+ - name: BATCH_SIZE
+ value: "256"
+ - name: TRAINERS
+ value: "20"
+ - name: PSERVERS
+ value: "10"
+ - name: TOPOLOGY
+ value: ""
+ - name: ENTRY
+ value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
+ - name: TRAINER_PACKAGE
+ value: "/workspace"
+ - name: PADDLE_INIT_PORT
+ value: "30236"
+ - name: PADDLE_INIT_NICS
+ value: "xgbe0"
+ - name: PADDLE_INIT_TRAINER_COUNT
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM
+ value: "1"
+ - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+ value: "1"
+ - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+ value: "20"
+ - name: PADDLE_INIT_NUM_PASSES
+ value: "2"
+ - name: PADDLE_INIT_USE_GPU
+ value: "0"
+ - name: LD_LIBRARY_PATH
+ value: "/usr/local/lib:/usr/local/nvidia/lib64"
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.namespace"
+ resources:
+ requests:
+ memory: 40Gi
+ cpu: 2
+ limits:
+ memory: 40Gi
+ cpu: 2
+ restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
new file mode 100644
index 0000000000..8b29227cfa
--- /dev/null
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -0,0 +1,293 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+import argparse
+import functools
+import os
+from paddle.fluid import debuger
+
+
+def str2bool(v):
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
+ return True
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+ return False
+ else:
+ raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+ '--learning_rate',
+ type=float,
+ default=1e-3,
+ help="Learning rate for training.")
+parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+ '--device',
+ type=str,
+ default='CPU',
+ choices=['CPU', 'GPU'],
+ help="The device type.")
+parser.add_argument('--device_id', type=int, default=0, help="The device id.")
+parser.add_argument(
+ '--data_format',
+ type=str,
+ default='NCHW',
+ choices=['NCHW', 'NHWC'],
+    help='The data order; only NCHW is supported now.')
+parser.add_argument(
+ '--data_set',
+ type=str,
+ default='cifar10',
+ choices=['cifar10', 'flowers'],
+ help='Optional dataset for benchmark.')
+parser.add_argument(
+ '--local',
+ type=str2bool,
+ default=True,
+ help='Whether to run as local mode.')
+
+parser.add_argument(
+ "--ps_hosts",
+ type=str,
+ default="",
+ help="Comma-separated list of hostname:port pairs")
+parser.add_argument(
+ "--trainer_hosts",
+ type=str,
+ default="",
+ help="Comma-separated list of hostname:port pairs")
+
+# Flags for defining the tf.train.Server
+parser.add_argument(
+ "--task_index", type=int, default=0, help="Index of task within the job")
+args = parser.parse_args()
+
+
+def vgg16_bn_drop(input):
+ def conv_block(input, num_filter, groups, dropouts):
+ return fluid.nets.img_conv_group(
+ input=input,
+ pool_size=2,
+ pool_stride=2,
+ conv_num_filter=[num_filter] * groups,
+ conv_filter_size=3,
+ conv_act='relu',
+ conv_with_batchnorm=True,
+ conv_batchnorm_drop_rate=dropouts,
+ pool_type='max')
+
+ conv1 = conv_block(input, 64, 2, [0.3, 0])
+ conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+ conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+ conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+ conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+ drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+ fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
+ bn = fluid.layers.batch_norm(input=fc1, act='relu')
+ drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+ fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
+ return fc2
+
+
+def main():
+ if args.data_set == "cifar10":
+ classdim = 10
+ if args.data_format == 'NCHW':
+ data_shape = [3, 32, 32]
+ else:
+ data_shape = [32, 32, 3]
+ else:
+ classdim = 102
+ if args.data_format == 'NCHW':
+ data_shape = [3, 224, 224]
+ else:
+ data_shape = [224, 224, 3]
+
+ # Input data
+ images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+ # Train program
+ net = vgg16_bn_drop(images)
+ predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ # Evaluator
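+    # batch_acc holds the per-batch accuracy; batch_size receives the number of samples in the batch and is used as the weight when averaging accuracy over a pass.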
+ batch_size = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(
+ input=predict, label=label, total=batch_size)
+
+ # inference program
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(batch_acc)
+
+ # Optimization
+ optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+ optimize_ops, params_grads = optimizer.minimize(avg_cost)
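+    # optimize_ops and params_grads are kept so the distribute transpiler below can split them between pservers and trainers in non-local mode.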
+
+ # Initialize executor
+ place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
+ args.device_id)
+ exe = fluid.Executor(place)
+
+ # test
+ def test(exe):
+ test_pass_acc = fluid.average.WeightedAverage()
+ for batch_id, data in enumerate(test_reader()):
+ img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+ data)).astype("float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ outs = exe.run(inference_program,
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[batch_acc, batch_size])
+ test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
+
+ return test_pass_acc.eval()
+
+ def train_loop(exe, trainer_prog):
+ iters = 0
+ ts = time.time()
+ train_pass_acc = fluid.average.WeightedAverage()
+ for pass_id in range(args.num_passes):
+ # train
+ start_time = time.time()
+ num_samples = 0
+ train_pass_acc.reset()
+ for batch_id, data in enumerate(train_reader()):
+ ts = time.time()
+ img_data = np.array(
+ map(lambda x: x[0].reshape(data_shape), data)).astype(
+ "float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ loss, acc, b_size = exe.run(
+ trainer_prog,
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[avg_cost, batch_acc, batch_size])
+ iters += 1
+ num_samples += len(data)
+ train_pass_acc.add(value=acc, weight=b_size)
+ print(
+ "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
+ % (pass_id, iters, loss, acc,
+ len(data) / (time.time() - ts))
+                )  # The accuracy is accumulated over the batches of the pass so far, not just the current batch.
+
+ pass_elapsed = time.time() - start_time
+ pass_train_acc = train_pass_acc.eval()
+ pass_test_acc = test(exe)
+ print(
+ "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
+ % (pass_id, num_samples / pass_elapsed, pass_train_acc,
+ pass_test_acc))
+
+ if args.local:
+ # Parameter initialization
+ exe.run(fluid.default_startup_program())
+
+ # data reader
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+ else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+ batch_size=args.batch_size)
+ train_loop(exe, fluid.default_main_program())
+ else:
+ trainers = int(os.getenv("TRAINERS")) # total trainer count
+ print("trainers total: ", trainers)
+
+ training_role = os.getenv(
+ "TRAINING_ROLE",
+ "TRAINER") # get the training role: trainer/pserver
+
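+        # The transpiler rewrites the single-node program into separate pserver and trainer programs; TRAINING_ROLE selects which one this process runs below.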
+ t = fluid.DistributeTranspiler()
+ t.transpile(
+ optimize_ops,
+ params_grads,
+ trainer_id=args.task_index,
+ pservers=args.ps_hosts,
+ trainers=trainers)
+
+ if training_role == "PSERVER":
+ current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
+ "PADDLE_INIT_PORT")
+ if not current_endpoint:
+                print("need env POD_IP and PADDLE_INIT_PORT")
+ exit(1)
+ pserver_prog = t.get_pserver_program(current_endpoint)
+ pserver_startup = t.get_startup_program(current_endpoint,
+ pserver_prog)
+ exe.run(pserver_startup)
+ exe.run(pserver_prog)
+ elif training_role == "TRAINER":
+ # Parameter initialization
+ exe.run(fluid.default_startup_program())
+
+ # data reader
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+ else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
+ paddle.dataset.flowers.test(),
+ batch_size=args.batch_size)
+
+ trainer_prog = t.get_trainer_program()
+ feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+ # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
+ exe.run(fluid.default_startup_program())
+ train_loop(exe, trainer_prog)
+ else:
+        print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
+
+
+def print_arguments():
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+if __name__ == "__main__":
+ print_arguments()
+ main()
diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py
new file mode 100644
index 0000000000..2d220478ac
--- /dev/null
+++ b/benchmark/cluster/vgg16/vgg16_tf.py
@@ -0,0 +1,366 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in TensorFlow
+You can get distribution example template structure here:
+https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
+https://www.tensorflow.org/deploy/distributed
+"""
+
+import tensorflow as tf
+import paddle.v2 as paddle
+import numpy as np
+import argparse
+import time
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+ '--learning_rate',
+ type=float,
+ default=1e-3,
+ help="Learning rate for training.")
+parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+ '--device',
+ type=str,
+ default='CPU',
+ choices=['CPU', 'GPU'],
+ help="The device type.")
+parser.add_argument(
+ '--data_format',
+ type=str,
+ default='NHWC',
+ choices=['NCHW', 'NHWC'],
+    help='The data order, NCHW=[batch, channels, height, width]. '
+    'Only NHWC is supported right now.')
+parser.add_argument(
+ '--data_set',
+ type=str,
+ default='cifar10',
+ choices=['cifar10', 'flowers'],
+ help='Optional dataset for benchmark.')
+
+parser.add_argument(
+ "--ps_hosts",
+ type=str,
+ default="",
+ help="Comma-separated list of hostname:port pairs")
+parser.add_argument(
+ "--worker_hosts",
+ type=str,
+ default="",
+ help="Comma-separated list of hostname:port pairs")
+parser.add_argument(
+ "--job_name", type=str, default="", help="One of 'worker', 'ps'")
+# Flags for defining the tf.train.Server
+parser.add_argument(
+ "--task_index", type=int, default=0, help="Index of task within the job")
+
+args = parser.parse_args()
+
+
+class VGG16Model(object):
+ def __init__(self):
+ self.parameters = []
+
+ def batch_norm_relu(self, inputs, is_training):
+ """Performs a batch normalization followed by a ReLU."""
+ # We set fused=True for a significant speed boost. See
+ # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
+ inputs = tf.layers.batch_normalization(
+ inputs=inputs,
+ axis=1 if args.data_format == 'NCHW' else -1,
+ momentum=0.9,
+ epsilon=1e-05,
+ center=True,
+ scale=True,
+ training=is_training,
+ fused=True)
+ inputs = tf.nn.relu(inputs)
+ return inputs
+
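+    # Convolution with bias, followed by batch norm, ReLU, and optional dropout.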
+ def conv_bn_layer(self,
+ name,
+ images,
+ kernel_shape,
+ is_training,
+ drop_rate=0.0):
+ with tf.name_scope(name) as scope:
+ kernel = tf.Variable(
+ tf.truncated_normal(
+ kernel_shape, dtype=tf.float32, stddev=1e-1),
+ name='weights')
+ conv = tf.nn.conv2d(
+ images,
+ kernel, [1, 1, 1, 1],
+ data_format=args.data_format,
+ padding='SAME')
+ biases = tf.Variable(
+ tf.constant(
+ 0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
+ trainable=True,
+ name='biases')
+ out = tf.nn.bias_add(conv, biases)
+ out = self.batch_norm_relu(out, is_training)
+ out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
+ return out
+
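+    # Plain fully connected layer: matmul plus bias, with no activation applied.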
+ def fc_layer(self, name, inputs, shape):
+ with tf.name_scope(name) as scope:
+ fc_w = tf.Variable(
+ tf.truncated_normal(
+ shape, dtype=tf.float32, stddev=1e-1),
+ name='weights')
+ fc_b = tf.Variable(
+ tf.constant(
+ 0.0, shape=[shape[-1]], dtype=tf.float32),
+ trainable=True,
+ name='biases')
+ out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
+ return out
+
+ def network(self, images, class_dim, is_training):
+ """ VGG16 model structure.
+
+ TODO(kuke): enable this network to support the 'NCHW' data format
+ """
+
+ # conv1
+ conv1_1 = self.conv_bn_layer(
+ 'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
+ conv1_2 = self.conv_bn_layer(
+ 'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
+ # pool1
+ pool1 = tf.nn.max_pool(
+ conv1_2,
+ ksize=[1, 2, 2, 1],
+ strides=[1, 2, 2, 1],
+ padding='SAME',
+ name='pool1')
+ # conv2
+ conv2_1 = self.conv_bn_layer(
+ 'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
+ conv2_2 = self.conv_bn_layer(
+ 'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
+ # pool2
+ pool2 = tf.nn.max_pool(
+ conv2_2,
+ ksize=[1, 2, 2, 1],
+ strides=[1, 2, 2, 1],
+ padding='SAME',
+ name='pool2')
+ # conv3
+ conv3_1 = self.conv_bn_layer(
+ 'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
+ conv3_2 = self.conv_bn_layer(
+ 'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
+ conv3_3 = self.conv_bn_layer(
+ 'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
+ # pool3
+ pool3 = tf.nn.max_pool(
+ conv3_3,
+ ksize=[1, 2, 2, 1],
+ strides=[1, 2, 2, 1],
+ padding='SAME',
+ name='pool3')
+ # conv4
+ conv4_1 = self.conv_bn_layer(
+ 'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
+ conv4_2 = self.conv_bn_layer(
+ 'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+ conv4_3 = self.conv_bn_layer(
+ 'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+ # pool4
+ pool4 = tf.nn.max_pool(
+ conv4_3,
+ ksize=[1, 2, 2, 1],
+ strides=[1, 2, 2, 1],
+ padding='SAME',
+ name='pool4')
+ # conv5
+ conv5_1 = self.conv_bn_layer(
+ 'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
+ conv5_2 = self.conv_bn_layer(
+ 'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+ conv5_3 = self.conv_bn_layer(
+ 'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+ # pool5
+ pool5 = tf.nn.max_pool(
+ conv5_3,
+ ksize=[1, 2, 2, 1],
+ strides=[1, 2, 2, 1],
+ padding='SAME',
+            name='pool5')
+ # flatten
+ shape = int(np.prod(pool5.get_shape()[1:]))
+ pool5_flat = tf.reshape(pool5, [-1, shape])
+ # fc1
+ drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
+ fc1 = self.fc_layer('fc1', drop, [shape, 512])
+ # fc2
+ bn = self.batch_norm_relu(fc1, is_training)
+ drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
+ fc2 = self.fc_layer('fc2', drop, [512, 512])
+
+ fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
+
+ return fc3
+
+
+def run_benchmark(cluster_spec, server):
+ """Run benchmark on cifar10 or flowers."""
+
+ if args.data_set == "cifar10":
+ class_dim = 10
+ raw_shape = (3, 32, 32)
+ dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
+ None, 3, 32, 32)
+ else:
+ class_dim = 102
+ raw_shape = (3, 224, 224)
+ dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
+ None, 3, 224, 224)
+
+ device = tf.train.replica_device_setter(
+ worker_device="/job:worker/task:{}".format(args.task_index),
+ cluster=cluster_spec)
+
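+    # replica_device_setter pins variables to the ps tasks and keeps ops on this worker.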
+ with tf.device(device):
+ images = tf.placeholder(tf.float32, shape=dat_shape)
+ labels = tf.placeholder(tf.int64, shape=(None, ))
+ is_training = tf.placeholder('bool')
+ onehot_labels = tf.one_hot(labels, depth=class_dim)
+
+ vgg16 = VGG16Model()
+ logits = vgg16.network(images, class_dim, is_training)
+ loss = tf.losses.softmax_cross_entropy(
+ onehot_labels=onehot_labels, logits=logits)
+ avg_loss = tf.reduce_mean(loss)
+
+ correct = tf.equal(tf.argmax(logits, 1), labels)
+ accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+ optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+ update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+ global_step = tf.Variable(0, name='global_step', trainable=False)
+ with tf.control_dependencies(update_ops):
+ train_op = optimizer.minimize(avg_loss, global_step=global_step)
+
+ summary_op = tf.summary.merge_all()
+ init_op = tf.global_variables_initializer()
+
+ # data reader
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.test10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+
+ # test
+ def test():
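+        # Evaluate accuracy over the full test set (images are transposed to NHWC when needed).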
+ test_accs = []
+ for batch_id, data in enumerate(test_reader()):
+ test_images = np.array(
+ map(lambda x: np.transpose(x[0].reshape(raw_shape),
+ axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
+ test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
+ test_accs.append(
+ accuracy.eval(feed_dict={
+ images: test_images,
+ labels: test_labels,
+ is_training: False
+ }))
+ return np.mean(test_accs)
+
+ config = tf.ConfigProto(
+ intra_op_parallelism_threads=1,
+ inter_op_parallelism_threads=1,
+ log_device_placement=True)
+ config.gpu_options.allow_growth = True
+
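+    # StopAtStepHook requests a stop once the global step reaches last_step.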
+ hooks = [tf.train.StopAtStepHook(last_step=1000000)]
+
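+    # The chief worker (task_index 0) initializes variables and handles checkpoints for the session.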
+ with tf.train.MonitoredTrainingSession(
+ master=server.target,
+ is_chief=(args.task_index == 0),
+ hooks=hooks,
+ config=config) as sess:
+ iters, num_samples, start_time = 0, 0, 0.0
+ for pass_id in range(args.num_passes):
+ # train
+ num_samples = 0
+ start_time = time.time()
+ for batch_id, data in enumerate(train_reader()):
+ train_images = np.array(
+ map(lambda x: np.transpose(x[0].reshape(raw_shape),
+ axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
+ train_labels = np.array(map(lambda x: x[1], data)).astype(
+ 'int64')
+ iter_begin_time = time.time()
+ _, loss, acc = sess.run([train_op, avg_loss, accuracy],
+ feed_dict={
+ images: train_images,
+ labels: train_labels,
+ is_training: True
+ })
+ iters += 1
+ print(
+ "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
+ % (pass_id, iters, loss, acc,
+ len(data) / (time.time() - iter_begin_time)))
+ num_samples += len(data)
+ train_elapsed = time.time() - start_time
+ # test
+ pass_test_acc = test()
+ print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
+ (pass_id, num_samples / train_elapsed, pass_test_acc))
+
+
+def print_arguments():
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+ print_arguments()
+
+ ps_hosts = args.ps_hosts.split(",")
+ worker_hosts = args.worker_hosts.split(",")
+
+ # Create a cluster from the parameter server and worker hosts.
+ cluster_spec = tf.train.ClusterSpec({
+ "ps": ps_hosts,
+ "worker": worker_hosts
+ })
+
+ # Create and start a server for the local task.
+ server = tf.train.Server(
+ cluster_spec, job_name=args.job_name, task_index=args.task_index)
+
+ if args.job_name == "ps":
+ print("start pserver")
+ server.join()
+ elif args.job_name == "worker":
+ print("start worker")
+ run_benchmark(cluster_spec, server)
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
new file mode 100644
index 0000000000..1a66af32d7
--- /dev/null
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gzip
+
+import paddle.v2.dataset.cifar as cifar
+import paddle.v2 as paddle
+import time
+import os
+
+DATA_DIM = 3 * 32 * 32
+CLASS_DIM = 10
+BATCH_SIZE = os.getenv("BATCH_SIZE")
+if BATCH_SIZE:
+ BATCH_SIZE = int(BATCH_SIZE)
+else:
+ BATCH_SIZE = 128
+print "batch_size", BATCH_SIZE
+NODE_COUNT = int(os.getenv("TRAINERS", "1"))
+ts = 0
+
+
+def vgg(input, nums, class_dim):
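+    # nums lists how many conv layers are stacked in each of the five conv stages.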
+ def conv_block(input, num_filter, groups, num_channels=None):
+ return paddle.networks.img_conv_group(
+ input=input,
+ num_channels=num_channels,
+ pool_size=2,
+ pool_stride=2,
+ conv_num_filter=[num_filter] * groups,
+ conv_filter_size=3,
+ conv_act=paddle.activation.Relu(),
+ pool_type=paddle.pooling.Max())
+
+ assert len(nums) == 5
+ # the channel of input feature is 3
+ conv1 = conv_block(input, 64, nums[0], 3)
+ conv2 = conv_block(conv1, 128, nums[1])
+ conv3 = conv_block(conv2, 256, nums[2])
+ conv4 = conv_block(conv3, 512, nums[3])
+ conv5 = conv_block(conv4, 512, nums[4])
+
+ fc_dim = 512
+ fc1 = paddle.layer.fc(input=conv5,
+ size=fc_dim,
+ act=paddle.activation.Relu(),
+ layer_attr=paddle.attr.Extra(drop_rate=0.5))
+ fc2 = paddle.layer.fc(input=fc1,
+ size=fc_dim,
+ act=paddle.activation.Relu(),
+ layer_attr=paddle.attr.Extra(drop_rate=0.5))
+ out = paddle.layer.fc(input=fc2,
+ size=class_dim,
+ act=paddle.activation.Softmax())
+ return out
+
+
+def vgg13(input, class_dim):
+ nums = [2, 2, 2, 2, 2]
+ return vgg(input, nums, class_dim)
+
+
+def vgg16(input, class_dim):
+ nums = [2, 2, 3, 3, 3]
+ return vgg(input, nums, class_dim)
+
+
+def vgg19(input, class_dim):
+ nums = [2, 2, 4, 4, 4]
+ return vgg(input, nums, class_dim)
+
+
+def main():
+ global ts
+ paddle.init(use_gpu=False)
+ image = paddle.layer.data(
+ name="image", type=paddle.data_type.dense_vector(DATA_DIM))
+ lbl = paddle.layer.data(
+ name="label", type=paddle.data_type.integer_value(CLASS_DIM))
+
+ extra_layers = None
+    # NOTE: v2 distributed training averages updates across trainers, so divide the learning rate by the trainer count.
+ learning_rate = 1e-3 / NODE_COUNT
+ out = vgg16(image, class_dim=CLASS_DIM)
+ cost = paddle.layer.classification_cost(input=out, label=lbl)
+
+ # Create parameters
+ parameters = paddle.parameters.create(cost)
+
+ # Create optimizer
+ optimizer = paddle.optimizer.Momentum(
+ momentum=0.9,
+ regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
+ BATCH_SIZE),
+ learning_rate=learning_rate / BATCH_SIZE,
+ learning_rate_decay_a=0.1,
+ learning_rate_decay_b=128000 * 35,
+ learning_rate_schedule="discexp", )
+
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ cifar.train10(),
+ # To use other data, replace the above line with:
+ # reader.train_reader('train.list'),
+ buf_size=1000),
+ batch_size=BATCH_SIZE)
+ test_reader = paddle.batch(
+ cifar.test10(),
+ # To use other data, replace the above line with:
+ # reader.test_reader('val.list'),
+ batch_size=BATCH_SIZE)
+
+ # Create trainer
+ trainer = paddle.trainer.SGD(cost=cost,
+ parameters=parameters,
+ update_equation=optimizer,
+ extra_layers=extra_layers,
+ is_local=False)
+
+ # End batch and end pass event handler
+ def event_handler(event):
+ global ts, ts_pass
+ if isinstance(event, paddle.event.BeginPass):
+ ts_pass = time.time()
+ if isinstance(event, paddle.event.BeginIteration):
+ ts = time.time()
+ if isinstance(event, paddle.event.EndIteration):
+ if event.batch_id % 1 == 0:
+ print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
+ event.pass_id, event.batch_id, event.cost, event.metrics,
+ time.time() - ts)
+ if isinstance(event, paddle.event.EndPass):
+ print "Pass %d end, spent: %f" % (event.pass_id,
+ time.time() - ts_pass)
+ result = trainer.test(reader=test_reader)
+ print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+
+ trainer.train(
+ reader=train_reader, num_passes=200, event_handler=event_handler)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/benchmark/figs/alexnet-cpu-infer.png b/benchmark/figs/alexnet-cpu-infer.png
new file mode 100644
index 0000000000..6215ae4e42
Binary files /dev/null and b/benchmark/figs/alexnet-cpu-infer.png differ
diff --git a/benchmark/figs/alexnet-cpu-train.png b/benchmark/figs/alexnet-cpu-train.png
new file mode 100644
index 0000000000..b3200bbc04
Binary files /dev/null and b/benchmark/figs/alexnet-cpu-train.png differ
diff --git a/benchmark/figs/googlenet-cpu-infer.png b/benchmark/figs/googlenet-cpu-infer.png
new file mode 100644
index 0000000000..19478d433b
Binary files /dev/null and b/benchmark/figs/googlenet-cpu-infer.png differ
diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png
index c3f67faf09..4e86e058d0 100644
Binary files a/benchmark/figs/googlenet-cpu-train.png and b/benchmark/figs/googlenet-cpu-train.png differ
diff --git a/benchmark/figs/resnet-cpu-infer.png b/benchmark/figs/resnet-cpu-infer.png
new file mode 100644
index 0000000000..bc43d4b8d2
Binary files /dev/null and b/benchmark/figs/resnet-cpu-infer.png differ
diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png
index b96ecd5ff9..96746b1759 100644
Binary files a/benchmark/figs/resnet-cpu-train.png and b/benchmark/figs/resnet-cpu-train.png differ
diff --git a/benchmark/figs/vgg-cpu-infer.png b/benchmark/figs/vgg-cpu-infer.png
new file mode 100644
index 0000000000..3a51ec6c47
Binary files /dev/null and b/benchmark/figs/vgg-cpu-infer.png differ
diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png
index f830ca6a87..6d548cfd59 100644
Binary files a/benchmark/figs/vgg-cpu-train.png and b/benchmark/figs/vgg-cpu-train.png differ
diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py
index 3358d43a4b..9efc3f0494 100644
--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
@@ -1,4 +1,16 @@
-#!/usr/bin/env python
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
from paddle.trainer_config_helpers import *
@@ -6,10 +18,24 @@ height = 227
width = 227
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
+gp = get_config_arg('layer_num', int, 1)
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+ 'height': height,
+ 'width': width,
+ 'color': True,
+ 'num_class': num_class,
+ 'is_infer': is_infer,
+ 'num_samples': num_samples
+}
define_py_data_sources2(
- "train.list", None, module="provider", obj="process", args=args)
+ "train.list" if not is_infer else None,
+ "test.list" if is_infer else None,
+ module="provider",
+ obj="process",
+ args=args)
settings(
batch_size=batch_size,
@@ -31,7 +57,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)
# conv2
net = img_conv_layer(
- input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+ input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2)
@@ -40,11 +66,11 @@ net = img_conv_layer(
input=net, filter_size=3, num_filters=384, stride=1, padding=1)
# conv4
net = img_conv_layer(
- input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+ input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
# conv5
net = img_conv_layer(
- input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+ input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
net = img_pool_layer(input=net, pool_size=3, stride=2)
net = fc_layer(
@@ -59,6 +85,9 @@ net = fc_layer(
layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
-lab = data_layer('label', num_class)
-loss = cross_entropy(input=net, label=lab)
-outputs(loss)
+if is_infer:
+ outputs(net)
+else:
+ lab = data_layer('label', num_class)
+ loss = cross_entropy(input=net, label=lab)
+ outputs(loss)
diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py
index 7059c13bd2..2a850ccb7f 100644
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@@ -7,13 +7,15 @@ num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
use_gpu = get_config_arg('use_gpu', bool, True)
is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
- 'is_infer': is_infer
+ 'is_infer': is_infer,
+ 'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
diff --git a/benchmark/paddle/image/plotlog.py b/benchmark/paddle/image/plotlog.py
new file mode 100644
index 0000000000..8679d4f272
--- /dev/null
+++ b/benchmark/paddle/image/plotlog.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import argparse
+import matplotlib.pyplot as plt
+
+
+def parse_args():
+ parser = argparse.ArgumentParser('Parse Log')
+ parser.add_argument(
+ '--file_path', '-f', type=str, help='the path of the log file')
+ parser.add_argument(
+ '--sample_rate',
+ '-s',
+ type=float,
+ default=1.0,
+ help='the rate to take samples from log')
+ parser.add_argument(
+ '--log_period', '-p', type=int, default=1, help='the period of log')
+
+ args = parser.parse_args()
+ return args
+
+
+def parse_file(file_name):
+ loss = []
+ error = []
+ with open(file_name) as f:
+ for i, line in enumerate(f):
+ line = line.strip()
+ if not line.startswith('pass'):
+ continue
+ line_split = line.split(' ')
+ if len(line_split) != 5:
+ continue
+
+ loss_str = line_split[2][:-1]
+ cur_loss = float(loss_str.split('=')[-1])
+ loss.append(cur_loss)
+
+ err_str = line_split[3][:-1]
+ cur_err = float(err_str.split('=')[-1])
+ error.append(cur_err)
+
+ accuracy = [1.0 - err for err in error]
+
+ return loss, accuracy
+
+
+def sample(metric, sample_rate):
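+    # Down-sample by keeping every interval-th point, where interval = int(1 / sample_rate).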
+ interval = int(1.0 / sample_rate)
+ if interval > len(metric):
+ return metric[:1]
+
+    num = len(metric) // interval
+ idx = [interval * i for i in range(num)]
+ metric_sample = [metric[id] for id in idx]
+ return metric_sample
+
+
+def plot_metric(metric,
+ batch_id,
+ graph_title,
+ line_style='b-',
+ line_label='y',
+ line_num=1):
+ plt.figure()
+ plt.title(graph_title)
+ if line_num == 1:
+ plt.plot(batch_id, metric, line_style, label=line_label)
+ else:
+ for i in range(line_num):
+ plt.plot(batch_id, metric[i], line_style[i], label=line_label[i])
+ plt.xlabel('batch')
+ plt.ylabel(graph_title)
+ plt.legend()
+ plt.savefig(graph_title + '.jpg')
+ plt.close()
+
+
+def main():
+ args = parse_args()
+    assert args.sample_rate > 0. and args.sample_rate <= 1.0, "The sample rate should be in the range (0, 1]."
+
+ loss, accuracy = parse_file(args.file_path)
+ batch = [args.log_period * i for i in range(len(loss))]
+
+ batch_sample = sample(batch, args.sample_rate)
+ loss_sample = sample(loss, args.sample_rate)
+ accuracy_sample = sample(accuracy, args.sample_rate)
+
+ plot_metric(loss_sample, batch_sample, 'loss', line_label='loss')
+ plot_metric(
+ accuracy_sample,
+ batch_sample,
+ 'accuracy',
+ line_style='g-',
+ line_label='accuracy')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py
index 927b175994..6ad817ccef 100644
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import io, os
import random
import numpy as np
@@ -14,6 +28,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
else:
settings.data_size = settings.height * settings.width
settings.is_infer = kwargs.get('is_infer', False)
+ settings.num_samples = kwargs.get('num_samples', 2560)
if settings.is_infer:
settings.slots = [dense_vector(settings.data_size)]
else:
@@ -23,7 +38,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
@provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_list):
- for i in xrange(2560 if settings.is_infer else 1024):
+ for i in xrange(settings.num_samples):
img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
if settings.is_infer:
yield img.astype('float32')
diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py
index 4a14363ff1..2846e4763f 100644
--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@@ -7,13 +7,15 @@ num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg("layer_num", int, 50)
is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
- 'is_infer': is_infer
+ 'is_infer': is_infer,
+ 'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
diff --git a/benchmark/paddle/image/run_mkldnn_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh
similarity index 95%
rename from benchmark/paddle/image/run_mkldnn_infer.sh
rename to benchmark/paddle/image/run_mkl_infer.sh
index d795bcab1b..62c9bf6efd 100755
--- a/benchmark/paddle/image/run_mkldnn_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
@@ -37,7 +37,7 @@ function infer() {
--trainer_count=1 \
--num_passes=1 \
--save_dir="models/${topology}-${layer_num}" \
- --config_args="batch_size=128,layer_num=${layer_num}" \
+ --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
> /dev/null 2>&1
echo "Done"
fi
@@ -79,8 +79,9 @@ fi
# inference benchmark
for use_mkldnn in True False; do
for batchsize in 1 2 4 8 16; do
- infer googlenet v1 $batchsize $use_mkldnn
- infer resnet 50 $batchsize $use_mkldnn
infer vgg 19 $batchsize $use_mkldnn
+ infer resnet 50 $batchsize $use_mkldnn
+ infer googlenet v1 $batchsize $use_mkldnn
+ infer alexnet 2 $batchsize $use_mkldnn
done
done
diff --git a/benchmark/paddle/image/run_mkldnn_train.sh b/benchmark/paddle/image/run_mkl_train.sh
similarity index 83%
rename from benchmark/paddle/image/run_mkldnn_train.sh
rename to benchmark/paddle/image/run_mkl_train.sh
index 320206239a..03d2d378fb 100755
--- a/benchmark/paddle/image/run_mkldnn_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
@@ -28,6 +28,10 @@ function train() {
--test_period=100 \
--config_args=$args \
2>&1 | tee ${log}
+
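+  # Parse the average batch time (ms, the "avg=" field of the last log line) and report images/sec.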
+ avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+ fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+ echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
if [ ! -f "train.list" ]; then
@@ -43,5 +47,6 @@ for use_mkldnn in True False; do
train vgg 19 $batchsize $use_mkldnn
train resnet 50 $batchsize $use_mkldnn
train googlenet v1 $batchsize $use_mkldnn
+ train alexnet 2 $batchsize $use_mkldnn
done
done
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
new file mode 100755
index 0000000000..a9a7b8a667
--- /dev/null
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@@ -0,0 +1,69 @@
+set -e
+
+function clock_to_seconds() {
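+  # Convert an HH:MM:SS timestamp into seconds.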
+ hours=`echo $1 | awk -F ':' '{print $1}'`
+ mins=`echo $1 | awk -F ':' '{print $2}'`
+ secs=`echo $1 | awk -F ':' '{print $3}'`
+ echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
+}
+
+function infer() {
+ export OPENBLAS_MAIN_FREE=1
+ topology=$1
+ layer_num=$2
+ bs=$3
+ trainers=`nproc`
+ if [ $trainers -gt $bs ]; then
+ trainers=$bs
+ fi
+ log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
+ threads=$((`nproc` / trainers))
+ if [ $threads -eq 0 ]; then
+ threads=1
+ fi
+ export OPENBLAS_NUM_THREADS=$threads
+
+ models_in="models/${topology}-${layer_num}/pass-00000/"
+ if [ ! -d $models_in ]; then
+ echo "./run_mkl_infer.sh to save the model first"
+ exit 0
+ fi
+ log_period=$((32 / bs))
+ paddle train --job=test \
+ --config="${topology}.py" \
+ --use_mkldnn=False \
+ --use_gpu=False \
+ --trainer_count=$trainers \
+ --log_period=$log_period \
+ --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
+ --init_model_path=$models_in \
+ 2>&1 | tee ${log}
+
+  # Time the last 5 log periods (160 = 32 * 5 samples);
+  # the earlier iterations are treated as warm-up.
+ start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+ end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+ start_sec=`clock_to_seconds $start`
+ end_sec=`clock_to_seconds $end`
+ fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
+ echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+ echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+ echo " " > train.list
+fi
+if [ ! -f "test.list" ]; then
+ echo " " > test.list
+fi
+if [ ! -d "logs" ]; then
+ mkdir logs
+fi
+
+# inference benchmark
+for batchsize in 1 2 4 8 16; do
+ infer vgg 19 $batchsize
+ infer resnet 50 $batchsize
+ infer googlenet v1 $batchsize
+ infer alexnet 2 $batchsize
+done
diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh
new file mode 100755
index 0000000000..935cff6f2c
--- /dev/null
+++ b/benchmark/paddle/image/run_openblas_train.sh
@@ -0,0 +1,41 @@
+set -e
+
+function train() {
+ export OPENBLAS_NUM_THREADS=1
+ topology=$1
+ layer_num=$2
+ bs=$3
+ thread=`nproc`
+  # each trainer uses only 1 OpenBLAS thread to avoid contention between trainers
+ log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+ args="batch_size=${bs},layer_num=${layer_num}"
+ config="${topology}.py"
+ paddle train --job=time \
+ --config=$config \
+ --use_mkldnn=False \
+ --use_gpu=False \
+ --trainer_count=$thread \
+ --log_period=3 \
+ --test_period=30 \
+ --config_args=$args \
+ 2>&1 | tee ${log}
+
+ avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+ fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+ echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+ echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+ mkdir logs
+fi
+
+# training benchmark
+for batchsize in 64 128 256; do
+ train vgg 19 $batchsize
+ train resnet 50 $batchsize
+ train googlenet v1 $batchsize
+ train alexnet 2 $batchsize
+done
diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
index 8d0a1e97a4..ca0a6798fb 100644
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@@ -7,13 +7,15 @@ num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg('layer_num', int, 19)
is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
- 'is_infer': is_infer
+ 'is_infer': is_infer,
+ 'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
diff --git a/benchmark/paddle/rnn/imdb.py b/benchmark/paddle/rnn/imdb.py
index fc4ed4025f..2a67f9b0cf 100755
--- a/benchmark/paddle/rnn/imdb.py
+++ b/benchmark/paddle/rnn/imdb.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from __future__ import print_function
import six.moves.cPickle as pickle
import gzip
diff --git a/benchmark/paddle/rnn/provider.py b/benchmark/paddle/rnn/provider.py
index 928ca75daf..23cc0c44a9 100644
--- a/benchmark/paddle/rnn/provider.py
+++ b/benchmark/paddle/rnn/provider.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import io, os
import random
import numpy as np
diff --git a/benchmark/tensorflow/image/alexnet.py b/benchmark/tensorflow/image/alexnet.py
index f6a39ef778..95728b7a85 100644
--- a/benchmark/tensorflow/image/alexnet.py
+++ b/benchmark/tensorflow/image/alexnet.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
diff --git a/benchmark/tensorflow/image/alexnet_multi_gpu.py b/benchmark/tensorflow/image/alexnet_multi_gpu.py
index 7b5ee78f4d..51dfe3f1cb 100644
--- a/benchmark/tensorflow/image/alexnet_multi_gpu.py
+++ b/benchmark/tensorflow/image/alexnet_multi_gpu.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
diff --git a/benchmark/tensorflow/image/googlenet.py b/benchmark/tensorflow/image/googlenet.py
index decf855b54..37b2ba6911 100644
--- a/benchmark/tensorflow/image/googlenet.py
+++ b/benchmark/tensorflow/image/googlenet.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from six.moves import xrange
from datetime import datetime
import math
diff --git a/benchmark/tensorflow/image/googlenet_multi_gpu.py b/benchmark/tensorflow/image/googlenet_multi_gpu.py
index 31466faa37..7179c5301c 100644
--- a/benchmark/tensorflow/image/googlenet_multi_gpu.py
+++ b/benchmark/tensorflow/image/googlenet_multi_gpu.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
diff --git a/benchmark/tensorflow/image/smallnet_mnist_cifar.py b/benchmark/tensorflow/image/smallnet_mnist_cifar.py
index 1a625134a6..2ca1623b6b 100644
--- a/benchmark/tensorflow/image/smallnet_mnist_cifar.py
+++ b/benchmark/tensorflow/image/smallnet_mnist_cifar.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
diff --git a/benchmark/tensorflow/rnn/reader.py b/benchmark/tensorflow/rnn/reader.py
index f538329a15..ac08c10a42 100755
--- a/benchmark/tensorflow/rnn/reader.py
+++ b/benchmark/tensorflow/rnn/reader.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import os.path
import io
import numpy as np
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 13294c0548..6320b17520 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -3,7 +3,7 @@
# It will search MKLML, atlas, OpenBlas, reference-cblas in order.
#
# If any cblas implementation found, the following variable will be set.
-# CBLAS_PROVIDER # one of MKLML, ATLAS, OPENBLAS, REFERENCE
+# CBLAS_PROVIDER # one of MKLML, OPENBLAS, REFERENCE
# CBLAS_INC_DIR # the include directory for cblas.
# CBLAS_LIBS # a list of libraries should be linked by paddle.
# # Each library should be full path to object file.
@@ -25,42 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
return()
endif()
-## Then find atlas.
-set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
-set(ATLAS_INCLUDE_SEARCH_PATHS
- ${ATLAS_ROOT}/include
- /usr/include
- /usr/include/atlas)
-set(ATLAS_LIB_SEARCH_PATHS
- ${ATLAS_ROOT}/lib
- /usr/lib
- /usr/lib/blas/atlas
- /usr/lib/atlas
- /usr/lib/atlas-base # special for ubuntu 14.04.
- )
-find_path(ATLAS_INC_DIR NAMES cblas.h
- PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
- PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
- PATHS ${ATLAS_LIB_SEARCH_PATHS})
-find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3
- PATHS ${ATLAS_LIB_SEARCH_PATHS})
-
-if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB)
- set(CBLAS_FOUND ON)
- set(CBLAS_PROVIDER ATLAS)
- set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
- set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB})
-
- add_definitions(-DPADDLE_USE_ATLAS)
- add_definitions(-DLAPACK_FOUND)
-
- message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
- message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
- return()
-endif()
-
## Then find openblas.
set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
set(OPENBLAS_INCLUDE_SEARCH_PATHS
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 5c6bcfde76..f726405c47 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -57,11 +57,7 @@ if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG)
endif(NOT WITH_GOLANG)
-if(NOT WITH_GPU)
- add_definitions(-DHPPL_STUB_FUNC)
-
- list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
-else()
+if(WITH_GPU)
add_definitions(-DPADDLE_WITH_CUDA)
FIND_PACKAGE(CUDA REQUIRED)
@@ -73,13 +69,25 @@ else()
if(NOT CUDNN_FOUND)
message(FATAL_ERROR "Paddle needs cudnn to compile")
endif()
-
+ if(CUPTI_FOUND)
+ include_directories(${CUPTI_INCLUDE_DIR})
+ add_definitions(-DPADDLE_WITH_CUPTI)
+ else()
+ message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
+ endif()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
# Include cuda and cudnn
include_directories(${CUDNN_INCLUDE_DIR})
include_directories(${CUDA_TOOLKIT_INCLUDE})
-endif(NOT WITH_GPU)
+elseif(WITH_AMD_GPU)
+ add_definitions(-DPADDLE_WITH_HIP)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
+else()
+ add_definitions(-DHPPL_STUB_FUNC)
+ list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
+endif()
if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake
index 84219cfa55..4cf2be3bdf 100644
--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/cmake/cross_compiling/host.cmake b/cmake/cross_compiling/host.cmake
index 14c35266ec..f9c6b12136 100644
--- a/cmake/cross_compiling/host.cmake
+++ b/cmake/cross_compiling/host.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake
index d3f5bf6852..10d389ec8e 100644
--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/cmake/cross_compiling/raspberry_pi.cmake b/cmake/cross_compiling/raspberry_pi.cmake
index 817b39f683..0425b2ae15 100644
--- a/cmake/cross_compiling/raspberry_pi.cmake
+++ b/cmake/cross_compiling/raspberry_pi.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 6bea7cf302..7edc863772 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -155,7 +155,8 @@ endif()
include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
- list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
+ # TODO(panyx0718): CUPTI only allows DSO?
+ list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
# setting nvcc arch flags
@@ -181,7 +182,8 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
- list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
+ # nvcc 9 does not support -Os. Use Release flags instead
+ list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake
new file mode 100644
index 0000000000..72ed0f1e58
--- /dev/null
+++ b/cmake/cupti.cmake
@@ -0,0 +1,41 @@
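+# Locate the CUDA Profiling Tools Interface (CUPTI) headers and library; sets CUPTI_FOUND.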
+if(NOT WITH_GPU)
+ return()
+endif()
+
+
+set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")
+find_path(CUPTI_INCLUDE_DIR cupti.h
+ PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include
+ $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
+ ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
+ NO_DEFAULT_PATH
+ )
+
+get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
+
+set(TARGET_ARCH "x86_64")
+if(NOT ${CMAKE_SYSTEM_PROCESSOR})
+ set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
+endif()
+
+list(APPEND CUPTI_CHECK_LIBRARY_DIRS
+ ${CUPTI_ROOT}
+ ${CUPTI_ROOT}/lib64
+ ${CUPTI_ROOT}/lib
+ ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
+ $ENV{CUPTI_ROOT}
+ $ENV{CUPTI_ROOT}/lib64
+ $ENV{CUPTI_ROOT}/lib
+ /usr/lib
+ ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
+find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a
+ PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
+ NO_DEFAULT_PATH
+ DOC "Path to cuPTI library.")
+
+get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
+if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
+ set(CUPTI_FOUND ON)
+else()
+ set(CUPTI_FOUND OFF)
+endif()
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
new file mode 100644
index 0000000000..10662fc967
--- /dev/null
+++ b/cmake/external/boost.cmake
@@ -0,0 +1,59 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(ExternalProject)
+
+set(BOOST_PROJECT "extern_boost")
+# To release PaddlePaddle as a pip package, we have to follow the
+# manylinux1 standard, which features as old Linux kernels and
+# compilers as possible and recommends CentOS 5. Indeed, the earliest
+# CentOS version that works with NVIDIA CUDA is CentOS 6. And a new
+# version of boost, say, 1.66.0, doesn't build on CentOS 6. We
+# checked that the devtools package of CentOS 6 installs boost 1.41.0.
+# So we use 1.41.0 here.
+set(BOOST_VER "1.41.0")
+set(BOOST_TAR "boost_1_41_0")
+set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz")
+set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
+set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
+set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
+
+include_directories(${BOOST_INCLUDE_DIR})
+
+ExternalProject_Add(
+ ${BOOST_PROJECT}
+ ${EXTERNAL_PROJECT_LOG_ARGS}
+ DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR}
+ DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
+ && tar zxf ${BOOST_TAR}.tar.gz
+ DOWNLOAD_NO_PROGRESS 1
+ PREFIX ${BOOST_SOURCES_DIR}
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ INSTALL_COMMAND ""
+ UPDATE_COMMAND ""
+)
+
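+# CMake before 3.3 cannot attach dependencies to an INTERFACE target, so fall back to a dummy static library.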
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+ set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
+ file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+ add_library(boost STATIC ${dummyfile})
+else()
+ add_library(boost INTERFACE)
+endif()
+
+add_dependencies(boost ${BOOST_PROJECT})
+list(APPEND external_project_dependencies boost)
+set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake
index aec51410b3..a743b572a6 100644
--- a/cmake/external/cares.cmake
+++ b/cmake/external/cares.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 96fc886a34..73d70c34dc 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -1,25 +1,40 @@
INCLUDE(ExternalProject)
SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
+SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
+INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
-INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3)
-
-ExternalProject_Add(
- extern_eigen3
- ${EXTERNAL_PROJECT_LOG_ARGS}
- GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
- GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10
- PREFIX ${EIGEN_SOURCE_DIR}
- UPDATE_COMMAND ""
- CONFIGURE_COMMAND ""
- BUILD_COMMAND ""
- INSTALL_COMMAND ""
- TEST_COMMAND ""
-)
+if(WITH_AMD_GPU)
+ ExternalProject_Add(
+ extern_eigen3
+ ${EXTERNAL_PROJECT_LOG_ARGS}
+ GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git"
+ GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+ PREFIX ${EIGEN_SOURCE_DIR}
+ UPDATE_COMMAND ""
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ INSTALL_COMMAND ""
+ TEST_COMMAND ""
+ )
+else()
+ ExternalProject_Add(
+ extern_eigen3
+ ${EXTERNAL_PROJECT_LOG_ARGS}
+ GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
+ GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10
+ PREFIX ${EIGEN_SOURCE_DIR}
+ UPDATE_COMMAND ""
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ INSTALL_COMMAND ""
+ TEST_COMMAND ""
+ )
+endif()
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
- file(WRITE ${dummyfile} "const char * dummy_eigen3 = \"${dummyfile}\";")
+ file(WRITE ${dummyfile} "const char *dummy_eigen3 = \"${dummyfile}\";")
add_library(eigen3 STATIC ${dummyfile})
else()
add_library(eigen3 INTERFACE)
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index d4f252bb9f..a1d2d0f446 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 0c6b3aafcb..ac0181e69c 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index abee6698e3..0853b98181 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -33,7 +33,7 @@ ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
GIT_REPOSITORY "https://github.com/grpc/grpc.git"
- GIT_TAG "v1.7.x"
+ GIT_TAG "v1.8.x"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 5a4aa7a5b7..d335298742 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 5d24caebdc..a25cff5fc5 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -56,6 +56,7 @@ ExternalProject_Add(
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
+ CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
@@ -63,9 +64,30 @@ ExternalProject_Add(
-DMKLROOT:PATH=${MKLML_ROOT}
)
-ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
-ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
+ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
+ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
add_definitions(-DPADDLE_WITH_MKLDNN)
-LIST(APPEND external_project_dependencies mkldnn)
+LIST(APPEND external_project_dependencies shared_mkldnn)
+
+# generate a static dummy target to track mkldnn dependencies
+# for cc_library(xxx SRCS xxx.c DEPS mkldnn)
+SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c)
+FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+ADD_LIBRARY(mkldnn STATIC ${dummyfile})
+TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB})
+ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
+
+# copy the real so.0 lib to install dir
+# it can be directly contained in wheel or capi
+SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
+ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
+ COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
+ DEPENDS mkldnn)
+ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
+
+IF(WITH_C_API)
+ INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
+ENDIF()
+
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 20dbc32a73..df3f0c7f0c 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -34,7 +34,7 @@ SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
-SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER})
+SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
@@ -46,7 +46,7 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(MKLML)\n"
"cmake_minimum_required(VERSION 3.0)\n"
- "install(DIRECTORY ${MKLML_VER}\n"
+ "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
" DESTINATION ${MKLML_DST_DIR})\n")
ExternalProject_Add(
@@ -66,3 +66,7 @@ ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
LIST(APPEND external_project_dependencies mklml)
+
+IF(WITH_C_API)
+ INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib)
+ENDIF()
diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake
index fc43766efa..af5c689c35 100644
--- a/cmake/external/nccl.cmake
+++ b/cmake/external/nccl.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 97857a686b..8af2765f58 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -30,23 +30,21 @@ IF(NOT ${CBLAS_FOUND})
CACHE FILEPATH "openblas library." FORCE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
+ SET(OPENBLAS_COMMIT "v0.2.20")
IF(CMAKE_CROSSCOMPILING)
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
IF(ANDROID)
- # arm_soft_fp_abi branch of OpenBLAS to support softfp
- # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
- SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+ # use softfp
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
ENDIF()
ELSEIF(IOS)
IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
- SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
@@ -56,14 +54,12 @@ IF(NOT ${CBLAS_FOUND})
ENDIF()
ELSEIF(RPI)
# use hardfp
- SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
ENDIF()
ELSE()
IF(APPLE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
ENDIF()
- SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS "")
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
@@ -81,7 +77,8 @@ IF(NOT ${CBLAS_FOUND})
INSTALL_DIR ${CBLAS_INSTALL_DIR}
BUILD_IN_SOURCE 1
BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
-    INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
+    INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
+ && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
)
@@ -113,7 +110,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
# FIXME(gangliao): generate cblas target to track all high performance
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
-FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
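+# NOTE: the symbol is named dummy_cblas rather than plain `dummy` so that the
+# generated dummy sources of different targets do not define the same symbol.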
ADD_LIBRARY(cblas STATIC ${dummyfile})
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index fab2af362b..0fde4373a4 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -253,9 +253,9 @@ IF(NOT PROTOBUF_FOUND)
IF(WITH_C_API)
INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
IF(ANDROID)
- INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
+ INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
ELSE()
- INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib)
+ INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib)
ENDIF()
ENDIF()
diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake
index 4e87dc49d8..c885877a2b 100644
--- a/cmake/external/pybind11.cmake
+++ b/cmake/external/pybind11.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index 46c68cce32..d7e5571bdb 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
new file mode 100644
index 0000000000..71f54c425d
--- /dev/null
+++ b/cmake/external/snappy.cmake
@@ -0,0 +1,58 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE)
+ return()
+ENDIF()
+
+include (ExternalProject)
+
+# NOTE: snappy is needed when linking with recordio
+
+SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
+SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
+SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE)
+
+ExternalProject_Add(
+ extern_snappy
+ GIT_REPOSITORY "https://github.com/google/snappy"
+ GIT_TAG "1.1.7"
+ PREFIX ${SNAPPY_SOURCES_DIR}
+ UPDATE_COMMAND ""
+ CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+ -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+ -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+ -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+ -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
+ -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
+ -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+ -DBUILD_TESTING=OFF
+ -DSNAPPY_BUILD_TESTS:BOOL=OFF
+ -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+ ${EXTERNAL_OPTIONAL_ARGS}
+ CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
+ -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+ BUILD_COMMAND make -j8
+ INSTALL_COMMAND make install
+)
+
+add_library(snappy STATIC IMPORTED GLOBAL)
+set_property(TARGET snappy PROPERTY IMPORTED_LOCATION
+ "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+
+include_directories(${SNAPPY_INCLUDE_DIR})
+add_dependencies(snappy extern_snappy)
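+
+# With the IMPORTED GLOBAL target above, downstream code can simply link the
+# `snappy` name, e.g. (illustrative) target_link_libraries(recordio snappy);
+# the add_dependencies call ensures extern_snappy is built first.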
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
new file mode 100644
index 0000000000..5377a0b046
--- /dev/null
+++ b/cmake/external/snappystream.cmake
@@ -0,0 +1,58 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE)
+ return()
+ENDIF()
+
+include (ExternalProject)
+
+# NOTE: snappystream (together with snappy) is needed when linking with recordio
+
+SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
+SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
+SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)
+
+ExternalProject_Add(
+ extern_snappystream
+ GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
+ GIT_TAG "0.2.8"
+ PREFIX ${SNAPPYSTREAM_SOURCES_DIR}
+ UPDATE_COMMAND ""
+ CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+ -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+ -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+ -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                     -DCMAKE_INSTALL_PREFIX=${SNAPPYSTREAM_INSTALL_DIR}
+                     -DCMAKE_INSTALL_LIBDIR=${SNAPPYSTREAM_INSTALL_DIR}/lib
+ -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+ -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+ -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
+ ${EXTERNAL_OPTIONAL_ARGS}
+ CMAKE_CACHE_ARGS
+ -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
+ -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
+ -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+ BUILD_COMMAND make -j8
+ INSTALL_COMMAND make install
+ DEPENDS snappy
+)
+
+add_library(snappystream STATIC IMPORTED GLOBAL)
+set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
+ "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
+
+include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
+add_dependencies(snappystream extern_snappystream)
diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
index 9db457c7b2..de07703695 100644
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index a8e1aca49c..9a9a20f897 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -52,6 +52,7 @@ ExternalProject_Add(
-DWITH_TORCH=OFF
-DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-DBUILD_SHARED=ON
+ -DBUILD_TESTS=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
@@ -63,7 +64,7 @@ ExternalProject_Add(
MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
-ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL)
+ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
ADD_DEPENDENCIES(warpctc extern_warpctc)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 1638cd8fdf..20b8506e67 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@ ENDIF(WIN32)
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
ExternalProject_Add(
- zlib
+ extern_zlib
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/madler/zlib.git"
GIT_TAG "v1.2.8"
@@ -49,9 +49,11 @@ ExternalProject_Add(
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
+ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
+ADD_DEPENDENCIES(zlib extern_zlib)
+
LIST(APPEND external_project_dependencies zlib)
-ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
IF(WITH_C_API)
INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 66c8e3ad7e..c749c97f13 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -104,7 +104,9 @@ function(merge_static_libs TARGET_NAME)
foreach(lib ${libs})
list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
endforeach()
- list(REMOVE_DUPLICATES libs_deps)
+ if(libs_deps)
+ list(REMOVE_DUPLICATES libs_deps)
+ endif()
# To produce a library we need at least one source file.
  # It is created by add_custom_command below and will help
@@ -120,7 +122,7 @@ function(merge_static_libs TARGET_NAME)
DEPENDS ${libs})
  # Generate dummy static lib
- file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+ file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
@@ -160,7 +162,7 @@ function(merge_static_libs TARGET_NAME)
DEPENDS ${libs} ${target_OBJS})
  # Generate dummy static lib
- file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+ file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
@@ -179,15 +181,29 @@ function(cc_library TARGET_NAME)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
- if (cc_library_SRCS)
- if (cc_library_SHARED OR cc_library_shared) # build *.so
+ if(cc_library_SRCS)
+ if(cc_library_SHARED OR cc_library_shared) # build *.so
add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
else()
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
+ find_fluid_modules(${TARGET_NAME})
endif()
- if (cc_library_DEPS)
+
+ if(cc_library_DEPS)
+      # No need to link libwarpctc.so directly; keep only a build-order dependency on it
+ if("${cc_library_DEPS};" MATCHES "warpctc;")
+ list(REMOVE_ITEM cc_library_DEPS warpctc)
+ add_dependencies(${TARGET_NAME} warpctc)
+ endif()
+ if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
+ # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
+ # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
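+        # Illustrative usage (names hypothetical):
+        #   cc_library(foo SRCS foo.cc DEPS ARCHIVE_START bar ARCHIVE_END)
+        # At final link time every object in bar is then pulled in
+        # (--whole-archive) instead of only the referenced ones.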
+ target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
+ list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
+ else()
+ target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
+ endif()
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
- target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif()
# cpplint code style
@@ -224,12 +240,18 @@ function(cc_test TARGET_NAME)
if(WITH_TESTING)
set(options "")
set(oneValueArgs "")
- set(multiValueArgs SRCS DEPS)
+ set(multiValueArgs SRCS DEPS ARGS)
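+    # ARGS are forwarded verbatim to the test binary below, e.g. (illustrative)
+    #   cc_test(foo_test SRCS foo_test.cc ARGS --gtest_filter=FooTest.*)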
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
- target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
- add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
- add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+ # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+ target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+ if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
+ list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
+ endif()
+ add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+ add_test(NAME ${TARGET_NAME}
+ COMMAND ${TARGET_NAME} ${cc_test_ARGS}
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction(cc_test)
@@ -243,7 +265,8 @@ function(nv_library TARGET_NAME)
if (nv_library_SHARED OR nv_library_shared) # build *.so
cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
else()
- cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
+ cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
+ find_fluid_modules(${TARGET_NAME})
endif()
if (nv_library_DEPS)
add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
@@ -288,12 +311,88 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
- target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
- add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+ target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+ add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(${TARGET_NAME} ${TARGET_NAME})
endif()
endfunction(nv_test)
+function(hip_library TARGET_NAME)
+ if (WITH_AMD_GPU)
+ set(options STATIC static SHARED shared)
+ set(oneValueArgs "")
+ set(multiValueArgs SRCS DEPS)
+ cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+ set(_sources ${hip_library_SRCS})
+ HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+ if(_source_files)
+ list(REMOVE_ITEM _sources ${_source_files})
+ endif()
+ if(hip_library_SRCS)
+ if (hip_library_SHARED OR hip_library_shared) # build *.so
+ add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources})
+ set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+ else()
+ add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
+ set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+ target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
+ find_fluid_modules(${TARGET_NAME})
+ endif()
+ if (hip_library_DEPS)
+ add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
+ target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+ endif()
+ # cpplint code style
+ foreach(source_file ${hip_library_SRCS})
+ string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+ list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+ endif()
+ endforeach()
+ add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS})
+ else(hip_library_SRCS)
+ if (hip_library_DEPS)
+ merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
+ else()
+      message(FATAL_ERROR "Please specify source file or library in hip_library.")
+ endif()
+ endif(hip_library_SRCS)
+ endif()
+endfunction(hip_library)
+
+function(hip_binary TARGET_NAME)
+ if (WITH_AMD_GPU)
+ set(options "")
+ set(oneValueArgs "")
+ set(multiValueArgs SRCS DEPS)
+ cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+ hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS})
+ if(hip_binary_DEPS)
+ target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
+ add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+ endif()
+ endif()
+endfunction(hip_binary)
+
+function(hip_test TARGET_NAME)
+ if (WITH_AMD_GPU AND WITH_TESTING)
+ set(options "")
+ set(oneValueArgs "")
+ set(multiValueArgs SRCS DEPS)
+ cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+ set(_sources ${hip_test_SRCS})
+ HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+ if(_source_files)
+ list(REMOVE_ITEM _sources ${_source_files})
+ endif()
+ add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
+ set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+ target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+ add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+ add_test(${TARGET_NAME} ${TARGET_NAME})
+ endif()
+endfunction(hip_test)
+
function(go_library TARGET_NAME)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
@@ -324,7 +423,7 @@ function(go_library TARGET_NAME)
)
  # Add dummy code to support `make target_name` from the command line
- file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+ file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";")
if (go_library_SHARED OR go_library_shared)
add_library(${TARGET_NAME} SHARED ${dummyfile})
else()
@@ -457,12 +556,12 @@ endfunction()
function(py_test TARGET_NAME)
if(WITH_TESTING)
- set(options STATIC static SHARED shared)
+ set(options "")
set(oneValueArgs "")
- set(multiValueArgs SRCS DEPS ARGS)
+ set(multiValueArgs SRCS DEPS ARGS ENVS)
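+    # ENVS entries are placed after `env` on the test command line, so each
+    # NAME=value pair becomes part of the test environment, e.g. (illustrative)
+    #   py_test(test_foo SRCS test_foo.py ENVS CUDA_VISIBLE_DEVICES=0)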
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
- COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
+ COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
new file mode 100644
index 0000000000..bfe491bd6b
--- /dev/null
+++ b/cmake/hip.cmake
@@ -0,0 +1,43 @@
+if(NOT WITH_AMD_GPU)
+ return()
+endif()
+
+include_directories("/opt/rocm/include")
+include_directories("/opt/rocm/hipblas/include")
+include_directories("/opt/rocm/hiprand/include")
+include_directories("/opt/rocm/rocrand/include")
+include_directories("/opt/rocm/rccl/include")
+include_directories("/opt/rocm/thrust")
+
+list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
+
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
+
+if(WITH_DSO)
+ set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
+endif(WITH_DSO)
+
+if(WITH_DOUBLE)
+ set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
+endif(WITH_DOUBLE)
+
+if(WITH_TESTING)
+ set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
+endif(WITH_TESTING)
+
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+ list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
+elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
+ list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
+ list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
+endif()
+
+if("x${HCC_HOME}" STREQUAL "x")
+ set(HCC_HOME "/opt/rocm/hcc")
+endif()
+
+set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ")
+set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared")
+set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared")
+
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
new file mode 100644
index 0000000000..0323cd9698
--- /dev/null
+++ b/cmake/inference_lib.cmake
@@ -0,0 +1,113 @@
+set_property(GLOBAL PROPERTY FLUID_MODULES "")
+# find_fluid_modules collects the fluid modules that go into the paddle fluid static library
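+# Every target defined under a directory whose path contains "fluid" is
+# appended to the global FLUID_MODULES property, which is read back later
+# when the static fluid library is assembled.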
+function(find_fluid_modules TARGET_NAME)
+ get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+ string(FIND "${__target_path}" "fluid" pos)
+ if(pos GREATER 1)
+ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+ set(fluid_modules ${fluid_modules} ${TARGET_NAME})
+ set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
+ endif()
+endfunction(find_fluid_modules)
+
+# make package for paddle fluid shared and static library
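+# copy(TARGET ...) takes parallel SRCS/DSTS lists and emits one
+# `mkdir -p` + `cp -r` command per pair on a custom target, so the files are
+# refreshed whenever inference_lib_dist is built.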
+function(copy TARGET)
+ set(options "")
+ set(oneValueArgs "")
+ set(multiValueArgs SRCS DSTS DEPS)
+ cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+ set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
+
+ list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
+ list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
+ if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
+    message(FATAL_ERROR "${TARGET}: the number of SRCS does not match the number of DSTS")
+ endif()
+ math(EXPR len "${copy_lib_SRCS_len} - 1")
+
+ add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
+ foreach(index RANGE ${len})
+ list(GET copy_lib_SRCS ${index} src)
+ list(GET copy_lib_DSTS ${index} dst)
+ add_custom_command(TARGET ${TARGET} PRE_BUILD
+ COMMAND mkdir -p "${dst}"
+ COMMAND cp -r "${src}" "${dst}"
+ COMMENT "copying ${src} -> ${dst}")
+ endforeach()
+endfunction()
+
+# third party
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
+copy(eigen3_lib
+ SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
+ DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
+)
+
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
+copy(gflags_lib
+ SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
+ DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
+copy(glog_lib
+ SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
+ DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+if(NOT PROTOBUF_FOUND)
+ set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
+ copy(protobuf_lib
+ SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
+ DSTS ${dst_dir} ${dst_dir}/lib
+ )
+endif()
+
+if(NOT CBLAS_FOUND)
+ set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
+ copy(openblas_lib
+ SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
+ DSTS ${dst_dir} ${dst_dir}
+ )
+elseif (WITH_MKLML)
+ set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
+ copy(mklml_lib
+ SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
+ DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
+ )
+endif()
+
+# paddle fluid module
+set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
+set(module "framework")
+copy(framework_lib DEPS framework_py_proto
+ SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+)
+
+set(module "memory")
+copy(memory_lib
+ SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
+)
+
+set(module "inference")
+copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
+ SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module}
+)
+
+set(module "platform")
+copy(platform_lib DEPS profiler_py_proto
+ SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
+)
+
+set(module "string")
+copy(string_lib
+ SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
+)
+
+add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})
diff --git a/cmake/make_resource.py b/cmake/make_resource.py
index a9241b0e3e..09a2ca877d 100644
--- a/cmake/make_resource.py
+++ b/cmake/make_resource.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import os
import re
import sys
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 396bd1a079..c91ef91127 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 94dd3457fb..a9b27933a5 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -1,49 +1,2 @@
-if(NOT DEFINED SPHINX_THEME)
- set(SPHINX_THEME default)
-endif()
-
-if(NOT DEFINED SPHINX_THEME_DIR)
- set(SPHINX_THEME_DIR)
-endif()
-
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
-
-# HTML output director
-set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
-
-configure_file(
- "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.en.in"
- "${BINARY_BUILD_DIR_EN}/conf.py"
- @ONLY)
-
-sphinx_add_target(paddle_docs
- html
- ${BINARY_BUILD_DIR_EN}
- ${SPHINX_CACHE_DIR_EN}
- ${CMAKE_CURRENT_SOURCE_DIR}
- ${SPHINX_HTML_DIR_EN})
-
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
-
-# HTML output directory
-set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
-
-configure_file(
- "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.cn.in"
- "${BINARY_BUILD_DIR_CN}/conf.py"
- @ONLY)
-
-sphinx_add_target(paddle_docs_cn
- html
- ${BINARY_BUILD_DIR_CN}
- ${SPHINX_CACHE_DIR_CN}
- ${CMAKE_CURRENT_SOURCE_DIR}
- ${SPHINX_HTML_DIR_CN})
+add_subdirectory(v2)
+add_subdirectory(fluid)
diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst
deleted file mode 100644
index 9be0b370ee..0000000000
--- a/doc/api/index_cn.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-API
-===
-
-.. toctree::
- :maxdepth: 1
-
- 模型配置
- 数据访问
- 训练与应用
diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst
deleted file mode 100644
index e6f632e1a5..0000000000
--- a/doc/api/index_en.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-API
-===
-
-.. toctree::
- :maxdepth: 1
-
- v2/model_configs.rst
- v2/data.rst
- v2/run_logic.rst
- v2/fluid.rst
diff --git a/doc/api/v1/data_provider/dataprovider_cn.rst b/doc/api/v1/data_provider/dataprovider_cn.rst
deleted file mode 100644
index d08c6b3efa..0000000000
--- a/doc/api/v1/data_provider/dataprovider_cn.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-.. _api_dataprovider:
-
-DataProvider的介绍
-==================
-
-DataProvider是PaddlePaddle负责提供数据的模块。其作用是将数据传入内存或显存,让神经网络可以进行训练或预测。用户可以通过简单使用Python接口 :ref:`api_pydataprovider2` ,来自定义传数据的过程。如果有更复杂的使用,或者需要更高的效率,用户也可以在C++端自定义一个 ``DataProvider`` 。
-
-PaddlePaddle需要用户在网络配置(trainer_config.py)中定义使用哪种DataProvider,并且在DataProvider中实现如何访问训练文件列表(train.list)或测试文件列表(test.list)。
-
-- train.list和test.list存放在本地(推荐直接存放到训练目录,以相对路径引用)。一般情况下,两者均为纯文本文件,其中每一行对应一个数据文件地址:
-
- - 如果数据文件存于本地磁盘,这个地址则为它的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)。
- - 地址也可以为hdfs文件路径,或者数据库连接路径等。
- - 由于这个地址会被DataProvider使用,因此,如何解析该地址也是用户自定义DataProvider时需要考虑的地方。
-- 如果没有设置test.list,或设置为None,那么在训练过程中不会执行测试操作;否则,会根据命令行参数指定的测试方式,在训练过程中进行测试,从而防止过拟合。
diff --git a/doc/api/v1/data_provider/dataprovider_en.rst b/doc/api/v1/data_provider/dataprovider_en.rst
deleted file mode 100644
index 96efbb1da9..0000000000
--- a/doc/api/v1/data_provider/dataprovider_en.rst
+++ /dev/null
@@ -1,34 +0,0 @@
-Introduction
-==============
-DataProvider is a module that loads training or testing data into cpu or gpu
-memory for the following triaining or testing process.
-
-For simple use, users can use Python :code:`PyDataProvider` to dynamically reads
-the original data in any format or in any form, and then transfer them into a
-data format PaddlePaddle requires. The process is extremly flexible and highly
-customized, with sacrificing the efficiency only a little. This is extremly
-useful when you have to dynamically generate certain kinds of data according to,
-for example, the training performance.
-
-Besides, users also can customize a C++ :code:`DataProvider` for a more
-complex usage, or for a higher efficiency.
-
-The following parameters are required to define in the PaddlePaddle network
-configuration file (trainer_config.py): which DataProvider is chosen to used,
-and specific parameters for DataProvider, including training file list
-(train.list) and testing file list (test.list).
-
-Train.list and test.list are simply two plain text files, which defines path
-of training or testing data. It is recommended that directly placing them into
-the training directory, and reference to them by using a relative path (
-relative to the PaddePaddle program).
-
-Testing or evaluating will not be performed during training if the test.list is
-not set or set to None. Otherwise, PaddlePaddle will evaluate the trained model
-by the specified tesing data while training, every testing period (a user
-defined command line parameter in PaddlePaddle) to prevent over-fitting.
-
-Each line of train.list and test.list is an absolute or relative path (relative
-to the PaddePaddle program runtime) of data file. Fascinatingly more, each line
-can also be a HDFS file path or a SQL connection string. As long as the user
-assures how to access each file in DataProvider.
diff --git a/doc/api/v1/data_provider/pydataprovider2_cn.rst b/doc/api/v1/data_provider/pydataprovider2_cn.rst
deleted file mode 100644
index 8f9db31cfb..0000000000
--- a/doc/api/v1/data_provider/pydataprovider2_cn.rst
+++ /dev/null
@@ -1,229 +0,0 @@
-.. _api_pydataprovider2:
-
-PyDataProvider2的使用
-=====================
-
-PyDataProvider2是PaddlePaddle使用Python提供数据的推荐接口。该接口使用多线程读取数据,并提供了简单的Cache功能;同时可以使用户只关注如何从文件中读取每一条数据,而不用关心数据如何传输,如何存储等等。
-
-.. contents::
-
-MNIST的使用场景
----------------
-
-我们以MNIST手写识别为例,来说明PyDataProvider2的简单使用场景。
-
-样例数据
-++++++++
-
-MNIST是一个包含有70,000张灰度图片的数字分类数据集。样例数据 ``mnist_train.txt`` 如下:
-
-.. literalinclude:: src/mnist_train.txt
-
-其中每行数据代表一张图片,行内使用 ``;`` 分成两部分。第一部分是图片的标签,为0-9中的一个数字;第二部分是28*28的图片像素灰度值。 对应的 ``train.list`` 即为这个数据文件的名字:
-
-.. literalinclude:: src/train.list
-
-dataprovider的使用
-++++++++++++++++++
-
-.. literalinclude:: src/mnist_provider.dict.py
-
-- 首先,引入PaddlePaddle的PyDataProvider2包。
-- 其次,定义一个Python的 `Decorator `_ `@provider`_ 。用于将下一行的数据输入函数标记成一个PyDataProvider2,同时设置它的input_types属性。
-
- - `input_types`_:设置这个PyDataProvider2返回什么样的数据。本例根据网络配置中 ``data_layer`` 的名字,显式指定返回的是一个28*28维的稠密浮点数向量和一个[0-9]的10维整数标签。
-
- .. literalinclude:: src/mnist_config.py
- :lines: 9-10
-
- - 注意:如果用户不显示指定返回数据的对应关系,那么PaddlePaddle会根据layer的声明顺序,来确定对应关系。但这个关系可能不正确,所以推荐使用显式指定的方式来设置input_types。
-- 最后,实现数据输入函数(如本例的 ``process`` 函数)。
-
- - 该函数的功能是:打开文本文件,读取每一行,将行中的数据转换成与input_types一致的格式,然后返回给PaddlePaddle进程。注意,
-
- - 返回的顺序需要和input_types中定义的顺序一致。
- - 返回时,必须使用Python关键词 ``yield`` ,相关概念是 ``generator`` 。
- - 一次yield调用,返回一条完整的样本。如果想为一个数据文件返回多条样本,只需要在函数中调用多次yield即可(本例中使用for循环进行多次调用)。
-
- - 该函数具有两个参数:
-
- - settings:在本例中没有使用,具体可以参考 `init_hook`_ 中的说明。
- - filename:为 ``train.list`` 或 ``test.list`` 中的一行,即若干数据文件路径的某一个。
-
-网络配置中的调用
-++++++++++++++++
-
-在网络配置里,只需要一行代码就可以调用这个PyDataProvider2,如,
-
-.. literalinclude:: src/mnist_config.py
- :lines: 1-7
-
-训练数据是 ``train.list`` ,没有测试数据,调用的PyDataProvider2是 ``mnist_provider`` 模块中的 ``process`` 函数。
-
-小结
-+++++
-
-至此,简单的PyDataProvider2样例就说明完毕了。对用户来说,仅需要知道如何从 **一个文件** 中读取 **一条样本** ,就可以将数据传送给PaddlePaddle了。而PaddlePaddle则会帮用户做以下工作:
-
-* 将数据组合成Batch进行训练
-* 对训练数据进行Shuffle
-* 多线程的数据读取
-* 缓存训练数据到内存(可选)
-* CPU->GPU双缓存
-
-是不是很简单呢?
-
-时序模型的使用场景
-------------------
-样例数据
-++++++++
-
-时序模型是指数据的某一维度是一个序列形式,即包含时间步信息。所谓时间步信息,不一定和时间有关系,只是说明数据的顺序是重要的。例如,文本信息就是一个序列数据。
-
-本例采用英文情感分类的数据,即将一段英文文本数据,分类成正面情绪和负面情绪两类(用0和1表示)。样例数据 ``sentimental_train.txt`` 如下:
-
-.. literalinclude:: src/sentimental_train.txt
-
-dataprovider的使用
-++++++++++++++++++
-
-相对MNIST而言,这个dataprovider较复杂,主要原因是增加了初始化机制 `init_hook`_。本例的 ``on_init`` 函数就是根据该机制配置的,它会在dataprovider创建的时候执行。
-
-- 其中 ``input_types`` 和在 `@provider`_ 中配置的效果一致。本例中的输入特征是词ID的序列,因此使用 ``integer_value_sequence`` 类型来设置。
-- 将 ``dictionary`` 存入settings对象,在 ``process`` 函数中使用。 dictionary是从网络配置中传入的dict对象,即一个将单词字符串映射到单词ID的字典。
-
-.. literalinclude:: src/sentimental_provider.py
-
-网络配置中的调用
-++++++++++++++++
-
-调用这个PyDataProvider2的方法,基本上和MNIST样例一致,除了
-
-* 在配置中需要读取外部字典。
-* 在声明DataProvider的时候传入dictionary作为参数。
-
-.. literalinclude:: src/sentimental_config.py
- :emphasize-lines: 12-14
-
-参考(Reference)
----------------
-
-@provider
-+++++++++
-
-``@provider`` 是一个Python的 `Decorator`_ ,可以将某一个函数标记成一个PyDataProvider2。如果不了解 `Decorator`_ 是什么也没关系,只需知道这是一个标记属性的方法就可以了。它包含的属性参数如下:
-
-* input_types:数据输入格式。具体的格式说明,请参考 `input_types`_ 。
-* should_shuffle:是不是要对数据做Shuffle。训练时默认shuffle,测试时默认不shuffle。
-* min_pool_size:设置内存中最小暂存的数据条数,也是PaddlePaddle所能够保证的shuffle粒度。如果为-1,则会预先读取全部数据到内存中。
-* pool_size: 设置内存中暂存的数据条数。如果为-1(默认),则不在乎内存暂存多少条数据。如果设置,则推荐大于训练时batch size的值,并且在内存足够的情况下越大越好。
-* can_over_batch_size:是否允许暂存略微多余pool_size的数据。由于这样做可以避免很多死锁问题,一般推荐设置成True。
-* calc_batch_size:可以传入一个函数,用于自定义每条数据的batch size(默认为1)。
-* cache: 数据缓存的策略,具体请参考 `cache`_ 。
-* init_hook:初始化时调用的函数,具体请参考 `init_hook`_ 。
-* check:如果为true,会根据input_types检查数据的合法性。
-* check_fail_continue:如果为true,那么当check出数据不合法时,会扔到这条数据,继续训练或预测。(对check=false的情况,没有作用)
-
-input_types
-+++++++++++
-
-PaddlePaddle的数据包括四种主要类型,和三种序列模式。
-
-四种数据类型:
-
-* dense_vector:稠密的浮点数向量。
-* sparse_binary_vector:稀疏的01向量,即大部分值为0,但有值的地方必须为1。
-* sparse_float_vector:稀疏的向量,即大部分值为0,但有值的部分可以是任何浮点数。
-* integer:整数标签。
-
-三种序列模式:
-
-* SequenceType.NO_SEQUENCE:不是一条序列
-* SequenceType.SEQUENCE:是一条时间序列
-* SequenceType.SUB_SEQUENCE: 是一条时间序列,且序列的每一个元素还是一个时间序列。
-
-不同的数据类型和序列模式返回的格式不同,列表如下:
-
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| | NO_SEQUENCE | SEQUENCE | SUB_SEQUENCE |
-+======================+=====================+===================================+================================================+
-| dense_vector | [f, f, ...] | [[f, ...], [f, ...], ...] | [[[f, ...], ...], [[f, ...], ...],...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_binary_vector | [i, i, ...] | [[i, ...], [i, ...], ...] | [[[i, ...], ...], [[i, ...], ...],...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_float_vector | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| integer_value | i | [i, i, ...] | [[i, ...], [i, ...], ...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-
-其中,f代表一个浮点数,i代表一个整数。
-
-注意:对sparse_binary_vector和sparse_float_vector,PaddlePaddle存的是有值位置的索引。例如,
-
-- 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ,类型是sparse_binary_vector,返回的是 ``[1, 2]`` 。
-- 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ,类型是sparse_float_vector,返回的是 ``[(1, 0.5), (2, 0.7)]`` 。
-
-init_hook
-+++++++++
-
-init_hook可以传入一个函数。该函数在初始化的时候会被调用,其参数如下:
-
-* 第一个参数是settings对象,它和数据传入函数的第一个参数(如本例中 ``process`` 函数的 ``settings`` 参数)必须一致。该对象具有以下两个属性:
- * settings.input_types:数据输入格式,具体请参考 `input_types`_ 。
- * settings.logger:一个logging对象。
-* 其他参数使用 ``kwargs`` (key word arguments)传入,包括以下两种:
- * PaddlePaddle定义的参数: 1)is_train:bool型参数,表示用于训练或预测;2)file_list:所有文件列表。
- * 用户定义的参数:使用args在网络配置中设置。
-
-注意:PaddlePaddle保留添加参数的权力,因此init_hook尽量使用 ``**kwargs`` 来接受不使用的函数以保证兼容性。
-
-cache
-+++++
-
-PyDataProvider2提供了两种简单的Cache策略:
-
-* CacheType.NO_CACHE:不缓存任何数据,每次都会从python端读取数据
-* CacheType.CACHE_PASS_IN_MEM:第一个pass会从python端读取数据,剩下的pass会直接从内存里
- 读取数据。
-
-
-注意事项
---------
-
-可能的内存泄露问题
-++++++++++++++++++
-
-PaddlePaddle将train.list中的每一行都传递给process函数,从而生成多个generator。当训练数据非常多时,就会生成非常多的generator。
-
-虽然每个generator在没有调用的时候,是几乎不占内存的;但当调用过一次后,generator便会存下当前的上下文(Context),而这个Context可能会非常大。并且,generator至少需要调用两次才会知道是否停止。所以,即使process函数里面只有一个yield,也需要两次随机选择到相同generator的时候,才会释放该段内存。
-
-.. code-block:: python
-
- def func():
- yield 0
-
- f = func() # 创建generator
- tmp = next(f) # 调用一次,返回0
- tmp = next(f) # 调用第二次的时候,才会Stop Iteration
-
-由于顺序调用这些generator不会出现上述问题,因此有两种解决方案:
-
-1. **最佳推荐**:将样本的地址放入另一个文本文件,train.list写入那个文本文件的地址。即不要将每一个样本都放入train.list。
-2. 在generator的上下文中尽量留下非常少的变量引用,例如
-
-.. code-block:: python
-
- def real_process(fn):
- # ... read from fn
- return result # 当函数返回的时候,python可以解除掉内部变量的引用。
-
- def process(fn):
- yield real_process(fn)
-
-注意:这个问题是PyDataProvider读数据时候的逻辑问题,很难整体修正。
-
-内存不够用的情况
-++++++++++++++++
-
-PyDataProvider2会尽可能多的使用内存。因此,对于内存较小的机器,推荐使用 ``pool_size`` 变量来设置内存中暂存的数据条。具体请参考 `@provider`_ 中的说明。
-
diff --git a/doc/api/v1/data_provider/pydataprovider2_en.rst b/doc/api/v1/data_provider/pydataprovider2_en.rst
deleted file mode 100644
index e8fb629277..0000000000
--- a/doc/api/v1/data_provider/pydataprovider2_en.rst
+++ /dev/null
@@ -1,249 +0,0 @@
-.. _api_pydataprovider2:
-
-PyDataProvider2
-===============
-
-We highly recommand users to use PyDataProvider2 to provide training or testing
-data to PaddlePaddle. The user only needs to focus on how to read a single
-sample from the original data file by using PyDataProvider2, leaving all of the
-trivial work, including, transfering data into cpu/gpu memory, shuffle, binary
-serialization to PyDataProvider2. PyDataProvider2 uses multithreading and a
-fanscinating but simple cache strategy to optimize the efficiency of the data
-providing process.
-
-DataProvider for the non-sequential model
------------------------------------------
-
-Here we use the MNIST handwriting recognition data as an example to illustrate
-how to write a simple PyDataProvider.
-
-MNIST is a handwriting classification data set. It contains 70,000 digital
-grayscale images. Labels of the training sample range from 0 to 9. All the
-images have been size-normalized and centered into images with the same size
-of 28 x 28 pixels.
-
-A small part of the original data as an example is shown as below:
-
-.. literalinclude:: src/mnist_train.txt
-
-Each line of the data contains two parts, separated by :code:`;`. The first part is
-label of an image. The second part contains 28x28 pixel float values.
-
-Just write path of the above data into train.list. It looks like this:
-
-.. literalinclude:: src/train.list
-
-The corresponding dataprovider is shown as below:
-
-.. literalinclude:: src/mnist_provider.dict.py
-
-The first line imports PyDataProvider2 package.
-The main function is the process function, that has two parameters.
-The first parameter is the settings, which is not used in this example.
-The second parameter is the filename, that is exactly each line of train.list.
-This parameter is passed to the process function by PaddlePaddle.
-
-:code:`@provider` is a Python
-`Decorator `_ .
-It sets some properties to DataProvider, and constructs a real PaddlePaddle
-DataProvider from a very simple user implemented python function. It does not
-matter if you are not familiar with `Decorator`_. You can keep it simple by
-just taking :code:`@provider` as a fixed mark above the provider function you
-implemented.
-
-`input_types`_ defines the data format that a DataProvider returns.
-In this example, it is set to a 28x28-dimensional dense vector and an integer
-scalar, whose value ranges from 0 to 9.
-`input_types`_ can be set to several kinds of input formats, please refer to the
-document of `input_types`_ for more details.
-
-
-The process method is the core part to construct a real DataProvider in
-PaddlePaddle. It implements how to open the text file, how to read one sample
-from the original text file, convert them into `input_types`_, and give them
-back to PaddlePaddle process at line 23.
-Note that data yielded by the process function must follow the same order that
-`input_types`_ are defined.
-
-
-With the help of PyDataProvider2, user can focus on how to generate ONE traning
-sample by using keywords :code:`yield`.
-:code:`yield` is a python keyword, and a concept related to it includes
-:code:`generator`.
-
-Only a few lines of codes need to be added into the training configuration file,
-you can take this as an example.
-
-.. literalinclude:: src/mnist_config.py
-
-Here we specify training data by :code:`train.list`, and no testing data is specified.
-The method which actually provide data is :code:`process`.
-
-User also can use another style to provide data, which defines the
-:code:`data_layer`'s name explicitly when `yield`. For example,
-the :code:`dataprovider` is shown as below.
-
-.. literalinclude:: src/mnist_provider.dict.py
- :linenos:
-
-If user did't give the :code:`data_layer`'s name, PaddlePaddle will use
-the order of :code:`data_layer` definition roughly to determine which feature to
-which :code:`data_layer`. This order may be not correct, so TO DEFINE THE
-:code:`data_layer`'s NAMES EXPLICITLY IS THE RECOMMANDED WAY TO PROVIDER DATA.
-
-Now, this simple example of using PyDataProvider is finished.
-The only thing that the user should know is how to generte **one sample** from
-**one data file**.
-And PaddlePadle will do all of the rest things\:
-
-* Form a training batch
-* Shuffle the training data
-* Read data with multithreading
-* Cache the training data (Optional)
-* CPU-> GPU double buffering.
-
-Is this cool?
-
-.. _api_pydataprovider2_sequential_model:
-
-DataProvider for the sequential model
--------------------------------------
-A sequence model takes sequences as its input. A sequence is made up of several
-timesteps. The so-called timestep, is not necessary to have something to do
-with time. It can also be explained to that the order of data are taken into
-consideration into model design and training.
-For example, the sentence can be interpreted as a kind of sequence data in NLP
-tasks.
-
-Here is an example on data proivider for English sentiment classification data.
-The original input data are simple English text, labeled into positive or
-negative sentiment (marked by 0 and 1 respectively).
-
-A small part of the original data as an example can be found in the path below:
-
-.. literalinclude:: src/sentimental_train.txt
-
-The corresponding data provider can be found in the path below:
-
-.. literalinclude:: src/sentimental_provider.py
-
-This data provider for sequential model is a little more complex than that
-for MINST dataset.
-A new initialization method is introduced here.
-The method :code:`on_init` is configured to DataProvider by :code:`@provider`'s
-:code:`init_hook` parameter, and it will be invoked once DataProvider is
-initialized. The :code:`on_init` function has the following parameters:
-
-* The first parameter is the settings object.
-* The rest parameters are passed by key word arguments. Some of them are passed
- by PaddlePaddle, see reference for `init_hook`_.
- The :code:`dictionary` object is a python dict object passed from the trainer
- configuration file, and it maps word string to word id.
-
-To pass these parameters into DataProvider, the following lines should be added
-into trainer configuration file.
-
-.. literalinclude:: src/sentimental_config.py
-
-The definition is basically same as MNIST example, except:
-* Load dictionary in this configuration
-* Pass it as a parameter to the DataProvider
-
-The `input_types` is configured in method :code:`on_init`. It has the same
-effect to configure them by :code:`@provider`'s :code:`input_types` parameter.
-However, the :code:`input_types` is set at runtime, so we can set it to
-different types according to the input data. Input of the neural network is a
-sequence of word id, so set :code:`seq_type` to :code:`integer_value_sequence`.
-
-Durning :code:`on_init`, we save :code:`dictionary` variable to
-:code:`settings`, and it will be used in :code:`process`. Note the settings
-parameter for the process function and for the on_init's function are a same
-object.
-
-The basic processing logic is the same as MNIST's :code:`process` method. Each
-sample in the data file is given back to PaddlePaddle process.
-
-Thus, the basic usage of PyDataProvider is here.
-Please refer to the following section reference for details.
-
-Reference
----------
-
-@provider
-+++++++++
-
-.. autofunction:: paddle.trainer.PyDataProvider2.provider
-
-input_types
-+++++++++++
-
-PaddlePaddle has four data types, and three sequence types.
-The four data types are:
-
-* :code:`dense_vector`: dense float vector.
-* :code:`sparse_binary_vector`: sparse binary vector, most of the value is 0, and
- the non zero elements are fixed to 1.
-* :code:`sparse_float_vector`: sparse float vector, most of the value is 0, and some
- non zero elements can be any float value. They are given by the user.
-* :code:`integer`: an integer scalar, that is especially used for label or word index.
-
-The three sequence types are:
-
-* :code:`SequenceType.NO_SEQUENCE` means the sample is not a sequence.
-* :code:`SequenceType.SEQUENCE` means the sample is a sequence.
-* :code:`SequenceType.SUB_SEQUENCE` means it is a nested sequence, that each timestep of
- the input sequence is also a sequence.
-
-Different input type has a defferenct input format. Their formats are shown
-in the above table.
-
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| | NO_SEQUENCE | SEQUENCE | SUB_SEQUENCE |
-+======================+=====================+===================================+================================================+
-| dense_vector | [f, f, ...] | [[f, ...], [f, ...], ...] | [[[f, ...], ...], [[f, ...], ...],...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_binary_vector | [i, i, ...] | [[i, ...], [i, ...], ...] | [[[i, ...], ...], [[i, ...], ...],...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_float_vector | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| integer_value | i | [i, i, ...] | [[i, ...], [i, ...], ...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-
-where f represents a float value, i represents an integer value.
-
-init_hook
-+++++++++
-
-init_hook is a function that is invoked once the data provoder is initialized.
-Its parameters lists as follows:
-
-* The first parameter is a settings object, which is the same to :code:`settings`
- in :code:`process` method. The object contains several attributes, including:
-
- * :code:`settings.input_types`: the input types. Reference `input_types`_.
- * :code:`settings.logger`: a logging object.
-
-* The rest parameters are the key word arguments. It is made up of PaddpePaddle
- pre-defined parameters and user defined parameters.
-
- * PaddlePaddle-defined parameters including:
-
- * :code:`is_train` is a bool parameter that indicates the DataProvider is used in
- training or testing.
- * :code:`file_list` is the list of all files.
-
- * User-defined parameters args can be set in training configuration.
-
-Note, PaddlePaddle reserves the right to add pre-defined parameter, so please
-use :code:`**kwargs` in init_hook to ensure compatibility by accepting the
-parameters which your init_hook does not use.
-
-cache
-+++++
-DataProvider provides two simple cache strategy. They are:
-
-* :code:`CacheType.NO_CACHE` means do not cache any data, then data is read at runtime by
- the user implemented python module every pass.
-* :code:`CacheType.CACHE_PASS_IN_MEM` means the first pass reads data by the user
- implemented python module, and the rest passes will directly read data from
- memory.
diff --git a/doc/api/v1/data_provider/src/mnist_config.py b/doc/api/v1/data_provider/src/mnist_config.py
deleted file mode 100644
index 429338c57f..0000000000
--- a/doc/api/v1/data_provider/src/mnist_config.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from paddle.trainer_config_helpers import *
-
-define_py_data_sources2(
- train_list='train.list',
- test_list=None,
- module='mnist_provider',
- obj='process')
-
-img = data_layer(name='pixel', size=784)
-label = data_layer(name='label', size=10)
diff --git a/doc/api/v1/data_provider/src/mnist_provider.dict.py b/doc/api/v1/data_provider/src/mnist_provider.dict.py
deleted file mode 100644
index 2ba0b126a0..0000000000
--- a/doc/api/v1/data_provider/src/mnist_provider.dict.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from paddle.trainer.PyDataProvider2 import *
-
-
-# Define a py data provider
-@provider(
- input_types={'pixel': dense_vector(28 * 28),
- 'label': integer_value(10)})
-def process(settings, filename): # settings is not used currently.
- f = open(filename, 'r') # open one of training file
-
- for line in f: # read each line
- label, pixel = line.split(';')
-
- # get features and label
- pixels_str = pixel.split(' ')
-
- pixels_float = []
- for each_pixel_str in pixels_str:
- pixels_float.append(float(each_pixel_str))
-
- # give data to paddle.
- yield {"pixel": pixels_float, 'label': int(label)}
-
- f.close() # close file
diff --git a/doc/api/v1/data_provider/src/mnist_train.txt b/doc/api/v1/data_provider/src/mnist_train.txt
deleted file mode 100644
index 34be718ad9..0000000000
--- a/doc/api/v1/data_provider/src/mnist_train.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-5;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.215686 0.533333 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.67451 0.992157 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.070588 0.886275 0.992157 0 0 0 0 0 0 0 0 0 0 0.192157 0.070588 0 0 0 0 0 0 0 0 0 0 0 0 0 0.670588 0.992157 0.992157 0 0 0 0 0 0 0 0 0 0.117647 0.933333 0.858824 0.313725 0 0 0 0 0 0 0 0 0 0 0 0.090196 0.858824 0.992157 0.831373 0 0 0 0 0 0 0 0 0 0.141176 0.992157 0.992157 0.611765 0.054902 0 0 0 0 0 0 0 0 0 0 0.258824 0.992157 0.992157 0.529412 0 0 0 0 0 0 0 0 0 0.368627 0.992157 0.992157 0.419608 0.003922 0 0 0 0 0 0 0 0 0 0.094118 0.835294 0.992157 0.992157 0.517647 0 0 0 0 0 0 0 0 0 0.603922 0.992157 0.992157 0.992157 0.603922 0.545098 0.043137 0 0 0 0 0 0 0 0.447059 0.992157 0.992157 0.956863 0.062745 0 0 0 0 0 0 0 0 0.011765 0.666667 0.992157 0.992157 0.992157 0.992157 0.992157 0.745098 0.137255 0 0 0 0 0 0.152941 0.866667 0.992157 0.992157 0.521569 0 0 0 0 0 0 0 0 0 0.070588 0.992157 0.992157 0.992157 0.803922 0.352941 0.745098 0.992157 0.945098 0.317647 0 0 0 0 0.580392 0.992157 0.992157 0.764706 0.043137 0 0 0 0 0 0 0 0 0 0.070588 0.992157 0.992157 0.776471 0.043137 0 0.007843 0.27451 0.882353 0.941176 0.176471 0 0 0.180392 0.898039 0.992157 0.992157 0.313725 0 0 0 0 0 0 0 0 0 0 0.070588 0.992157 0.992157 0.713725 0 0 0 0 0.627451 0.992157 0.729412 0.062745 0 0.509804 0.992157 0.992157 0.776471 0.035294 0 0 0 0 0 0 0 0 0 0 0.494118 0.992157 0.992157 0.968627 0.168627 0 0 0 0.423529 0.992157 0.992157 0.364706 0 0.717647 0.992157 0.992157 0.317647 0 0 0 0 0 0 0 0 0 0 0 0.533333 0.992157 0.984314 0.945098 0.603922 0 0 0 0.003922 0.466667 0.992157 0.988235 0.976471 0.992157 0.992157 0.788235 0.007843 0 0 0 0 0 0 0 0 0 0 0 0.686275 0.882353 0.364706 0 0 0 0 0 0 0.098039 0.588235 0.992157 0.992157 0.992157 0.980392 0.305882 0 0 0 0 0 0 0 0 0 0 0 0 0.101961 0.67451 0.321569 0 0 0 0 0 0 0 0.105882 0.733333 0.976471 0.811765 0.713725 0 0 0 0 0 0 0 0 0 0 0 0 0 0.65098 0.992157 0.321569 0 0 0 0 0 0 0 0 0 0.25098 0.007843 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0.94902 0.219608 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.968627 0.764706 0.152941 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.498039 0.25098 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
-0;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.298039 0.333333 0.333333 0.333333 0.337255 0.333333 0.333333 0.109804 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.027451 0.223529 0.776471 0.964706 0.988235 0.988235 0.988235 0.992157 0.988235 0.988235 0.780392 0.098039 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.14902 0.698039 0.988235 0.992157 0.988235 0.901961 0.87451 0.568627 0.882353 0.976471 0.988235 0.988235 0.501961 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.188235 0.647059 0.988235 0.988235 0.745098 0.439216 0.098039 0 0 0 0.572549 0.988235 0.988235 0.988235 0 0 0 0 0 0 0 0 0 0 0 0 0 0.2 0.933333 0.992157 0.941176 0.247059 0 0 0 0 0 0 0.188235 0.898039 0.992157 0.992157 0 0 0 0 0 0 0 0 0 0 0 0.039216 0.639216 0.933333 0.988235 0.913725 0.278431 0 0 0 0 0 0 0 0.113725 0.843137 0.988235 0.988235 0 0 0 0 0 0 0 0 0 0 0 0.235294 0.988235 0.992157 0.988235 0.815686 0.07451 0 0 0 0 0 0 0 0.333333 0.988235 0.988235 0.552941 0 0 0 0 0 0 0 0 0 0 0.211765 0.878431 0.988235 0.992157 0.701961 0.329412 0.109804 0 0 0 0 0 0 0 0.698039 0.988235 0.913725 0.145098 0 0 0 0 0 0 0 0 0 0.188235 0.890196 0.988235 0.988235 0.745098 0.047059 0 0 0 0 0 0 0 0 0 0.882353 0.988235 0.568627 0 0 0 0 0 0 0 0 0 0.2 0.933333 0.992157 0.992157 0.992157 0.447059 0.294118 0 0 0 0 0 0 0 0 0.447059 0.992157 0.768627 0 0 0 0 0 0 0 0 0 0 0.623529 0.988235 0.988235 0.988235 0.988235 0.992157 0.47451 0 0 0 0 0 0 0 0.188235 0.933333 0.87451 0.509804 0 0 0 0 0 0 0 0 0 0 0.992157 0.988235 0.937255 0.792157 0.988235 0.894118 0.082353 0 0 0 0 0 0 0.027451 0.647059 0.992157 0.654902 0 0 0 0 0 0 0 0 0 0 0 0.623529 0.988235 0.913725 0.329412 0.376471 0.184314 0 0 0 0 0 0 0.027451 0.513725 0.988235 0.635294 0.219608 0 0 0 0 0 0 0 0 0 0 0 0.196078 0.929412 0.988235 0.988235 0.741176 0.309804 0 0 0 0 0 0 0.529412 0.988235 0.678431 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.223529 0.992157 0.992157 1 0.992157 0.992157 0.992157 0.992157 1 0.992157 0.992157 0.882353 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.023529 0.478431 0.654902 0.658824 0.952941 0.988235 0.988235 0.988235 0.992157 0.988235 0.729412 0.278431 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.196078 0.647059 0.764706 0.764706 0.768627 0.580392 0.047059 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
-4;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.180392 0.470588 0.623529 0.623529 0.623529 0.588235 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.243137 0.494118 0.862745 0.870588 0.960784 0.996078 0.996078 0.996078 0.996078 0.992157 0.466667 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.317647 0.639216 0.639216 0.639216 0.639216 0.639216 0.470588 0.262745 0.333333 0.929412 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.811765 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.811765 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.811765 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.184314 0.992157 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.192157 0.996078 0.384314 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.454902 0.980392 0.219608 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.564706 0.941176 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.588235 0.776471 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.945098 0.560784 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.054902 0.952941 0.356863 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.337255 0.917647 0.109804 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.698039 0.701961 0.019608 0.4 0.662745 0.662745 0.662745 0.662745 0.662745 0.662745 0.662745 0.376471 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.090196 0.639216 0.972549 0.945098 0.913725 0.996078 0.996078 0.996078 0.996078 1 0.996078 0.996078 1 0.996078 0 0 0 0 0 0 0 0 0 0 0.007843 0.105882 0.717647 0.776471 0.905882 0.996078 0.996078 0.988235 0.980392 0.862745 0.537255 0.223529 0.223529 0.368627 0.376471 0.6 0.6 0.6 0 0 0 0 0 0 0 0 0.262745 0.470588 0.6 0.996078 0.996078 0.996078 0.996078 0.847059 0.356863 0.156863 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.909804 0.705882 0.823529 0.635294 0.490196 0.219608 0.113725 0.062745 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.152941 0.152941 0.156863 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
diff --git a/doc/api/v1/data_provider/src/sentimental_config.py b/doc/api/v1/data_provider/src/sentimental_config.py
deleted file mode 100644
index 7ce71608a2..0000000000
--- a/doc/api/v1/data_provider/src/sentimental_config.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from paddle.trainer_config_helpers import *
-
-dictionary = dict()
-... # read dictionary from outside
-
-define_py_data_sources2(
- train_list='train.list',
- test_list=None,
- module='sentimental_provider',
- obj='process',
-    # The code above is the same as in the mnist sample.
- args={ # pass to provider.
- 'dictionary': dictionary
- })
diff --git a/doc/api/v1/data_provider/src/sentimental_provider.py b/doc/api/v1/data_provider/src/sentimental_provider.py
deleted file mode 100644
index 14bd0e05a9..0000000000
--- a/doc/api/v1/data_provider/src/sentimental_provider.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from paddle.trainer.PyDataProvider2 import *
-
-
-def on_init(settings, dictionary, **kwargs):
-    # on_init is invoked when the data provider is initialized. The dictionary
-    # is passed from trainer_config, and is a dict object mapping
-    # word string => word id.
-
-    # Set input types at runtime. It does the same thing as
-    # @provider(input_types) does, but the types are set dynamically at runtime.
- settings.input_types = {
-        # The text is a sequence of integer values, and each value is a word id.
-        # The whole sequence is the sentence whose sentiment we want
-        # to predict.
- 'data': integer_value_sequence(len(dictionary)), # text input
- 'label': integer_value(2) # label positive/negative
- }
-
- # save dictionary as settings.dictionary.
- # It will be used in process method.
- settings.dictionary = dictionary
-
-
-@provider(init_hook=on_init)
-def process(settings, filename):
- f = open(filename, 'r')
-
- for line in f: # read each line of file
- label, sentence = line.split('\t') # get label and sentence
- words = sentence.split(' ') # get words
-
-        # convert word strings to word ids;
-        # words not in the dictionary will be ignored.
- word_ids = []
-
- for each_word in words:
- if each_word in settings.dictionary:
- word_ids.append(settings.dictionary[each_word])
-
- # give data to paddle.
- yield word_ids, int(label)
-
- f.close()
diff --git a/doc/api/v1/data_provider/src/sentimental_train.txt b/doc/api/v1/data_provider/src/sentimental_train.txt
deleted file mode 100644
index 0060ac267c..0000000000
--- a/doc/api/v1/data_provider/src/sentimental_train.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-0 I saw this movie at the AFI Dallas festival . It all takes place at a lake house and it looks wonderful .
-1 This documentary makes you travel all around the globe . It contains rare and stunning sequels from the wilderness .
-...
diff --git a/doc/api/v1/data_provider/src/train.list b/doc/api/v1/data_provider/src/train.list
deleted file mode 100644
index 92bdc0a8b4..0000000000
--- a/doc/api/v1/data_provider/src/train.list
+++ /dev/null
@@ -1 +0,0 @@
-mnist_train.txt
diff --git a/doc/api/v1/index_cn.rst b/doc/api/v1/index_cn.rst
deleted file mode 100644
index cf146dc088..0000000000
--- a/doc/api/v1/index_cn.rst
+++ /dev/null
@@ -1,37 +0,0 @@
-API中文手册
-============
-
-DataProvider API
-----------------
-
-.. toctree::
- :maxdepth: 1
-
- data_provider/dataprovider_cn.rst
- data_provider/pydataprovider2_cn.rst
-
-.. _api_trainer_config:
-
-Model Config API
-----------------
-
-.. toctree::
- :maxdepth: 1
-
- trainer_config_helpers/optimizers.rst
- trainer_config_helpers/data_sources.rst
- trainer_config_helpers/layers.rst
- trainer_config_helpers/activations.rst
- trainer_config_helpers/poolings.rst
- trainer_config_helpers/networks.rst
- trainer_config_helpers/evaluators.rst
- trainer_config_helpers/attrs.rst
-
-
-Applications API
-----------------
-
-.. toctree::
- :maxdepth: 1
-
- predict/swig_py_paddle_cn.rst
diff --git a/doc/api/v1/index_en.rst b/doc/api/v1/index_en.rst
deleted file mode 100644
index 10c297a71d..0000000000
--- a/doc/api/v1/index_en.rst
+++ /dev/null
@@ -1,37 +0,0 @@
-API
-===
-
-DataProvider API
-----------------
-
-.. toctree::
- :maxdepth: 1
-
- data_provider/dataprovider_en.rst
- data_provider/pydataprovider2_en.rst
-
-.. _api_trainer_config:
-
-Model Config API
-----------------
-
-.. toctree::
- :maxdepth: 1
-
- trainer_config_helpers/optimizers.rst
- trainer_config_helpers/data_sources.rst
- trainer_config_helpers/layers.rst
- trainer_config_helpers/activations.rst
- trainer_config_helpers/poolings.rst
- trainer_config_helpers/networks.rst
- trainer_config_helpers/evaluators.rst
- trainer_config_helpers/attrs.rst
-
-
-Applications API
-----------------
-
-.. toctree::
- :maxdepth: 1
-
- predict/swig_py_paddle_en.rst
diff --git a/doc/api/v1/predict/src/predict_sample.py b/doc/api/v1/predict/src/predict_sample.py
deleted file mode 100644
index 51349250e8..0000000000
--- a/doc/api/v1/predict/src/predict_sample.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from py_paddle import swig_paddle, DataProviderConverter
-from paddle.trainer.PyDataProvider2 import dense_vector
-from paddle.trainer.config_parser import parse_config
-
-TEST_DATA = [[[
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.215686, 0.533333, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.67451, 0.992157, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0.070588, 0.886275, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.192157,
- 0.070588, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.670588, 0.992157,
- 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.117647, 0.933333, 0.858824, 0.313725,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.090196, 0.858824, 0.992157, 0.831373, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0.141176, 0.992157, 0.992157, 0.611765, 0.054902, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.258824, 0.992157, 0.992157, 0.529412, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0.368627, 0.992157, 0.992157, 0.419608, 0.003922, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0.094118, 0.835294, 0.992157, 0.992157, 0.517647, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0.603922, 0.992157, 0.992157, 0.992157, 0.603922,
- 0.545098, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0.447059, 0.992157, 0.992157,
- 0.956863, 0.062745, 0, 0, 0, 0, 0, 0, 0, 0, 0.011765, 0.666667, 0.992157,
- 0.992157, 0.992157, 0.992157, 0.992157, 0.745098, 0.137255, 0, 0, 0, 0, 0,
- 0.152941, 0.866667, 0.992157, 0.992157, 0.521569, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0.070588, 0.992157, 0.992157, 0.992157, 0.803922, 0.352941, 0.745098,
- 0.992157, 0.945098, 0.317647, 0, 0, 0, 0, 0.580392, 0.992157, 0.992157,
- 0.764706, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.070588, 0.992157, 0.992157,
- 0.776471, 0.043137, 0, 0.007843, 0.27451, 0.882353, 0.941176, 0.176471, 0,
- 0, 0.180392, 0.898039, 0.992157, 0.992157, 0.313725, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0.070588, 0.992157, 0.992157, 0.713725, 0, 0, 0, 0, 0.627451,
- 0.992157, 0.729412, 0.062745, 0, 0.509804, 0.992157, 0.992157, 0.776471,
- 0.035294, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.494118, 0.992157, 0.992157,
- 0.968627, 0.168627, 0, 0, 0, 0.423529, 0.992157, 0.992157, 0.364706, 0,
- 0.717647, 0.992157, 0.992157, 0.317647, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0.533333, 0.992157, 0.984314, 0.945098, 0.603922, 0, 0, 0, 0.003922,
- 0.466667, 0.992157, 0.988235, 0.976471, 0.992157, 0.992157, 0.788235,
- 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.686275, 0.882353, 0.364706, 0,
- 0, 0, 0, 0, 0, 0.098039, 0.588235, 0.992157, 0.992157, 0.992157, 0.980392,
- 0.305882, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.101961, 0.67451, 0.321569,
- 0, 0, 0, 0, 0, 0, 0, 0.105882, 0.733333, 0.976471, 0.811765, 0.713725, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.65098, 0.992157, 0.321569, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0.25098, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
- 0.94902, 0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0.968627, 0.764706, 0.152941, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.498039, 0.25098, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0
-]], [[
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0.298039, 0.333333, 0.333333, 0.333333, 0.337255,
- 0.333333, 0.333333, 0.109804, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0.027451, 0.223529, 0.776471, 0.964706, 0.988235, 0.988235, 0.988235,
- 0.992157, 0.988235, 0.988235, 0.780392, 0.098039, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0.14902, 0.698039, 0.988235, 0.992157, 0.988235, 0.901961,
- 0.87451, 0.568627, 0.882353, 0.976471, 0.988235, 0.988235, 0.501961, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.188235, 0.647059, 0.988235, 0.988235,
- 0.745098, 0.439216, 0.098039, 0, 0, 0, 0.572549, 0.988235, 0.988235,
- 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0.933333, 0.992157,
- 0.941176, 0.247059, 0, 0, 0, 0, 0, 0, 0.188235, 0.898039, 0.992157,
- 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.039216, 0.639216, 0.933333,
- 0.988235, 0.913725, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0.113725, 0.843137,
- 0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.235294, 0.988235,
- 0.992157, 0.988235, 0.815686, 0.07451, 0, 0, 0, 0, 0, 0, 0, 0.333333,
- 0.988235, 0.988235, 0.552941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.211765,
- 0.878431, 0.988235, 0.992157, 0.701961, 0.329412, 0.109804, 0, 0, 0, 0, 0,
- 0, 0, 0.698039, 0.988235, 0.913725, 0.145098, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0.188235, 0.890196, 0.988235, 0.988235, 0.745098, 0.047059, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0.882353, 0.988235, 0.568627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2,
- 0.933333, 0.992157, 0.992157, 0.992157, 0.447059, 0.294118, 0, 0, 0, 0, 0,
- 0, 0, 0, 0.447059, 0.992157, 0.768627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0.623529, 0.988235, 0.988235, 0.988235, 0.988235, 0.992157, 0.47451, 0, 0,
- 0, 0, 0, 0, 0, 0.188235, 0.933333, 0.87451, 0.509804, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0.992157, 0.988235, 0.937255, 0.792157, 0.988235, 0.894118,
- 0.082353, 0, 0, 0, 0, 0, 0, 0.027451, 0.647059, 0.992157, 0.654902, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0.623529, 0.988235, 0.913725, 0.329412, 0.376471,
- 0.184314, 0, 0, 0, 0, 0, 0, 0.027451, 0.513725, 0.988235, 0.635294,
- 0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.929412, 0.988235,
- 0.988235, 0.741176, 0.309804, 0, 0, 0, 0, 0, 0, 0.529412, 0.988235,
- 0.678431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.223529, 0.992157,
- 0.992157, 1, 0.992157, 0.992157, 0.992157, 0.992157, 1, 0.992157, 0.992157,
- 0.882353, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.023529,
- 0.478431, 0.654902, 0.658824, 0.952941, 0.988235, 0.988235, 0.988235,
- 0.992157, 0.988235, 0.729412, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.647059, 0.764706, 0.764706, 0.768627,
- 0.580392, 0.047059, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0
-]]]
-
-
-def main():
- conf = parse_config("./mnist_model/trainer_config.py", "")
- print conf.data_config.load_data_args
- network = swig_paddle.GradientMachine.createFromConfigProto(
- conf.model_config)
- assert isinstance(network, swig_paddle.GradientMachine) # For code hint.
- network.loadParameters("./mnist_model/")
- converter = DataProviderConverter([dense_vector(784)])
- inArg = converter(TEST_DATA)
- print network.forwardTest(inArg)
-
-
-if __name__ == '__main__':
- swig_paddle.initPaddle("--use_gpu=0")
- main()
diff --git a/doc/api/v1/predict/swig_py_paddle_cn.rst b/doc/api/v1/predict/swig_py_paddle_cn.rst
deleted file mode 100644
index 42f333dba2..0000000000
--- a/doc/api/v1/predict/swig_py_paddle_cn.rst
+++ /dev/null
@@ -1,58 +0,0 @@
-.. _api_swig_py_paddle:
-
-基于Python的预测
-================
-
-预测流程
---------
-
-PaddlePaddle使用swig对常用的预测接口进行了封装,通过编译会生成py_paddle软件包,安装该软件包就可以在python环境下实现模型预测。可以使用python的 ``help()`` 函数查询软件包相关API说明。
-
-基于Python的模型预测,主要包括以下五个步骤。
-
-1. 初始化PaddlePaddle环境
-
- 在程序开始阶段,通过调用 ``swig_paddle.initPaddle()`` 并传入相应的命令行参数初始化PaddlePaddle。
-
-2. 解析模型配置文件
-
- 初始化之后,可以通过调用 ``parse_config()`` 解析训练模型时用的配置文件。注意预测数据通常不包含label, 同时预测网络通常直接输出最后一层的结果而不是像训练网络一样再接一层cost layer,所以一般需要对训练用的模型配置文件稍作相应修改才能在预测时使用。
-
-3. 构造paddle.GradientMachine
-
- 通过调用 ``swig_paddle.GradientMachine.createFromConfigproto()`` 传入上一步解析出来的模型配置就可以创建一个 ``GradientMachine``。
-
-4. 准备预测数据
-
- swig_paddle中的预测接口的参数是自定义的C++数据类型,py_paddle里面提供了一个工具类 ``DataProviderConverter`` 可以用于接收和PyDataProvider2一样的输入数据并转换成预测接口所需的数据类型。
-
-5. 模型预测
-
- 通过调用 ``forwardTest()`` 传入预测数据,直接返回计算结果。
-
-
-预测Demo
---------
-
-如下是一段使用mnist model来实现手写识别的预测代码。完整的代码见 ``src_root/doc/ui/predict/predict_sample.py`` 。mnist model可以通过 ``src_root\demo\mnist`` 目录下的demo训练出来。
-
-.. literalinclude:: src/predict_sample.py
- :language: python
- :lines: 15-18,121-136
-
-
-Demo预测输出如下,其中value即为softmax层的输出。由于TEST_DATA包含两条预测数据,所以输出的value包含两个向量 。
-
-.. code-block:: text
-
- [{'id': None, 'value': array(
- [[ 5.53018653e-09, 1.12194102e-05, 1.96644767e-09,
- 1.43630644e-02, 1.51111044e-13, 9.85625684e-01,
- 2.08823112e-10, 2.32777140e-08, 2.00186201e-09,
- 1.15501715e-08],
- [ 9.99982715e-01, 1.27787406e-10, 1.72296313e-05,
- 1.49316648e-09, 1.36540484e-11, 6.93137714e-10,
- 2.70634608e-08, 3.48565123e-08, 5.25639710e-09,
- 4.48684503e-08]], dtype=float32)}]
-
-
diff --git a/doc/api/v1/predict/swig_py_paddle_en.rst b/doc/api/v1/predict/swig_py_paddle_en.rst
deleted file mode 100644
index 1c628e6971..0000000000
--- a/doc/api/v1/predict/swig_py_paddle_en.rst
+++ /dev/null
@@ -1,59 +0,0 @@
-Python Prediction
-==================
-
-PaddlePaddle offers a set of clean prediction interfaces for Python with the help of
-SWIG. The main steps to predict values in Python are:
-
-* Parse training configurations
-* Construct GradientMachine
-* Prepare data
-* Predict
-
-Here is a sample Python script that shows the typical prediction process for the
-MNIST classification problem. The complete sample code can be found at
-:code:`src_root/doc/ui/predict/predict_sample.py`.
-
-.. literalinclude:: src/predict_sample.py
- :language: python
- :lines: 15-18,90-100,101-104
-
-The module that does most of the work is py_paddle.swig_paddle. It is
-generated by SWIG and has complete documentation; for more details you can use
-Python's :code:`help()` function. Let's walk through the above Python script:
-
-* At the beginning, use :code:`swig_paddle.initPaddle()` to initialize
- PaddlePaddle with command line arguments, for more about command line arguments
- see :ref:`cmd_detail_introduction` .
-* Parse the configuration file that was used in training with :code:`parse_config()`.
-  Because prediction data usually has no label, and the output of prediction is
-  normally the output layer rather than the cost layer, you should modify
-  the configuration file accordingly before using it for prediction.
-* Create a neural network with
- :code:`swig_paddle.GradientMachine.createFromConfigproto()`, which takes the
- parsed configuration :code:`conf.model_config` as argument. Then load the
- trained parameters from the model with :code:`network.loadParameters()`.
-* Create a data converter object of utility class :code:`DataProviderConverter`.
-  - Note: As swig_paddle can only accept C++ matrices, we offer a utility
-    class DataProviderConverter that can accept the same input data as
-    PyDataProvider2; for more information please refer to the documentation
-    of :ref:`api_pydataprovider2` .
-* Do the prediction with :code:`forwardTest()`, which takes the converted
- input data and outputs the activations of the output layer.
-
-Here is a typical output:
-
-.. code-block:: text
-
- [{'id': None, 'value': array([[ 5.53018653e-09, 1.12194102e-05, 1.96644767e-09,
- 1.43630644e-02, 1.51111044e-13, 9.85625684e-01,
- 2.08823112e-10, 2.32777140e-08, 2.00186201e-09,
- 1.15501715e-08],
- [ 9.99982715e-01, 1.27787406e-10, 1.72296313e-05,
- 1.49316648e-09, 1.36540484e-11, 6.93137714e-10,
- 2.70634608e-08, 3.48565123e-08, 5.25639710e-09,
- 4.48684503e-08]], dtype=float32)}]
-
-:code:`value` is the output of the output layer; each row represents the result of
-the corresponding row in the input data, and each element represents the activation
-of the corresponding neuron in the output layer.
-
diff --git a/doc/api/v2/fluid.rst b/doc/api/v2/fluid.rst
deleted file mode 100644
index 43fc19dc49..0000000000
--- a/doc/api/v2/fluid.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-======================
-Fluid
-======================
-
-.. toctree::
- :maxdepth: 1
-
- fluid/layers.rst
- fluid/data_feeder.rst
- fluid/executor.rst
- fluid/initializer.rst
- fluid/evaluator.rst
- fluid/nets.rst
- fluid/optimizer.rst
- fluid/param_attr.rst
- fluid/profiler.rst
- fluid/regularizer.rst
-
diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst
deleted file mode 100644
index 0fa78f7dfb..0000000000
--- a/doc/api/v2/fluid/data_feeder.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-===========
-DataFeeder
-===========
-
-DataFeeder
------------
-.. automodule:: paddle.v2.fluid.data_feeder
- :members: DataFeeder
- :noindex:
diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst
deleted file mode 100644
index a23f3301d0..0000000000
--- a/doc/api/v2/fluid/evaluator.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-===========
-Evaluator
-===========
-
-Evaluator
------------
-.. automodule:: paddle.v2.fluid.evaluator
- :members: Evaluator
- :noindex:
diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst
deleted file mode 100644
index 3a283538c1..0000000000
--- a/doc/api/v2/fluid/executor.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-===========
-Executor
-===========
-
-Executor
------------
-.. automodule:: paddle.v2.fluid.executor
- :members: Executor
- :noindex:
diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst
deleted file mode 100644
index 8f587837e9..0000000000
--- a/doc/api/v2/fluid/initializer.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-===========
-Initializer
-===========
-
-
-
-Initializer
------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: Initializer
- :noindex:
-
-
-
-ConstantInitializer
--------------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: ConstantInitializer
- :noindex:
-
-
-
-UniformInitializer
-------------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: UniformInitializer
- :noindex:
-
-
-
-NormalInitializer
------------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: NormalInitializer
- :noindex:
-
-
-XavierInitializer
------------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: XavierInitializer
- :noindex:
-
-
-MSRAInitializer
----------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: MSRAInitializer
- :noindex:
-
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
deleted file mode 100644
index 89e5fec13b..0000000000
--- a/doc/api/v2/fluid/layers.rst
+++ /dev/null
@@ -1,302 +0,0 @@
-==========
-Layers
-==========
-
-
-fc
----
-.. autofunction:: paddle.v2.fluid.layers.fc
- :noindex:
-
-embedding
----------
-.. autofunction:: paddle.v2.fluid.layers.embedding
- :noindex:
-
-dynamic_lstm
-------------
-.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
- :noindex:
-
-data
----------
-.. autofunction:: paddle.v2.fluid.layers.data
- :noindex:
-
-mean
----------
-.. autofunction:: paddle.v2.fluid.layers.mean
- :noindex:
-
-mul
----------
-.. autofunction:: paddle.v2.fluid.layers.mul
- :noindex:
-
-elementwise_add
----------------
-.. autofunction:: paddle.v2.fluid.layers.elementwise_add
- :noindex:
-
-elementwise_div
----------------
-.. autofunction:: paddle.v2.fluid.layers.elementwise_div
- :noindex:
-
-
-dropout
----------
-.. autofunction:: paddle.v2.fluid.layers.dropout
- :noindex:
-
-
-reshape
----------
-.. autofunction:: paddle.v2.fluid.layers.reshape
- :noindex:
-
-
-sigmoid
----------
-.. autofunction:: paddle.v2.fluid.layers.sigmoid
- :noindex:
-
-
-scale
----------
-.. autofunction:: paddle.v2.fluid.layers.scale
- :noindex:
-
-
-transpose
----------
-.. autofunction:: paddle.v2.fluid.layers.transpose
- :noindex:
-
-
-sigmoid_cross_entropy_with_logits
----------------------------------
-.. autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
- :noindex:
-
-
-cast
----------
-.. autofunction:: paddle.v2.fluid.layers.cast
- :noindex:
-
-
-concat
----------
-.. autofunction:: paddle.v2.fluid.layers.concat
- :noindex:
-
-
-sums
----------
-.. autofunction:: paddle.v2.fluid.layers.sums
- :noindex:
-
-
-linear_chain_crf
----------
-.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
- :noindex:
-
-
-assign
----------
-.. autofunction:: paddle.v2.fluid.layers.assign
- :noindex:
-
-
-split_lod_tensor
----------
-.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
- :noindex:
-
-
-merge_lod_tensor
----------
-.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
- :noindex:
-
-cos_sim
----------
-.. autofunction:: paddle.v2.fluid.layers.cos_sim
- :noindex:
-
-
-cross_entropy
----------
-.. autofunction:: paddle.v2.fluid.layers.cross_entropy
- :noindex:
-
-
-
-square_error_cost
----------
-.. autofunction:: paddle.v2.fluid.layers.square_error_cost
- :noindex:
-
-
-accuracy
----------
-.. autofunction:: paddle.v2.fluid.layers.accuracy
- :noindex:
-
-
-sequence_conv
----------
-.. autofunction:: paddle.v2.fluid.layers.sequence_conv
- :noindex:
-
-
-conv2d
----------
-.. autofunction:: paddle.v2.fluid.layers.conv2d
- :noindex:
-
-
-sequence_pool
----------
-.. autofunction:: paddle.v2.fluid.layers.sequence_pool
- :noindex:
-
-
-pool2d
----------
-.. autofunction:: paddle.v2.fluid.layers.pool2d
- :noindex:
-
-
-batch_norm
----------
-.. autofunction:: paddle.v2.fluid.layers.batch_norm
- :noindex:
-
-
-beam_search_decode
----------
-.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
- :noindex:
-
-
-lstm
----------
-.. autofunction:: paddle.v2.fluid.layers.lstm
- :noindex:
-
-
-lod_rank_table
----------
-.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
- :noindex:
-
-
-max_sequence_len
----------
-.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
- :noindex:
-
-
-topk
----------
-.. autofunction:: paddle.v2.fluid.layers.topk
- :noindex:
-
-
-lod_tensor_to_array
----------
-.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
- :noindex:
-
-
-
-array_to_lod_tensor
----------
-.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
- :noindex:
-
-
-
-
-fill_constant
----------
-.. autofunction:: paddle.v2.fluid.layers.fill_constant
- :noindex:
-
-
-
-fill_constant_batch_size_like
----------
-.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
- :noindex:
-
-
-ones
----------
-.. autofunction:: paddle.v2.fluid.layers.ones
- :noindex:
-
-
-zeros
----------
-.. autofunction:: paddle.v2.fluid.layers.zeros
- :noindex:
-
-
-increment
----------
-.. autofunction:: paddle.v2.fluid.layers.increment
- :noindex:
-
-
-array_write
----------
-.. autofunction:: paddle.v2.fluid.layers.array_write
- :noindex:
-
-
-
-create_array
----------
-.. autofunction:: paddle.v2.fluid.layers.create_array
- :noindex:
-
-
-less_than
----------
-.. autofunction:: paddle.v2.fluid.layers.less_than
- :noindex:
-
-
-array_read
----------
-.. autofunction:: paddle.v2.fluid.layers.array_read
- :noindex:
-
-
-shrink_memory
----------
-.. autofunction:: paddle.v2.fluid.layers.shrink_memory
- :noindex:
-
-
-array_length
----------
-.. autofunction:: paddle.v2.fluid.layers.array_length
- :noindex:
-
-
-conv2d_transpose
----------
-.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
- :noindex:
-
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
deleted file mode 100644
index 2c3d075422..0000000000
--- a/doc/api/v2/fluid/nets.rst
+++ /dev/null
@@ -1,22 +0,0 @@
-===========
-Nets
-===========
-
-simple_img_conv_pool
------------
-.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
- :noindex:
-
-
-img_conv_group
------------
-.. autofunction:: paddle.v2.fluid.nets.img_conv_group
- :noindex:
-
-
-sequence_conv_pool
------------
-.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
- :noindex:
-
-
diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst
deleted file mode 100644
index 233762fcdf..0000000000
--- a/doc/api/v2/fluid/optimizer.rst
+++ /dev/null
@@ -1,54 +0,0 @@
-===========
-Optimizer
-===========
-
-Optimizer
------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: Optimizer
- :noindex:
-
-
-SGDOptimizer
------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: SGDOptimizer
- :noindex:
-
-
-
-MomentumOptimizer
------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: MomentumOptimizer
- :noindex:
-
-
-
-AdagradOptimizer
------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: AdagradOptimizer
- :noindex:
-
-
-AdamOptimizer
------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: AdamOptimizer
- :noindex:
-
-
-AdamaxOptimizer
------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: AdamaxOptimizer
- :noindex:
-
-
-DecayedAdagradOptimizer
------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: DecayedAdagradOptimizer
- :noindex:
-
diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst
deleted file mode 100644
index ca0c8af9e8..0000000000
--- a/doc/api/v2/fluid/param_attr.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-===========
-ParamAttr
-===========
-
-
-
-ParamAttr
------------
-.. automodule:: paddle.v2.fluid.param_attr
- :members: ParamAttr
- :noindex:
diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst
deleted file mode 100644
index 7d4042d1f4..0000000000
--- a/doc/api/v2/fluid/profiler.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-===========
-Profiler
-===========
-
-
-
-Profiler
------------
-.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
- :noindex:
diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst
deleted file mode 100644
index 3af2b07d2a..0000000000
--- a/doc/api/v2/fluid/regularizer.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-===========
-Regularizer
-===========
-
-WeightDecayRegularizer
------------
-.. automodule:: paddle.v2.fluid.regularizer
- :members: WeightDecayRegularizer
- :noindex:
-
-
-L2DecayRegularizer
------------
-.. automodule:: paddle.v2.fluid.regularizer
- :members: L2DecayRegularizer
- :noindex:
-
-
-
-L1DecayRegularizer
------------
-.. automodule:: paddle.v2.fluid.regularizer
- :members: L1DecayRegularizer
-
-
diff --git a/doc/design/auto_gradient_check.md b/doc/design/auto_gradient_check.md
deleted file mode 100644
index f9991541bc..0000000000
--- a/doc/design/auto_gradient_check.md
+++ /dev/null
@@ -1,146 +0,0 @@
-## Auto Gradient Checker Design
-
-## Background
-- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right:
-  1. You need to derive the correct backpropagation formula from the forward computation.
-  2. You need to implement it correctly in C++.
-  3. It is difficult to prepare test data.
-
-- Auto gradient checking computes a numerical gradient using only the forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
-  1. The numerical gradient checker only needs the forward operator.
-  2. The user only needs to prepare the input data for the forward Operator.
-
-## Mathematical Theory
-The following two documents from Stanford give a detailed explanation of how to compute the numerical gradient and why it is useful.
-
-- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
-- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
-
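-In essence, both references rely on the central-difference formula: perturb one input element by +/- delta, evaluate the output twice, and divide the difference by 2 * delta. A tiny, self-contained sanity check of the formula itself (illustrative only, not checker code):
-
-```python
-def f(x):  # any differentiable scalar function
-    return x**2
-
-
-delta = 0.005
-numeric = (f(3.0 + delta) - f(3.0 - delta)) / (2 * delta)
-print(numeric)  # 6.0, matches the analytic derivative 2 * x at x = 3
-```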
-
-## Numeric Gradient Implementation
-### Python Interface
-```python
-def get_numerical_gradient(op,
- input_values,
- output_name,
- input_to_check,
- delta=0.005,
- local_scope=None):
- """
- Get Numeric Gradient for an operator's input.
-
-    :param op: C++ operator instance; it could be a network.
-    :param input_values: The input variables. Should be a dictionary whose keys are
-    variable names and whose values are numpy arrays.
- :param output_name: The final output variable name.
- :param input_to_check: The input variable with respect to which to compute the gradient.
-    :param delta: The perturbation value for the numeric gradient method. The
-    smaller delta is, the more accurate the result will be. But if delta is
-    too small, it will suffer from numerical stability problems.
- :param local_scope: The local scope used for get_numeric_gradient.
- :return: The gradient array in numpy format.
- """
-```
-
-### Explanation:
-
-- Why `output_name` is needed
-  - An Operator may have multiple outputs; one can get an independent gradient from each output, so the caller should specify the name of the output variable.
-
-- Why `input_to_check` is needed
-  - One operator may have multiple inputs. The gradient Op can calculate the gradients of all these inputs at the same time, but the numerical gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient of a single input; if you need gradients for multiple inputs, call `get_numeric_gradient` multiple times.
-
-
-### Core Algorithm Implementation
-
-
-```python
-    # We only compute the gradient of one element at a time.
-    # We use a for loop to compute the gradient of each element.
- for i in xrange(tensor_size):
- # get one input element by its index i.
- origin = tensor_to_check.get_float_element(i)
-
- # add delta to it, run op and then get the new value of the result tensor.
- x_pos = origin + delta
- tensor_to_check.set_float_element(i, x_pos)
- y_pos = get_output()
-
-        # subtract delta from this element, run op and get the new value of the result tensor.
- x_neg = origin - delta
- tensor_to_check.set_float_element(i, x_neg)
- y_neg = get_output()
-
- # restore old value
- tensor_to_check.set_float_element(i, origin)
-
- # compute the gradient of this element and store it into a numpy array.
- gradient_flat[i] = (y_pos - y_neg) / delta / 2
-
- # reshape the gradient result to the shape of the source tensor.
- return gradient_flat.reshape(tensor_to_check.get_dims())
-```
-
-## Auto Gradient Checker Framework
-
-Each Operator Kernel has three kinds of Gradient:
-
-1. Numerical gradient
-2. CPU kernel gradient
-3. GPU kernel gradient (if supported)
-
-The numerical gradient only relies on the forward Operator, so we use it as the reference value. Gradient checking is then performed in the following three steps:
-
-1. calculate the numerical gradient
-2. calculate the CPU kernel gradient with the backward Operator and compare it with the numerical gradient
-3. calculate the GPU kernel gradient with the backward Operator and compare it with the numerical gradient (if supported)
-
-#### Python Interface
-
-```python
- def check_grad(self,
- forward_op,
- input_vars,
- inputs_to_check,
- output_name,
- no_grad_set=None,
- only_cpu=False,
- max_relative_error=0.005):
- """
- :param forward_op: used to create backward_op
- :param input_vars: numpy value of input variable. The following
- computation will use these variables.
- :param inputs_to_check: the input variable with respect to which to compute the gradient.
- :param output_name: The final output variable name.
- :param max_relative_error: The relative tolerance parameter.
-        :param no_grad_set: used when creating the backward ops
-        :param only_cpu: only compute and check the gradient on the CPU kernel.
- :return:
- """
-```
-
-### How to check whether two numpy arrays are close enough?
-If `abs_numerical_grad` is nearly zero, we use the absolute error instead of the relative error for `numerical_grad`:
-
-```python
-numerical_grad = ...
-operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
-
-abs_numerical_grad = numpy.abs(numerical_grad)
-# if abs_numerical_grad is nearly zero, then use abs error for numeric_grad, not relative
-# error.
-abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
-
-diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad
-max_diff = numpy.max(diff_mat)
-```
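-
-For completeness, here is a minimal, self-contained sketch of this comparison rule as a reusable check. It is illustrative only (the helper name and the sample arrays are made up); it is not the actual checker code:
-
-```python
-import numpy
-
-
-def gradients_close(numerical_grad, operator_grad, max_relative_error=0.005):
-    # Mirror the rule above: where the numerical gradient is nearly zero,
-    # fall back to an absolute error by replacing the denominator with 1.
-    abs_numerical = numpy.abs(numerical_grad)
-    abs_numerical[abs_numerical < 1e-3] = 1
-    diff_mat = numpy.abs(abs_numerical - operator_grad) / abs_numerical
-    return numpy.max(diff_mat) <= max_relative_error
-
-
-print(gradients_close(numpy.array([0.2, 0.5]), numpy.array([0.2001, 0.5004])))  # True
-print(gradients_close(numpy.array([0.2, 0.5]), numpy.array([0.3, 0.5])))  # False
-```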
-
-
-#### Notes:
-The input data for the auto gradient checker should be reasonable, to avoid numerical stability problems.
-
-
-#### Refs:
-
-- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
-- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
diff --git a/doc/design/executor.md b/doc/design/executor.md
deleted file mode 100644
index b5fb6c5c3c..0000000000
--- a/doc/design/executor.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# Executor Design Doc
-
-## Motivation
-
-We use executor to do the runtime evaluation of a `ProgramDesc`.
-
-## Overview
-
-An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks, and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. The `Scope` is the container of all the variable instances, and it persists across different runs.
-
-### What does executor do?
-
-It evaluates all the operators in the `block_id`th block of a `ProgramDesc`.
-
-### What does executor NOT do?
-
-It does not do runtime optimization, meaning it does not intelligently parse the dependencies of each op and choose which ones to run and in which order to run them.
-
-It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices.
-
-## Implementation
-
-`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)
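-
-To make the description above concrete, here is a small, self-contained sketch (plain Python, not Paddle code) of the executor's job: materialize the variables of the chosen block in a scope, then run that block's operators in order. All names below are illustrative.
-
-```python
-class Scope(dict):
-    """A toy stand-in for the framework Scope: a name -> value container."""
-
-
-def executor_run(program, block_id, scope):
-    block = program[block_id]
-    for name in block["vars"]:  # instantiate all variables of the block
-        scope.setdefault(name, None)
-    for op in block["ops"]:  # run the operators in their listed order
-        op(scope)  # each op reads/writes variables in the scope
-
-
-# A toy "program" with a single block containing two "operators".
-def assign_op(scope):
-    scope["x"] = 3.0
-
-
-def scale_op(scope):
-    scope["y"] = 2.0 * scope["x"]
-
-
-program = [{"vars": ["x", "y"], "ops": [assign_op, scale_op]}]
-scope = Scope()
-executor_run(program, block_id=0, scope=scope)
-print(scope["y"])  # 6.0
-```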
diff --git a/doc/design/images/replica.png b/doc/design/images/replica.png
deleted file mode 100644
index ef59e56b01..0000000000
Binary files a/doc/design/images/replica.png and /dev/null differ
diff --git a/doc/design/images/two_phase_commit.png b/doc/design/images/two_phase_commit.png
deleted file mode 100644
index ef6f7317bd..0000000000
Binary files a/doc/design/images/two_phase_commit.png and /dev/null differ
diff --git a/doc/design/refactor/parameter_server.md b/doc/design/refactor/parameter_server.md
deleted file mode 100644
index fa3c5d7990..0000000000
--- a/doc/design/refactor/parameter_server.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# Design Doc: Operation Graph Based Parameter Server
-
-## Abstract
-
-We propose an approach to implement the parameter server. In this
-approach, there is no fundamental difference between the trainer and
-the parameter server: they both run subgraphs, but subgraphs of
-different purposes.
-
-## Background
-
-The previous implementations of the parameter server do not run a
-subgraph. Parameter initialization, optimizer computation, network
-communication and checkpointing are implemented twice, on both the
-trainer and the parameter server.
-
-It would be great if we could write the code once and use it on both the
-trainer and the parameter server: this reduces code duplication and
-improves extensibility. Given that, after the current refactoring, we
-represent everything as a computation graph on the trainer,
-representing everything as a computation graph on the parameter
-server becomes a natural extension.
-
-## Design
-
-### Graph Converter
-
-The *graph converter* converts the user-defined operation (OP) graph
-into subgraphs to be scheduled on different nodes with the following
-steps:
-
-1. OP placement: the OPs will be placed on different nodes according
-   to a heuristic that minimizes the estimated total computation
-   time. Currently we use a simple heuristic that puts parameter
-   variables on parameter server workers and everything else on trainer
-   workers.
-
-1. Add communication OPs to enable the communication between nodes.
-
-We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
-
-Below is an example of converting the user defined graph to the
-subgraphs for the trainer and the parameter server:
-
-
-
-After converting:
-
-
-
-1. The parameter variable W and its optimizer subgraph are placed on the parameter server.
-1. Operators are added to the subgraphs.
-   - *Send* sends data to the connected *Recv* operator. The
-     scheduler on the receiving node will only schedule the *Recv* operator
-     to run when the *Send* operator has run (the *Send* OP will mark
-     the *Recv* OP runnable automatically).
-   - *Enqueue* enqueues the input variable; it can block until space
-     becomes available in the queue.
-   - *Dequeue* outputs a configurable number of tensors from the
-     queue. It will block until the queue has the required number of
-     tensors.
-
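-As a rough illustration of this placement-and-connect step, here is a small, framework-agnostic Python sketch. The dictionary-based "graph", the op names and the `@GRAD` suffix are made up for the example; they are not Paddle APIs:
-
-```python
-def split_graph(ops, param_vars):
-    """Place ops that write parameter variables on the parameter server,
-    keep the rest on the trainer, and connect the two with Send/Recv."""
-    trainer, pserver = [], []
-    for op in ops:
-        target = pserver if set(op["outputs"]) & param_vars else trainer
-        target.append(op)
-    for var in param_vars:
-        trainer.append({"type": "Send", "inputs": [var + "@GRAD"], "outputs": []})
-        pserver.insert(0, {"type": "Recv", "inputs": [], "outputs": [var + "@GRAD"]})
-    return trainer, pserver
-
-
-ops = [
-    {"type": "mul", "inputs": ["x", "W"], "outputs": ["y"]},  # forward op
-    {"type": "sgd", "inputs": ["W", "W@GRAD"], "outputs": ["W"]},  # optimizer op
-]
-trainer_ops, pserver_ops = split_graph(ops, param_vars={"W"})
-print([op["type"] for op in trainer_ops])  # ['mul', 'Send']
-print([op["type"] for op in pserver_ops])  # ['Recv', 'sgd']
-```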
-
-### Benefits
-
-- Model parallelism becomes easier to implement: it is an extension of
-  the trainer - parameter server approach. We already have the
-  communication OPs, but need to extend the graph converter's
-  placement functionality.
-
-- A user-defined optimizer is easier to add - the user can now express it as
-  a subgraph.
-
-- No more duplicated logic inside the trainer and the parameter
-  server, as mentioned in the background section.
-
-### Challenges
-
-- It might be hard for the graph converter to cut a general graph
- (without any hint for which subgraph is the optimizer). We may need
- to label which subgraph inside the OP graph is the optimizer.
-
-- It's important to balance the parameter shards across multiple
-  parameter servers. If a single parameter is very big (e.g., some
-  word-embedding, fully connected or softmax layers), we need to
-  automatically partition that single parameter onto different
-  parameter servers when possible (only element-wise optimizers depend
-  on the parameter variable).
-
-### Discussion
-
-- In the "Async SGD" figure, the "W" variable on the parameter server
-  could be read and written concurrently; what is our locking strategy?
-  E.g., should each variable have a C++ lock method invoked by every
-  OP, or should there be a lock OP?
-
-- Can the Enqueue OP be implemented under our current tensor design
- (puts the input tensor into the queue tensor)?
-
-- The *Dequeue* OP will have a variable number of outputs (depending on the
-  `min_count` attribute); does our current design support it? (A similar
-  question applies to the *Add* OP.)
-
-
-### References:
-[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
diff --git a/doc/design/refactor/session.md b/doc/design/refactor/session.md
deleted file mode 100644
index 1d9a26683c..0000000000
--- a/doc/design/refactor/session.md
+++ /dev/null
@@ -1,180 +0,0 @@
-# Design Doc: Session
-
-## Abstract
-
-The *session* object encapsulates the environment in which the
-computation graph is executed.
-
-We will have the *local* session and *remote* session, they offer the
-same [interface](#interface). The local session encapsulates the local
-runtime environment and the remote session encapsulates the cluster
-runtime environment.
-
-The local runtime environment contains:
-
-1. computation devices (i.e., CPU, GPU) handles, and
-1. the [scope](../scope.md) which holds all variables.
-
-The remote runtime environment contains:
-
-1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster,
- and
-1. the distributed [scope](../scope.md) in a cluster which holds all
- variables.
-
-The user can create a remote session on Paddle Cloud and evaluate the
-computation graph with it. In this way, the user can control the
-remote computation resource in a cluster from his local computer.
-
-
-## Background
-
-The current design has an implicit global session in which
-`paddle.eval()` is executed. The pain point is:
-
-Since the user is not able to explicitly switch between runtime
-environments, the user cannot run a topology in two independent
-environments.
-
-For example, in reinforcement learning, the user may want to have a
-stale model for inference and a fresh model for training, and only
-replace the stale model with the fresh model periodically.
-
-Furthermore, we have no concept that encapsulates a remote environment
-that executes a computation graph.
-
-We need the session object to address the above issues.
-
-
-## Session
-
-A session is an object that owns the runtime environment. All
-computations are executed through `session.eval()`.
-
-
-### Interface
-
-```python
-eval(
- targets,
- feed_dict=None,
-)
-```
-
-Evaluates the target Operations or Variables in `targets`.
-
-- *targets*: the evaluation targets. Can be a single Operation or
- Variable, or a list with the Operations or Variables as
-  elements. The value returned by `eval()` has the same shape as the
-  `targets` argument.
-
- The PaddlePaddle program is represented by
- the [ProgramDesc](../design/program.md), `eval()` will infer the
- ProgramDesc from the given targets and run the PaddlePaddle
- program. Please
- see
- [this graph](./distributed_architecture.md#local-training-architecture) for
- the detailed illustration for the local session
- and
- [this graph](./distributed_architecture.md#distributed-training-architecture) for
- the detailed illustration for the remote session.
-
-- *feed_dict*: a dictionary that contains the tensors which override
- the edges of the computation graph.
-
-  feed_dict can not only provide the input data, it can also override any
-  OP's input:
-
- ```python
- a = pd.constant(2.0, name="a")
- b = pd.variable(name="b")
- c = pd.mul(a,b)
- sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0
- ```
-
-```python
-close()
-```
-
-Closes the session and releases the scope that the session owns.
-
-
-### Create a Local Session
-
-```python
-session(
- devices=None
-)
-```
-
-Creates a new session. One session owns one global scope, so creating
-multiple sessions will create different scopes.
-
-- *devices*: a single `string` or a list of `string` of device names,
- the corresponding devices will be the computation devices for
- `eval()`. If not specified, all available devices (e.g., all GPUs)
-  will be used. The user doesn't need to specify the CPU device since
-  it will always be used. Multiple sessions can use the same device.
-
-
-#### Example
-
-```Python
-a = paddle.constant(1.0)
-b = paddle.constant(2.0)
-c = a + b
-sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"])
-sess.eval(c)
-sess.close()
-```
-
-### Create a Remote Session
-
-```python
-create_cloud_job(
- name,
- num_trainer,
- mem_per_trainer,
- gpu_per_trainer,
- cpu_per_trainer,
- num_ps,
- mem_per_ps,
- cpu_per_ps,
-)
-```
-
-Creates a Paddle Cloud job. Fails if the job name exists.
-
-```python
-get_cloud_job(
- name
-)
-```
-
-Gets a Paddle Cloud job.
-
-```python
-remote_session(
- job
-)
-```
-
-- *job*: the Paddle Cloud job.
-
-#### Example
-
-```Python
-reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud
-image = reader.column(0)
-label = reader.column(1)
-fc1 = paddle.op.fc(image, size=256, act="sigmoid")
-fc2 = paddle.op.fc(fc1, size=10, act="softmax")
-cost = paddle.op.cross_entropy(fc2, label)
-opt = paddle.optimizer.sgd(cost)
-
-job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1)
-sess = paddle.remote_session(job)
-for i in range(1000):
- sess.eval(opt)
-sess.close()
-```
diff --git a/doc/design/refactor/src/distributed_architecture.graffle b/doc/design/refactor/src/distributed_architecture.graffle
deleted file mode 100644
index f8496e5732..0000000000
Binary files a/doc/design/refactor/src/distributed_architecture.graffle and /dev/null differ
diff --git a/doc/design/refactor/src/distributed_architecture.png b/doc/design/refactor/src/distributed_architecture.png
deleted file mode 100644
index 410c4510c6..0000000000
Binary files a/doc/design/refactor/src/distributed_architecture.png and /dev/null differ
diff --git a/doc/design/refactor/src/local_architecture.graffle b/doc/design/refactor/src/local_architecture.graffle
deleted file mode 100644
index cc7783c453..0000000000
Binary files a/doc/design/refactor/src/local_architecture.graffle and /dev/null differ
diff --git a/doc/design/refactor/src/local_architecture.png b/doc/design/refactor/src/local_architecture.png
deleted file mode 100644
index 4b999538b7..0000000000
Binary files a/doc/design/refactor/src/local_architecture.png and /dev/null differ
diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md
deleted file mode 100644
index 0b2958c1b1..0000000000
--- a/doc/design/var_desc.md
+++ /dev/null
@@ -1,69 +0,0 @@
-## Background
-PaddlePaddle divides the description of neural network computation graph into two stages: compile time and runtime.
-
-PaddlePaddle uses proto messages to describe the compile-time graph because
-
-1. Computation graph should be able to be saved to a file.
-1. In distributed training, the graph will be serialized and sent to multiple workers.
-
-The computation graph is constructed from Data Nodes and Operation Nodes. The concepts that represent them are listed in the table below.
-
-| |compile time|runtime|
-|---|---|---|
-|Data|VarDesc(proto)|Variable(cpp)|
-|Operation|OpDesc(proto)|Operator(cpp)|
-
-
-## Definition of VarDesc
-
-A VarDesc should have a name and a value. There are two kinds of variable types at compile time: `LoDTensor` and `SelectedRows`.
-
-```proto
-message VarDesc {
- required string name = 1;
- enum VarType {
- LOD_TENSOR = 0;
- SELECTED_ROWS = 1;
- }
- required VarType type = 2;
- optional LoDTensorDesc lod_desc = 3;
- optional TensorDesc selected_rows_desc = 4;
- optional bool persistable = 5 [ default = false ];
-}
-```
-
-## Definition of TensorDesc
-
-```proto
-enum DataType {
- BOOL = 0;
- INT16 = 1;
- INT32 = 2;
- INT64 = 3;
- FP16 = 4;
- FP32 = 5;
- FP64 = 6;
-}
-
-message TensorDesc {
- required DataType data_type = 1;
- repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
-}
-```
-
-A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please reference [`SelectedRows`](./selected_rows.md).
-
-## Definition of LodTensorDesc
-
-```proto
-message LoDTensorDesc {
- required TensorDesc tensor = 1;
- optional int lod_level = 2;
-}
-```
-
-A LoDTensorDesc contains a tensor and a lod_level.
-
-## Definition of Variable in Python
-
-For Variable in Python, please reference [`Python API`](./python_api.md).
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
deleted file mode 100644
index 9929767cac..0000000000
--- a/doc/faq/index_cn.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-FAQ
-====
-
-.. toctree::
- :maxdepth: 1
-
- build_and_install/index_cn.rst
- model/index_cn.rst
- parameter/index_cn.rst
- local/index_cn.rst
- cluster/index_cn.rst
diff --git a/doc/faq/local/src/reduce_min_pool_size.py b/doc/faq/local/src/reduce_min_pool_size.py
deleted file mode 100644
index 5715397cc1..0000000000
--- a/doc/faq/local/src/reduce_min_pool_size.py
+++ /dev/null
@@ -1,6 +0,0 @@
-@provider(min_pool_size=0, ...)
-def process(settings, filename):
- os.system('shuf %s > %s.shuf' % (filename, filename)) # shuffle before.
- with open('%s.shuf' % filename, 'r') as f:
- for line in f:
- yield get_sample_from_line(line)
diff --git a/doc/faq/local/src/word2vec_config.py b/doc/faq/local/src/word2vec_config.py
deleted file mode 100644
index 866b40c3d4..0000000000
--- a/doc/faq/local/src/word2vec_config.py
+++ /dev/null
@@ -1,12 +0,0 @@
-... # the settings and the data provider definition are omitted.
-DICT_DIM = 3000 # dictionary dimension.
-word_ids = data_layer('word_ids', size=DICT_DIM)
-
-emb = embedding_layer(
- input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
-emb_sum = pooling_layer(input=emb, pooling_type=SumPooling())
-predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax())
-outputs(
- classification_cost(
- input=predict, label=data_layer(
- 'label', size=DICT_DIM)))
diff --git a/doc/faq/local/src/word2vec_dataprovider.py b/doc/faq/local/src/word2vec_dataprovider.py
deleted file mode 100644
index ec2753a7d0..0000000000
--- a/doc/faq/local/src/word2vec_dataprovider.py
+++ /dev/null
@@ -1,10 +0,0 @@
-DICT_DIM = 3000
-
-
-@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)])
-def process(settings, filename):
- with open(filename) as f:
- # yield word ids to predict inner word id
- # such as [28, 29, 10, 4], 4
-        # It means the sentence is 28, 29, 4, 10, 4.
- yield read_next_from_file(f)
diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt
new file mode 100644
index 0000000000..cc999f5a8d
--- /dev/null
+++ b/doc/fluid/CMakeLists.txt
@@ -0,0 +1,49 @@
+if(NOT DEFINED SPHINX_THEME)
+ set(SPHINX_THEME default)
+endif()
+
+if(NOT DEFINED SPHINX_THEME_DIR)
+ set(SPHINX_THEME_DIR)
+endif()
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+ "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+ "${BINARY_BUILD_DIR_EN}/conf.py"
+ @ONLY)
+
+sphinx_add_target(paddle_fluid_docs
+ html
+ ${BINARY_BUILD_DIR_EN}
+ ${SPHINX_CACHE_DIR_EN}
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${SPHINX_HTML_DIR_EN})
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+ "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
+ "${BINARY_BUILD_DIR_CN}/conf.py"
+ @ONLY)
+
+sphinx_add_target(paddle_fluid_docs_cn
+ html
+ ${BINARY_BUILD_DIR_CN}
+ ${SPHINX_CACHE_DIR_CN}
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${SPHINX_HTML_DIR_CN})
diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst
new file mode 100644
index 0000000000..9276236f9f
--- /dev/null
+++ b/doc/fluid/build_and_install/index_cn.rst
@@ -0,0 +1,2 @@
+安装与使用
+------------
diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst
new file mode 100644
index 0000000000..cc1e61a58a
--- /dev/null
+++ b/doc/fluid/build_and_install/index_en.rst
@@ -0,0 +1,2 @@
+Build and Install
+-----------------
diff --git a/doc/design/images/asgd.gif b/doc/fluid/design/algorithm/images/asgd.gif
similarity index 100%
rename from doc/design/images/asgd.gif
rename to doc/fluid/design/algorithm/images/asgd.gif
diff --git a/doc/design/images/theta_star.gif b/doc/fluid/design/algorithm/images/theta_star.gif
similarity index 100%
rename from doc/design/images/theta_star.gif
rename to doc/fluid/design/algorithm/images/theta_star.gif
diff --git a/doc/design/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md
similarity index 100%
rename from doc/design/parameter_average.md
rename to doc/fluid/design/algorithm/parameter_average.md
diff --git a/doc/design/build_system/README.md b/doc/fluid/design/concepts/README.md
similarity index 100%
rename from doc/design/build_system/README.md
rename to doc/fluid/design/concepts/README.md
diff --git a/doc/design/block.md b/doc/fluid/design/concepts/block.md
similarity index 97%
rename from doc/design/block.md
rename to doc/fluid/design/concepts/block.md
index 4066122c0e..907a2def55 100644
--- a/doc/design/block.md
+++ b/doc/fluid/design/concepts/block.md
@@ -202,8 +202,8 @@ This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing
During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
-VarDesc in a block should have its name scope to avoid local variables affect parent block's name scope.
-Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example:
+VarDesc in a block should have its name scope to avoid local variables affecting parent block's name scope.
+Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that is stored in the parent block. For example:
```python
a = pd.Variable(shape=[20, 20])
@@ -291,10 +291,10 @@ public:
}
void Run(const framework::Scope& scope,
- const platform::DeviceContext& dev_ctx) const override {
+ const platform::Place& place) const override {
PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
for (auto& op : runtime_table_.ops()) {
- op->Run(scope, dev_ctx);
+ op->Run(scope, place);
}
}
diff --git a/doc/fluid/design/concepts/cpp_data_feeding.md b/doc/fluid/design/concepts/cpp_data_feeding.md
new file mode 100644
index 0000000000..8607b40ccb
--- /dev/null
+++ b/doc/fluid/design/concepts/cpp_data_feeding.md
@@ -0,0 +1,171 @@
+# C++ Data Feeding
+
+While using Paddle V2 API for training, data feeding completely depends on the Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required.
+
+In this document, we show the fundamental design of a C++ data feeding process, which includes data reading, shuffling and batching.
+
+## Overview
+
+
+
+## Reader
+
+To handle the above-mentioned problem, a new concept called 'Reader' is introduced. A `Reader` is a hierarchy of classes that can be held by a `Variable` and are used to read or process file data.
+
+
+### ReaderBase
+
+`ReaderBase` is the abstract base class for all readers. It defines the interface for all readers.
+
+```cpp
+class ReaderBase {
+ public:
+ // Reads the next batch of data. (A 'batch' can be only one instance)
+ // If the next batch doesn't exist, it throws an exception
+ virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+
+ // Checks whether the next instance exists.
+ virtual bool HasNext() = 0;
+
+ // Reinitializes the reader and reads the file from the beginning.
+ virtual void ReInit() = 0;
+
+ virtual ~ReaderBase();
+};
+```
+
+### FileReader
+
+`FileReader` is derived from `ReaderBase`. It is still an abstract class and will be further derived by readers for specific file formats.
+
+```cpp
+class FileReader : public ReaderBase {
+ public:
+ explicit FileReader(const std::vector<DDim>& dims);
+
+ void ReadNext(std::vector<LoDTensor>* out) override;
+
+ protected:
+ virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
+
+ private:
+ std::vector<DDim> dims_;
+};
+```
+
+A file reader binds to a single file and reads one data instance at a time. Each type of file reader shall implement its own `ReadNextImpl()`, `HasNext()` and `ReInit()`.
+
+`ReadNextImpl()` is invoked by `ReadNext()`. Besides invoking `ReadNextImpl()`, `ReadNext()` is also responsible for checking the output, making sure that the shape of each `LoDTensor` in `*out` is consistent with the corresponding one in `dims_`.
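+
+A minimal, self-contained C++ sketch of this pattern is given below. It uses plain standard-library stand-ins (`Shape`, `Tensor`, `FileReaderSketch`) instead of Paddle's `DDim` and `LoDTensor`, so every name in it is illustrative only and it is not the actual implementation.
+
+```cpp
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+// Toy stand-ins for DDim / LoDTensor, only to illustrate the pattern.
+using Shape = std::vector<std::int64_t>;
+struct Tensor {
+  Shape shape;
+};
+
+class FileReaderSketch {
+ public:
+  explicit FileReaderSketch(std::vector<Shape> dims) : dims_(std::move(dims)) {}
+  virtual ~FileReaderSketch() = default;
+
+  // ReadNext() delegates the real reading to ReadNextImpl() and then
+  // validates every output shape against the expected dims_.
+  void ReadNext(std::vector<Tensor>* out) {
+    ReadNextImpl(out);
+    assert(out->size() == dims_.size() && "unexpected number of outputs");
+    for (std::size_t i = 0; i < out->size(); ++i) {
+      assert((*out)[i].shape == dims_[i] && "shape mismatch with dims_");
+    }
+  }
+
+ protected:
+  // Each concrete file format implements the actual reading here.
+  virtual void ReadNextImpl(std::vector<Tensor>* out) = 0;
+
+ private:
+  std::vector<Shape> dims_;
+};
+```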
+
+### DecoratedReader
+
+A decorated reader takes another reader (either a file reader or another decorated reader) as its 'underlying reader'. It gets data from its underlying reader, does some processing on it (shuffling, batching, or something else), and then yields the processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
+
+```cpp
+class DecoratedReader : public ReaderBase {
+ public:
+ explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
+ PADDLE_ENFORCE_NOT_NULL(reader_);
+ }
+
+ void ReInit() override { reader_->ReInit(); }
+
+ bool HasNext() override { return reader_->HasNext(); }
+
+ protected:
+ ReaderBase* reader_;
+};
+```
+
+Both `FileReader` and `DecoratedReader` share exactly the same interface as defined in `ReaderBase`, so they can be decorated multiple times: we can **shuffle** a reader's outputs and then **batch** the shuffled outputs. The interface consistency also allows related ops to use readers without knowing their underlying type.
+
+### MultipleReader
+
+Each `FileReader` binds to a single file and is single-threaded. However, sometimes we need to read data from more than one file. In this case, it's not enough to only have `FileReader` and `DecoratedReader`.
+
+So `MultipleReader` is introduced. It is also derived from `ReaderBase`. A `MultipleReader` holds several prefetching `FileReaders`, and these readers run concurrently. Another pivotal part of a `MultipleReader` is a buffer channel. The channel collects the data yielded by all prefetching readers and lets subsequent OPs or decorated readers fetch data without worrying about the scheduling of multiple readers.
+
+![](images/multiple_reader.png)
+
+This graph shows how a `MultipleReader` works with three prefetching file readers and two GPUs. There is a queue of files which are going to be read. Whenever a prefetching file reader is free (i.e., it has finished reading its current file), it fetches a new file from the queue. Each prefetching file reader runs in a separate prefetch thread and dumps its outputs to the same channel.
+
+To the subsequent two decorated readers, the `MultipleReader` is **a single reader**. They don't need to be concerned about how the prefetching readers are scheduled. They only need to invoke `MultipleReader::ReadNext()` to get the next data from the buffer channel.
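+
+The following is a rough, self-contained C++ sketch of this buffering idea, not Paddle's actual `MultipleReader`: a few prefetch threads push items into one shared queue guarded by a mutex and condition variable, and `ReadNext()` pops from that queue, so downstream consumers see a single reader. The per-file instance count and the `std::string` payload are placeholders.
+
+```cpp
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+class MultipleReaderSketch {
+ public:
+  explicit MultipleReaderSketch(const std::vector<std::string>& files) {
+    for (const auto& file : files) {
+      // One prefetch thread per file; each pretends to read a few instances.
+      workers_.emplace_back([this, file] {
+        for (int i = 0; i < 3; ++i) Push(file + ":instance");
+      });
+    }
+  }
+
+  ~MultipleReaderSketch() {
+    for (auto& worker : workers_) worker.join();
+  }
+
+  // Downstream ops see one reader: they just pop from the buffer channel.
+  std::string ReadNext() {
+    std::unique_lock<std::mutex> lock(mu_);
+    cv_.wait(lock, [this] { return !buffer_.empty(); });
+    std::string item = buffer_.front();
+    buffer_.pop();
+    return item;
+  }
+
+ private:
+  void Push(std::string item) {
+    {
+      std::lock_guard<std::mutex> guard(mu_);
+      buffer_.push(std::move(item));
+    }
+    cv_.notify_one();
+  }
+
+  std::vector<std::thread> workers_;
+  std::queue<std::string> buffer_;
+  std::mutex mu_;
+  std::condition_variable cv_;
+};
+```
+
+The real design additionally has to signal end-of-data and coordinate multiple consumers, which this toy version ignores.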
+
+### ReaderHolder
+
+Different readers belong to different class types. This leads to a problem: how can we drop them into `Variable`s and fetch them out with a unified method? For example, if a Variable holds a `BatchReader`, we cannot get it with the following code:
+
+```cpp
+var->Get<ReaderBase>("batch_reader");
+```
+
+We would have to write:
+
+```cpp
+var->Get<BatchReader>("batch_reader");
+```
+
+This means that to get a reader from a variable, we must know the reader's exact type every time, which is nearly impossible.
+
+To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which hides the reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
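+
+As a quick illustration of the idea (not the actual class), a type-erasing holder simply implements the `ReaderBase` interface and forwards every call to whatever concrete reader it wraps; the names `ReaderBaseSketch` and `ReaderHolderSketch` below are invented for this sketch.
+
+```cpp
+#include <memory>
+#include <utility>
+#include <vector>
+
+struct TensorLike {};  // placeholder for LoDTensor
+
+// Simplified reader interface, mirroring ReaderBase above.
+class ReaderBaseSketch {
+ public:
+  virtual void ReadNext(std::vector<TensorLike>* out) = 0;
+  virtual bool HasNext() = 0;
+  virtual void ReInit() = 0;
+  virtual ~ReaderBaseSketch() = default;
+};
+
+// The holder hides the concrete reader type: callers only ever see the
+// ReaderBaseSketch interface, whatever reader is actually stored inside.
+class ReaderHolderSketch : public ReaderBaseSketch {
+ public:
+  void Reset(std::unique_ptr<ReaderBaseSketch> reader) { reader_ = std::move(reader); }
+
+  void ReadNext(std::vector<TensorLike>* out) override { reader_->ReadNext(out); }
+  bool HasNext() override { return reader_->HasNext(); }
+  void ReInit() override { reader_->ReInit(); }
+
+ private:
+  std::unique_ptr<ReaderBaseSketch> reader_;
+};
+```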
+
+## Related Operators
+
+To create and invoke readers, some new ops are introduced:
+
+### CreateReaderOp
+
+Each reader has its own creation op. File readers' creation ops have no inputs and yield the created file reader as their output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
+
+However, direct usage of file readers' creation ops is not recommended because a file reader can only read one file via a single thread. Using `OpenFilesOp` is a better choice.
+
+### OpenFilesOp
+
+The `OpenFilesOp` is the creation op of `MultipleReader`. It takes no input but requires a list of file names as one of its attributes. The newly created `MultipleReader` then creates its own prefetching readers according to given file names.
+
+To make sure that created prefetching readers match file formats, we need a name prefix rule to append file format tags to file names, as well as a file reader registry mechanism to map file format tags to their corresponding file readers' constructors.
+
+### HasNextOp
+
+`HasNextOp` is used to check whether the next data batch exists via the reader's `HasNext()` interface.
+
+### ResetOp
+
+`ResetOp` is used to reset a reader via its `ReInit()` interface.
+
+### ReadOp
+
+A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader's `ReadNext()` function and gets a new batch of data (or only one instance of data, if we use a file reader directly). The output data of a reader are in the form of `std::vector<LoDTensor>`, so the `ReadOp` also needs to split the vector and move the LoDTensors to their respective output Variables.
+
+## Program with Readers
+
+A `Program` holds readers as its persistable variables. These variables are created by `CreateReaderOp` or `OpenFilesOp`. These ops shall run only once, so they shall be placed in the `startup_program`. `HasNextOp`, `ResetOp` and `ReadOp` are required by the training loop, so they shall be in the `main_program`.
+
+The ops of a `startup_program` with readers would be like this:
+
+```
+multiple_reader = open_files_op(...)
+batch_reader = create_batch_reader_op(multiple_reader)
+double_buffer_reader = create_double_buffer_op(batch_reader)
+... (other initializers)
+```
+
+The forwarding ops of the corresponding `main_program` would be like this:
+
+```
+while_op {
+ has_next = has_next_op(double_buffer_reader)
+ if_else_op(has_next) {
+ batch_data = read_op(double_buffer_reader)
+ ... (subsequent training ops)
+ } else {
+ reset_op(double_buffer_reader)
+ }
+}
+```
+
+Two important considerations for these programs are as follows:
+
+1. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
+
+2. All readers exist in both `startup_program` and `main_program`. And they are persistable.
diff --git a/doc/fluid/design/concepts/executor.md b/doc/fluid/design/concepts/executor.md
new file mode 100644
index 0000000000..2d4b371cc5
--- /dev/null
+++ b/doc/fluid/design/concepts/executor.md
@@ -0,0 +1,29 @@
+# Executor Design Doc
+
+## Motivation
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
+[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and the variables that will be used; the executor explicitly executes the stored precompiled code.
+
+## Overview
+
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.
+
+## Executor
+
+The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
+This is very similar to pushing a stack frame when entering a block; the executor then cleans up all the temporary variables when a mini-batch is finished. It does not, however, have a stack-frame pop process.
+
+### The interface
+```c++
+ Executor(places);
+```
+An executor does not own any computing resources; a user can only construct an executor with the specified places.
+
+### Running an Executor
+
+```
+ void Run(ProgramDesc, Scope, block_id, create_local_scope);
+```
+An `Executor` only provides a unified way to execute a `ProgramDesc`: the `ProgramDesc` is the target to be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block, and `create_local_scope` is a boolean that states whether the temporary variables are destroyed after the execution is finished.
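+
+To make the interpreter analogy concrete, here is a small, self-contained C++ sketch of what running the `block_id`-th block amounts to. The struct names are invented for illustration, variable and scope handling is omitted, and this is not the real `Executor::Run` signature.
+
+```cpp
+#include <functional>
+#include <vector>
+
+// Invented, simplified stand-ins for OpDesc / BlockDesc / ProgramDesc.
+struct OpSketch {
+  std::function<void()> kernel;  // what the operator does when run
+};
+struct BlockSketch {
+  std::vector<OpSketch> ops;
+};
+struct ProgramSketch {
+  std::vector<BlockSketch> blocks;
+};
+
+// "Run" the block_id-th block: instantiate variables (omitted here),
+// then execute its operators sequentially, one by one.
+void RunSketch(const ProgramSketch& program, int block_id) {
+  const BlockSketch& block = program.blocks[block_id];
+  for (const OpSketch& op : block.ops) {
+    op.kernel();
+  }
+}
+```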
diff --git a/doc/design/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
similarity index 100%
rename from doc/design/functions_operators_layers.md
rename to doc/fluid/design/concepts/functions_operators_layers.md
diff --git a/doc/fluid/design/concepts/images/multiple_reader.png b/doc/fluid/design/concepts/images/multiple_reader.png
new file mode 100644
index 0000000000..b22126b31d
Binary files /dev/null and b/doc/fluid/design/concepts/images/multiple_reader.png differ
diff --git a/doc/fluid/design/concepts/images/readers.png b/doc/fluid/design/concepts/images/readers.png
new file mode 100644
index 0000000000..fd59168ce1
Binary files /dev/null and b/doc/fluid/design/concepts/images/readers.png differ
diff --git a/paddle/framework/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
similarity index 100%
rename from paddle/framework/lod_tensor.md
rename to doc/fluid/design/concepts/lod_tensor.md
diff --git a/doc/design/program.md b/doc/fluid/design/concepts/program.md
similarity index 100%
rename from doc/design/program.md
rename to doc/fluid/design/concepts/program.md
diff --git a/doc/design/scope.md b/doc/fluid/design/concepts/scope.md
similarity index 100%
rename from doc/design/scope.md
rename to doc/fluid/design/concepts/scope.md
diff --git a/paddle/framework/tensor.md b/doc/fluid/design/concepts/tensor.md
similarity index 99%
rename from paddle/framework/tensor.md
rename to doc/fluid/design/concepts/tensor.md
index 7a80816d8e..0a27ac9bb6 100644
--- a/paddle/framework/tensor.md
+++ b/doc/fluid/design/concepts/tensor.md
@@ -71,7 +71,7 @@ private:
```
```c++
-typedef boost::variant<GPUPlace, CPUPlace> Place;
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar;
typedef boost::variant<
diff --git a/doc/design/tensor_array.md b/doc/fluid/design/concepts/tensor_array.md
similarity index 100%
rename from doc/design/tensor_array.md
rename to doc/fluid/design/concepts/tensor_array.md
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
new file mode 100644
index 0000000000..6a45af1995
--- /dev/null
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -0,0 +1,81 @@
+## Background
+PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
+
+PaddlePaddle uses a proto message to describe the compile-time program because:
+
+1. The computation program description must be serializable and saved in a file.
+1. During distributed training, the serialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on a different worker.
+
+The computation `Program` consists of nested `Blocks`. Each `Block` consists of data (i.e. `Variable`s) and `Operations`. The concepts used to represent them are listed in the table below.
+
+| |compile time|runtime|
+|---|---|---|
+|Data|VarDesc(proto)|Variable(cpp)|
+|Operation|OpDesc(proto)|Operator(cpp)|
+
+
+## Definition of VarType
+
+A VarDesc should have a name, a type and whether or not it is persistable. There are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types, like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks like the following:
+
+```proto
+message VarDesc {
+ required string name = 1;
+ required VarType type = 2;
+ optional bool persistable = 3 [ default = false ];
+}
+```
+
+## Definition of TensorDesc
+
+```proto
+message TensorDesc {
+ // Should only be PODType. Is enforced in C++
+ required Type data_type = 1;
+ repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+```
+
+The `Type` here comes from the enum defined inside of `VarType` :
+
+```proto
+enum Type {
+ // Pod Types
+ BOOL = 0;
+ INT16 = 1;
+ INT32 = 2;
+ INT64 = 3;
+ FP16 = 4;
+ FP32 = 5;
+ FP64 = 6;
+
+ // Other types that may need additional descriptions
+ LOD_TENSOR = 7;
+ SELECTED_ROWS = 8;
+ FEED_MINIBATCH = 9;
+ FETCH_LIST = 10;
+ STEP_SCOPES = 11;
+ LOD_RANK_TABLE = 12;
+ LOD_TENSOR_ARRAY = 13;
+ PLACE_LIST = 14;
+ READER = 15;
+ CHANNEL = 16;
+}
+```
+
+A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please refer to [`SelectedRows`](./selected_rows.md).
+
+## Definition of LodTensorDesc
+
+```proto
+message LoDTensorDesc {
+ required TensorDesc tensor = 1;
+ optional int32 lod_level = 2 [ default = 0 ];
+}
+```
+
+A LoDTensorDesc contains a tensor and a lod_level.
+
+## Definition of Variable in Python
+
+For Variable in Python, please refer to [`Python API`](./python_api.md).
diff --git a/paddle/framework/variable.md b/doc/fluid/design/concepts/variable.md
similarity index 100%
rename from paddle/framework/variable.md
rename to doc/fluid/design/concepts/variable.md
diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md
new file mode 100644
index 0000000000..f022e67fd3
--- /dev/null
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@@ -0,0 +1,163 @@
+# Design Doc: Concurrent Programming with Fluid
+
+With PaddlePaddle Fluid, users describe a program rather than a model. The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, rather than the program that trains/uses the model.
+
+Many know that when we program TensorFlow, we can specify the device on which each operator runs. This allows us to create a concurrent/parallel AI application. An interesting question is **how does a `ProgramDesc` represent a concurrent program?**
+
+The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program. So users can simply write a concurrent program as they would in any concurrent programming language, e.g., [Go](https://golang.org).
+
+## An Analogy
+
+The following table compares concepts in Fluid and Go:
+
+| Go | Fluid |
+|----|-------|
+|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) |
+| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
+| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
+| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
+
+## An Example Concurrent Program
+
+To review all the above concepts in an example, let us take a simple program and write its distributed version.
+
+Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors.
+
+```go
+import "fluid"
+
+func paddlepaddle() {
+ X = fluid.read(...)
+ W = fluid.Tensor(...)
+ Y = fluid.mult(X, W)
+}
+```
+
+Please be aware that Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function, which, in this case, is defined in the above program and creates the following `ProgramDesc` message.
+
+```protobuf
+message ProgramDesc {
+ block[0] = Block {
+ vars = [X, W, Y],
+ ops = [
+ read(output = X)
+ assign(input = ..., output = W)
+ mult(input = {X, W}, output = Y)
+ ],
+ }
+}
+```
+
+Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in the above `ProgramDesc` message.
+
+The default `main` function is defined as follows:
+
+```go
+func main() {
+ paddlepaddle()
+ fluid.run()
+}
+```
+
+## The Concurrent Version
+
+By parallelizing the above program, we could support a very big tensor X by splitting it into small pieces {x_1, x_2, ...} and sending each piece to a worker process/node for parallel multiplication.
+
+In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for worker processes/nodes.
+
+### The Master Program
+
+The master program could look like the following:
+
+```protobuf
+message ProgramDesc {
+ block[0] = Block {
+ vars = [X, L, Y],
+ ops = [
+ read(output = X)
+ kube_get_workers_addrs(output = L)
+ Y = tensor_array(len(L))
+ parallel_for(input = X, output = Y,
+ attrs = {L, block_id(1)}) # referring to block 1
+ ]
+ }
+
+ block[1] = Block {
+ parent = 0,
+ vars = [x, y, index],
+ ops = [
+ slice(input = [X, index], output = x) # index is initialized by parallel_for
+ send(input = x, attrs = L[index])
+ recv(outputs = y, attrs = L[index])
+ assign(input = y, output = Y[index])
+ ]
+ }
+}
+```
+
+The equivalent Fluid program (calling the Go binding) is:
+
+```go
+func main() { //// block 0
+ X = fluid.read(...)
+ L = fluid.k8s.get_worker_addrs()
+ Y = fluid.tensor_array(len(L))
+ fluid.parallel_for(X, L,
+ func(index int) { //// block 1
+ x = X[index]
+ fluid.send(L[index], x)
+ y = fluid.recv(L[index])
+ Y[index] = y
+ })
+}
+```
+
+An explanation of the above program:
+
+- `fluid.k8s` is a package that provides access to Kubernetes API.
+- `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
+
+ 1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
+ 2. creates `len(L)` threads by calling into the `ThreadPool` singleton (a thread-level sketch is given after this list), each thread
+ 1. creates an Executor instance, and
+ 2. calls `Executor.Run(block)`, where `block` is block 1 as explained above.
+1. Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0.
+
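+The following is a tiny, self-contained C++ sketch of the scheduling behavior described above: one worker thread per index, each playing the role of an Executor running block 1. The actual `ParallelFor` intrinsic and its scope handling are more involved; `run_block_1` is just a placeholder callback.
+
+```cpp
+#include <cstddef>
+#include <functional>
+#include <thread>
+#include <vector>
+
+// run_block_1 stands for "create an Executor and run block 1 with the given
+// index in its own scope"; here it is only a callback.
+void ParallelForSketch(std::size_t num_workers,
+                       const std::function<void(std::size_t)>& run_block_1) {
+  std::vector<std::thread> threads;
+  for (std::size_t index = 0; index < num_workers; ++index) {
+    threads.emplace_back([&run_block_1, index] { run_block_1(index); });
+  }
+  // Wait for every worker before continuing with block 0.
+  for (auto& t : threads) t.join();
+}
+```
+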
+### The Worker Program
+
+The worker program looks like
+
+```go
+func main() {
+ W = Tensor(...)
+ x = fluid.listen_and_do(
+ fluid.k8s.self_addr(),
+ func(input Tensor) {
+ output = fluid.mult(input, W)
+ })
+}
+```
+
+where
+
+- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed,
+ 1. listens on the current pod's IP address, as returned by `fluid.k8s.self_addr()`,
+ 2. once a connection is established,
+ 1. creates a scope of two parameters, "input" and "output",
+ 2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h) and saves it into "input",
+ 3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`.
+
+## Summarization
+
+From the above example, we see that:
+
+1. Fluid enables the imperative programming paradigm by:
+ 1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and
+ 2. calling the `fluid.run` function that runs the program implicitly.
+1. The program is described as a `ProgramDesc` protobuf message.
+2. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter.
+3. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message.
+4. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and execute intrinsics/operators' `Run` method sequentially as they appear in the `Block.ops` array.
+5. Intrinsics/operators' `Run` method might create threads. For example, the `ListenAndDo` operator creates a thread to handle each incoming request.
+6. Threads are not necessarily OS threads; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by a ThreadPool. Multiple green threads might run on the same OS thread. An example of green threads is Go's [goroutines](https://tour.golang.org/concurrency/1).
diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md
new file mode 100644
index 0000000000..10d936860f
--- /dev/null
+++ b/doc/fluid/design/concurrent/csp.md
@@ -0,0 +1,224 @@
+# Design Doc: CSP in PaddlePaddle Fluid
+
+## Motivation
+
+Concurrent programming is important for deep learning. A few example applications are:
+
+1. The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
+2. The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
+
+Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
+
+## Concurrent Programming Models
+
+There are many concurrent programming models, implemented in various forms:
+
+| concurrent programming model | implementation |
+|-----|-----|
+| mutex | types and functions in standard libraries |
+| semaphore | types and functions in standard libraries |
+| communicating sequential processes (CSP) | Go programming language |
+| actor model | Erlang programming language |
+| message passing | MPI |
+| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
+
+Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
+
+### CSP v.s. Actor Model
+
+A well-known implementation of Actor Model is the Erlang programming language. In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs. We can find the three ingredients, process with ID, send, and recv, in MPI too. Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code. Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv.
+
+## CSP in Fluid
+
+Fluid has two fundamental control-flows: *if-else* and *while*. If we are to implement CSP, we need the following:
+
+1. a new data type: *channel* and operators *send* and *recv*,
+1. *goroutine* or thread, and
+1. a new control-flow: select.
+
+We also need Python wrappers for the above components.
+
+The type *channel* is conceptually a blocking queue. In Go, its implementation is a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
+
+The `select` operation has been in OS kernels long before the Go language. All Unix kernels implement the system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll.
+
+It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.
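+
+As a rough illustration of the O(N) approach (not Fluid's actual implementation), a select can be implemented by scanning all cases and firing the first one that is ready, falling back to a default action when none is; the types and names below are invented for this sketch.
+
+```cpp
+#include <functional>
+#include <vector>
+
+struct CaseSketch {
+  std::function<bool()> ready;  // e.g. "can this channel send/recv right now?"
+  std::function<void()> body;   // the case's block
+};
+
+// One O(N) polling pass; returns true if some case (or the default) ran.
+bool PollSelectOnce(const std::vector<CaseSketch>& cases,
+                    const std::function<void()>& default_body) {
+  for (const CaseSketch& c : cases) {
+    if (c.ready()) {
+      c.body();
+      return true;
+    }
+  }
+  if (default_body) {
+    default_body();
+    return true;
+  }
+  return false;
+}
+```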
+
+### Type Channel
+
+Fluid supports many data types:
+
+1. Tensor,
+1. Row-sparse Tensor
+1. LoD Tensor,
+1. Tensor array, etc
+
+Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value. To add a new type channel, we need to add a new type enum.
+
+To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file. [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example of how we expose the C++ class LoDTensor.
+
+## Syntax Design
+
+### Create Channel
+
+In Go, we create a channel by specifying the element type and buffer size:
+
+```go
+ch := make(chan int) // a channel without buffer
+ch1 := make(chan int, 100) // a channel that can buffer 100 ints.
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch = fluid.make_channel(dtype=INT)
+ch1 = fluid.make_channel(dtype=INT, capacity=100)
+```
+
+In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
+
+```python
+ch = fluid.make_channel(dtype=Tensor, etype=float16)
+```
+
+or Tensors of Tensors of float16 etc.
+
+The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor<Tensor<...<Tensor<float16>...> > >`.
+
+### Send and Recv
+
+Go's CSP implementation depends on data type *channel*. There are two types of channels:
+
+1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero-sized buffer. Sending to a buffered channel blocks if the buffer is full, and receiving blocks if the buffer is empty.
+1. The blocked channel, or unbuffered channel, is a blocking queue with no buffer. Both sending and receiving block on unbuffered channels.
+
+There are four types of actions with a channel:
+
+1. Create a channel
+
+ ```go
+ ch := make(chan int) // this is an unbuffered channel
+ ch := make(chan int, 100) // this is a buffered channel of 100 ints.
+ ```
+
+1. Send
+
+ ```go
+ ch <- 111
+ ```
+
+1. Recv
+
+ ```go
+ y, ok := <-ch
+ ```
+
+1. Close
+
+ ```go
+ close(ch)
+ ```
+
+ Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
+
+There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
+
+1. A send to a nil channel blocks forever
+
+1. A receive from a nil channel blocks forever
+
+1. A send to a closed channel panics
+
+1. A receive from a closed channel returns the residual values and then zeros.
+
+In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h).
+
+The following program illustrates the Python syntax for accessing Fluid channels.
+
+```python
+import fluid
+
+buffer_size = 10
+ch = fluid.make_channel(dtype=INT, capacity=buffer_size)
+
+# Now write buffer_size elements to the channel
+with fluid.while(steps=buffer_size):
+ fluid.send(ch, step)
+
+fluid.close_channel(ch)
+
+with fluid.while(steps=buffer_size):
+ fluid.print(fluid.recv(ch))
+```
+
+The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
+
+```python
+import fluid
+
+ch = fluid.make_channel(dtype=INT)
+
+with fluid.go():
+ fluid.send(ch)
+
+y = fluid.recv(ch)
+
+fluid.close_channel(ch)
+```
+
+### Select
+
+In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
+
+```go
+
+ch1 := make(chan int)
+ch2 := make(chan int, 100)
+
+x := 0
+
+for {
+ select {
+ case ch1 <- x:
+ x = x + 1
+ case y := <-ch2:
+ fmt.Println("Received on channel")
+ default:
+ fmt.Println("Default")
+ }
+ }
+
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch1 = fluid.make_channel(dtype=INT)
+ch2 = fluid.make_channel(dtype=INT, capacity=100)
+
+sel = fluid.select()
+
+with sel.case(ch1, 'w', X):
+ fluid.layers.increment(X)
+
+with sel.case(ch2, 'r', Y):
+ fluid.print("Received on Channel")
+
+with sel.default():
+ fluid.print("Default")
+
+```
+
+In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
+
+- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to the write syntax in Python I/O.
+
+- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to the read syntax in Python I/O.
+
+- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
+
+## Example Programs
+
+### 1. RPC between Trainers and Parameter Servers
+
+### 2. Concurrent Minibatch Loading
diff --git a/doc/fluid/design/concurrent/go_op.md b/doc/fluid/design/concurrent/go_op.md
new file mode 100644
index 0000000000..c18b788e80
--- /dev/null
+++ b/doc/fluid/design/concurrent/go_op.md
@@ -0,0 +1,231 @@
+# go_op Design
+
+## Introduction
+
+The **go_op** allows users of PaddlePaddle to run program blocks on a detached
+thread. It works in conjunction with CSP operators (channel_send,
+channel_receive, channel_open, channel_close, and select) to allow users to
+concurrently process data and communicate easily between different threads.
+
+## How to use it
+
+```
+channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+with fluid.Go():
+ # Send a tensor of value 99 to "channel" on a detached thread
+ tensor = fill_constant(shape=[1], dtype='int', value=99)
+ tensor.stop_gradient = True
+ fluid.channel_send(channel, tensor)
+
+# Receive sent tensor from "channel" on the main thread
+result = fill_constant(shape=[1], dtype='int', value=-1)
+fluid.channel_recv(channel, result)
+```
+
+The go operator can be accessed by using the fluid.Go() control flow. This
+will create a new sub block, where the user can add additional operators
+to be run on the thread.
+
+**Note:** Since back propagation is currently not supported in the go_op, users
+should ensure that operators in the go block do not require gradient
+calculations.
+
+## How it Works
+
+Similar to other control blocks, go_op will create a sub block and add it
+as a child to the current block. Operators and variables defined in this
+block will be added to the go sub_block.
+
+In addition, the go operator will create a new child scope whose parent is
+the global scope. Please refer to [block captures](#block-captures) for more
+information.
+
+When Paddle executor runs go_op, go_op will take the sub_block and pass it to
+the executor.run method (along with a newly created local scope) on a detached
+thread.
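+
+Conceptually, this amounts to launching the sub-block's execution on a detached thread, as in the small C++ sketch below; the callback stands in for "executor.run(sub_block, local_scope)", and the real go_op does considerably more bookkeeping.
+
+```cpp
+#include <functional>
+#include <thread>
+#include <utility>
+
+// Launch the sub-block's execution on a detached thread: the parent block
+// continues immediately and never joins the worker.
+void GoOpSketch(std::function<void()> run_sub_block) {
+  std::thread worker(std::move(run_sub_block));
+  worker.detach();
+}
+```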
+
+An example of the generated program description is shown below. Take note of
+the **go_op** in particular. It is added as an operator in the current
+block (in this example, block0). The **go_op** contains a `sub_block`
+attribute, which points to the id of the block that will be executed in a
+detached thread.
+
+```
+blocks {
+ idx: 0
+ parent_idx: -1
+ vars {
+ name: "return_value"
+ type {
+ type: LOD_TENSOR
+ lod_tensor {
+ tensor {
+ data_type: INT64
+ }
+ }
+ }
+ }
+ vars {
+ name: "status_recv"
+ type {
+ type: LOD_TENSOR
+ lod_tensor {
+ tensor {
+ data_type: BOOL
+ }
+ }
+ }
+ }
+ ...
+ ops {
+ outputs {
+ parameter: "Out"
+ arguments: "channel"
+ }
+ type: "channel_create"
+ attrs {
+ name: "data_type"
+ type: INT
+ i: 7
+ }
+ attrs {
+ name: "capacity"
+ type: INT
+ i: 0
+ }
+ }
+ ops {
+ inputs {
+ parameter: "X"
+ arguments: "channel"
+ }
+ type: "go"
+ attrs {
+ name: "sub_block"
+ type: BLOCK
+ block_idx: 1
+ }
+ }
+ ops {
+ inputs {
+ parameter: "Channel"
+ arguments: "channel"
+ }
+ outputs {
+ parameter: "Out"
+ arguments: "return_value"
+ }
+ outputs {
+ parameter: "Status"
+ arguments: "status_recv"
+ }
+ type: "channel_recv"
+ }
+ ...
+}
+
+blocks {
+ idx: 1
+ parent_idx: 0
+ vars {
+ name: "status"
+ type {
+ type: LOD_TENSOR
+ lod_tensor {
+ tensor {
+ data_type: BOOL
+ }
+ }
+ }
+ }
+ ...
+
+ ops {
+ outputs {
+ parameter: "Out"
+ arguments: "fill_constant_1.tmp_0"
+ }
+ type: "fill_constant"
+ attrs {
+ name: "force_cpu"
+ type: BOOLEAN
+ b: false
+ }
+ attrs {
+ name: "value"
+ type: FLOAT
+ f: 99.0
+ }
+ attrs {
+ name: "shape"
+ type: INTS
+ ints: 1
+ }
+ attrs {
+ name: "dtype"
+ type: INT
+ i: 3
+ }
+ }
+ ops {
+ inputs {
+ parameter: "Channel"
+ arguments: "channel"
+ }
+ inputs {
+ parameter: "X"
+ arguments: "fill_constant_1.tmp_0"
+ }
+ outputs {
+ parameter: "Status"
+ arguments: "status"
+ }
+ type: "channel_send"
+ attrs {
+ name: "copy"
+ type: BOOLEAN
+ b: false
+ }
+ }
+```
+
+## Current Limitations
+
+#### Scopes and block captures:
+
+Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a
+block. When a block is executed, a new local scope is created from the parent
+scope (i.e. the scope derived from the parent block) and associated with the new
+child block. After the block finishes executing, the local scope and
+all associated variables in the scope are deleted.
+
+This works well in a single-threaded scenario; however, with the introduction of
+go_op, a child block may continue to execute even after the parent block has
+exited. If the go_op tries to access variables located in the parent block's
+scope, it may receive a segmentation fault because the parent scope may have
+been deleted.
+
+We need to implement block closures in order to prevent access to parent
+scope variables from causing a segmentation fault. As a temporary workaround,
+please ensure that all variables accessed in the go block is not destructed
+before it is being accessed. Currently, the go_op will explicitly enforce
+this requirement and raise an exception if a variable could not be found in
+the scope.
+
+Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502)
+for more details.
+
+#### Green Threads
+
+Golang utilizes `green threads`, which is a mechanism for the runtime library to
+manage multiple threads (instead of natively by the OS). Green threads usually
+allow for faster thread creation and switching, as there is less overhead
+when spawning these threads. For the first version of CSP, we only support
+OS threads.
+
+
+#### Backward Propagation:
+
+go_op currently does not support backward propagation. Please use go_op with
+non-training operators.
diff --git a/doc/fluid/design/concurrent/images/select_op_workflow.png b/doc/fluid/design/concurrent/images/select_op_workflow.png
new file mode 100644
index 0000000000..719ed76f9d
Binary files /dev/null and b/doc/fluid/design/concurrent/images/select_op_workflow.png differ
diff --git a/doc/fluid/design/concurrent/parallel_do.md b/doc/fluid/design/concurrent/parallel_do.md
new file mode 100644
index 0000000000..42bd136f82
--- /dev/null
+++ b/doc/fluid/design/concurrent/parallel_do.md
@@ -0,0 +1,163 @@
+# Design Doc: Parallel_Do in PaddlePaddle
+
+In PaddlePaddle, we use the parallel_do primitive to represent multi-threaded data-parallel processing.
+
+## Design overview
+
+The definition of a parallel_do op looks like the following:
+
+```c++
+AddInput(kInputs, "Inputs needed to be split onto different devices").AsDuplicable();
+AddInput(kParameters, "Parameters are duplicated over different devices")
+ .AsDuplicable();
+AddInput(kPlaces, "Devices used for parallel processing");
+AddOutput(kOutputs, "Outputs needed to be merged from different devices").AsDuplicable();
+AddOutput(kParallelScopes,
+ "Scopes for all local variables in forward pass. One scope for each device");
+AddAttr(kParallelBlock,
+ "List of operators to be executed in parallel");
+```
+
+A vanilla implementation of parallel_do can be shown as follows (`|` means a single thread and
+`||||` means multiple threads):
+
+```
+In the forward pass
+ | Split input onto different devices
+ | Copy parameter onto different devices
+ |||| Compute forward pass in parallel
+ | Merge output from different devices
+
+In the backward pass
+ | Split output@grad onto different devices
+ |||| Compute backward pass in parallel
+ | accumulate param@grad from different devices to the first device
+ | Merge input@grad from different devices
+ | Copy param@grad to the place of parallel_do_op
+```
+
+This implementation allows us to write a mixed-device program like this:
+
+```python
+W1 = fluid.tensor(size=[100,20], parameter=true)
+W2 = fluid.tensor(size=[20,15], parameter=true)
+
+data = layers.data()
+
+gpu_places = layers.get_place(use_gpu=True)
+# parallel processing on multiple GPUs
+pd = ParallelDo(gpu_places)
+with pd.do(input=data):
+ prediction = softmax(fc(fc(data, W1), W2))
+ write_output(prediction)
+prediction = pd()
+loss = cross_entropy(prediction, label)
+```
+
+And the ProgramDesc is like the following:
+
+```
+# start_program will be run by executor(CPUPlace), all w1, w2 will be allocated on CPU
+start_program
+{
+ vars: w1, w2
+ ops: init(w1), init(w2)
+}
+
+main_program
+{
+block0 {
+ vars: data, places, w1, w2, w1_grad, w2_grad,
+ ops: data, get_place, parallel_do(block1),
+ parallel_do_grad(block2),
+ sgd(w2, w2_grad),
+ sgd(w1, w1_grad)
+}
+block1 { # the forward pass
+ parent_block: 0
+ vars: data, h1, h2, loss
+ ops: fc, fc, softmax
+}
+block2 { # the backward pass
+ parent_block: 1
+ vars: data_grad, h1_grad, h2_grad, loss_grad, local_w1_grad, local_w2_grad
+ ops: softmax_grad,
+ fc_grad
+ fc_grad
+}
+}
+```
+
+## Performance Improvement
+
+There are several places where we can make this parallel_do faster.
+
+### forward: split input onto different devices
+
+If the input of the parallel_do is independent of any prior operators, we can avoid this step by
+prefetching the input onto different devices in a separate background thread. The Python code
+looks like this:
+```python
+pd = ParallelDo(gpu_places)
+with pd.do():
+ feature = get_data_from_prefetch_queue(gpu_places)
+ prediction = my_net(feature)
+ write_output(prediction)
+```
+
+### forward: Copy parameters onto different devices
+
+We can avoid this step by making each device have a copy of the parameter. This requires:
+
+1. `fluid.default_start_up_program()` to be run on all devices
+1. In the backward, allreduce param@grad at different devices, this requires
+ 1. `backward.py` add `allreduce` operators at parallel_do_grad
+ 1. `allreduce` operators need to be called in async mode to achieve maximum throughput
+1. apply gradient-related ops (i.e. clipping, normalization, decay, sgd) on different devices in parallel
+
+By doing so, we also avoided "backward: accumulate param@grad from different devices to the first device".
+And the ProgramDesc looks like the following:
+
+```
+# w1, w2 will be allocated on all GPUs
+start_program
+{
+block0 {
+ parallel_do(block1)
+}
+block1 {
+ parent_block: 0
+ vars: w1, w2
+ ops: init(w1), init(w2)
+}
+}
+
+main_program
+{
+block0 {
+ vars: data, places, w1, w2
+ ops: data, get_place, parallel_do(block1),
+ parallel_do_grad(block2), # append_backward
+ parallel_do(block3) # append_optimization
+
+}
+block1 {
+ parent_block: 0
+ vars: data, h1, h2, loss
+ ops: fc, fc, softmax
+}
+block2 {
+ parent_block: 1
+ vars: data_grad, h1_grad, h2_grad, loss_grad, w1_grad, w2_grad
+ ops: softmax_grad,
+ fc_grad, allreduce(places, scopes, w1_grad),
+ fc_grad, allreduce(places, scopes, w2_grad)
+}
+block3 {
+ parent_block: 0
+ vars: lr
+ ops: sgd(w2, w2_grad),
+ sgd(w1, w1_grad)
+}
+}
+```
diff --git a/doc/fluid/design/concurrent/select_op.md b/doc/fluid/design/concurrent/select_op.md
new file mode 100644
index 0000000000..52c226bc94
--- /dev/null
+++ b/doc/fluid/design/concurrent/select_op.md
@@ -0,0 +1,265 @@
+# select_op Design
+
+## Introduction
+
+In golang, the [**select**](https://golang.org/ref/spec#Select_statements)
+statement lets a goroutine wait on multiple communication operations at the
+same time. The **select** blocks until one of its cases can run, then
+executes the case. If multiple cases are ready to run, then one case is
+chosen at random to be executed.
+
+With the introduction of CSP for Paddle, we mimic this behavior by
+creating a ***select_op***.
+
+## How to use it
+
+The **select_op** is available as a C++ operator. However, most users
+will prefer to use the much simpler Python API.
+
+- **fluid.Select()**: Creates a select operator and adds it to the current
+block within the main program. Also creates a sub block and adds it to the
+main program. This sub block is used to hold all variables and operators
+used by the case statements.
+
+Within the select block, users can add cases by
+calling **select.case** or **select.default** method.
+
+- **fluid.Select.case(channel_action, channel, result_variable)**: Represents
+a fluid channel send/recv case. This method creates a SelectCase block
+guard and adds it to the Select block. The arguments into this method tells
+the select which channel operation to listen to.
+
+- **fluid.Select.default()**: Represents the fluid default case. This default
+case is executed if none of the channel send/recv cases are available to
+execute.
+
+**Example:**
+```
+ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
+
+while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
+while_op = While(cond=while_cond)
+
+with while_op.block():
+ with fluid.Select() as select:
+ with select.case(fluid.channel_send, channel, x):
+ # Send x, then perform Fibonacci calculation on x and y
+ x_tmp = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+ assign(input=x, output=x_tmp)
+ assign(input=y, output=x)
+ assign(elementwise_add(x=x_tmp, y=y), output=y)
+ with select.case(fluid.channel_recv, quit_channel, result2):
+ # Exit out of While loop
+ while_false = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False)
+ helper = layer_helper.LayerHelper('assign')
+ helper.append_op(
+ type='assign',
+ inputs={'X': [while_false]},
+ outputs={'Out': [while_cond]})
+```
+
+## How it Works
+
+### Program Description
+
+```
+blocks {
+ idx: 0
+ ...
+ // Create "case_to_execute" variable
+ ops {
+ outputs {
+ parameter: "Out"
+ arguments: "fill_constant_110.tmp_0"
+ }
+ type: "fill_constant"
+ attrs {
+ name: "force_cpu"
+ type: BOOLEAN
+ b: false
+ }
+ attrs {
+ name: "value"
+ type: FLOAT
+ f: -1.0
+ }
+ attrs {
+ name: "shape"
+ type: INTS
+ ints: 1
+ }
+ attrs {
+ name: "dtype"
+ type: INT
+ i: 2
+ }
+ }
+ // Create "select" operator.
+ // inputs:
+ // X: All input variables used by operators within the select block
+ // case_to_execute: Variable filled in by select_op when it determines
+ // which case to execute.
+ //
+ // outputs:
+ // Out: All output variables referenced by operators within select block.
+ //
+ // attrs:
+ // sub_block: The block id containing the select "cases"
+ // cases: Serialized list of all cases in the select op.
+ // Each case is serialized as: '<index>,<type>,<channel>,<value>'
+ // where type is 0 for default, 1 for send, and 2 for receive.
+ // No channel and values are needed for default cases.
+ ops {
+ inputs {
+ parameter: "X"
+ arguments: "fill_constant_103.tmp_0"
+ arguments: "fill_constant_104.tmp_0"
+ }
+ inputs {
+ parameter: "case_to_execute"
+ arguments: "fill_constant_110.tmp_0"
+ }
+ outputs {
+ parameter: "Out"
+ arguments: "fill_constant_110.tmp_0"
+ }
+ type: "select"
+ attrs {
+ name: "sub_block"
+ type: BLOCK
+ block_idx: 1
+ }
+ attrs {
+ name: "cases"
+ type: STRINGS
+ strings: "0,1,channel_101,fill_constant_109.tmp_0"
+ strings: "1,2,channel_102,fill_constant_108.tmp_0"
+ }
+ }
+ ...
+}
+```
+
+The python select API will add the **select_op** to the current block. In addition, it will
+iterate through all its case statements and add any input variables required by case statements
+into **X**. It will also create a temp variable called **case_to_execute**. This variable is
+filled in by the select_op after it has completed processing the case statements.
+
+If there are no available cases to execute (i.e. all cases are blocked on channel operations, and
+there is no default statement), then the select_op will block the current thread. The thread will
+unblock once there is a channel operation affecting one of the case statements, at which point, the
+**select_op** will set the **case_to_execute** variable to the index of the case to execute.
+
+Finally the select_op will call executor.run on the **sub_block**.
+
+```
+blocks {
+ idx: 1
+ parent_idx: 0
+ ...
+ // Fill a tensor with the case index (ie: 0,1,2,3,ect.)
+ ops {
+ outputs {
+ parameter: "Out"
+ arguments: "fill_constant_111.tmp_0"
+ }
+ type: "fill_constant"
+ attrs {
+ name: "force_cpu"
+ type: BOOLEAN
+ b: false
+ }
+ attrs {
+ name: "value"
+ type: FLOAT
+ f: 0.0
+ }
+ attrs {
+ name: "shape"
+ type: INTS
+ ints: 1
+ }
+ attrs {
+ name: "dtype"
+ type: INT
+ i: 2
+ }
+ }
+ // Create an "equal" operator to compare the case index with the "case_to_execute"
+ // tensor (which was filled in by the select op).
+ ops {
+ inputs {
+ parameter: "X"
+ arguments: "fill_constant_111.tmp_0" // case 0
+ }
+ inputs {
+ parameter: "Y"
+ arguments: "fill_constant_110.tmp_0" // case_to_execute
+ }
+ outputs {
+ parameter: "Out"
+ arguments: "equal_0.tmp_0"
+ }
+ type: "equal"
+ attrs {
+ name: "axis"
+ type: INT
+ i: -1
+ }
+ }
+ // Use the output of the "equal" operator as a condition for the "conditional_block".
+ // If the condition evaluates to true, then execute the "sub_block" (which represents
+ // the select case's body)
+ ops {
+ inputs {
+ parameter: "Params"
+ }
+ inputs {
+ parameter: "X"
+ arguments: "equal_0.tmp_0"
+ }
+ outputs {
+ parameter: "Out"
+ }
+ outputs {
+ parameter: "Scope"
+ arguments: "_generated_var_0"
+ }
+ type: "conditional_block"
+ attrs {
+ name: "is_scalar_condition"
+ type: BOOLEAN
+ b: true
+ }
+ attrs {
+ name: "sub_block"
+ type: BLOCK
+ block_idx: 4
+ }
+ }
+ ...
+ // Repeat the above operators for each case statements inside the select body
+}
+
+```
+
+Cases are represented by a **conditional_block operator**, whose condition is set as the output of
+equal(**case_to_execute**, **case_index**). Since each case index is unique in this sub-block,
+only one case will be executed.
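+
+A compact, self-contained C++ sketch of this dispatch logic (invented names, no Paddle types): the chosen index plays the role of `case_to_execute`, the comparison plays the role of the `equal` operator, and the callback plays the role of the case's `conditional_block`.
+
+```cpp
+#include <cstddef>
+#include <functional>
+#include <vector>
+
+void RunSelectedCase(std::size_t case_to_execute,
+                     const std::vector<std::function<void()>>& case_bodies) {
+  for (std::size_t case_index = 0; case_index < case_bodies.size(); ++case_index) {
+    if (case_index == case_to_execute) {  // the "equal" operator
+      case_bodies[case_index]();          // run this case's conditional_block
+    }
+  }
+}
+```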
+
+### select_op flow
+
+![](./images/select_op_workflow.png)
+
+
+
+The select algorithm is inspired by golang's select routine. Please refer to
+http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information.
+
+## Backward Pass
+
+TODO
diff --git a/doc/design/float16.md b/doc/fluid/design/data_type/float16.md
similarity index 100%
rename from doc/design/float16.md
rename to doc/fluid/design/data_type/float16.md
diff --git a/doc/design/refactor/distributed_architecture.md b/doc/fluid/design/dist_train/distributed_architecture.md
similarity index 51%
rename from doc/design/refactor/distributed_architecture.md
rename to doc/fluid/design/dist_train/distributed_architecture.md
index d9fe7d6bbb..a405cb6aaf 100644
--- a/doc/design/refactor/distributed_architecture.md
+++ b/doc/fluid/design/dist_train/distributed_architecture.md
@@ -1,4 +1,4 @@
-# Design Doc: Distributed Training Architecture
+# Design Doc: Fluid Distributed Training Architecture
## Abstract
@@ -52,8 +52,9 @@ The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the
The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
-This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
-[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md)
+This could be fixed by making the parameter server also run an IR, which can be different from the trainer side.
+For a detailed explanation, refer to this document -
+[Design Doc: Parameter Server](./parameter_server.md)
## Distributed Training Architecture
@@ -61,68 +62,111 @@ The revamped distributed training architecture can address the above discussed l
-The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
+The major components are: *Python API*, *Distributed Transpiler* and *Remote Executor*.
-### PaddlePaddle Python
+### Python API
-PaddlePaddle Python is the Python library that user's Python code invokes, to read the data. build the neural network topology, start training, etc.
+Python API is the Python library that the user's Python code invokes to read the data, build the neural network topology, and start training, etc.
```Python
-paddle.init()
-input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster
-img, label = input[0], input[1]
-hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh())
-prediction = paddle.layer.fc(input=img, size=10, act=paddle.activation.Softmax())
-cost = paddle.layer.classification_cost(input=prediction, label=label)
-optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01)
-session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1)
-for i in range(1000):
- _, cost_val = session.eval(targets=[cost, optimizer])
- print cost_val
+images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+...
+predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+optimizer = fluid.optimizer.Adam(learning_rate=0.01)
+optimizer.minimize(avg_cost)
+
+train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.mnist.train(), buf_size=500),
+ batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+for pass_id in range(10):
+ for data in train_reader():
+ loss, acc = exe.run(trainer_prog,
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost])
```
-The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
-
-#### session.eval
-
-As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation.
-The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
-
-The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
-
-### PaddlePaddle Converter
-
-The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed :
-
-1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
-
-2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP.
-
-3. Optimize the computation graph.
-
-4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user.
-
-5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries.
+The code above is a typical local training program; the "Training Program" is built using helper functions such as
+`fluid.layers.fc`. The training is done by calling `Executor.run`
+iteratively.
+
+For more details, the implementation of IR is [Program](../program.md), and `ProgramDesc` is the protobuf type.
+
+[Executor](../executor.md) simply runs the `ProgramDesc`. For local training you generally use
+`Executor` to run the program locally. For any kind of distributed training, you can use
+`RemoteExecutor` to specify the desired distributed training method with some optional arguments.
+
+### Distributed Transpiler
+
+The Distributed Transpiler automatically converts the IR (in protobuf format) to partitioned IRs. Then
+the Remote Executor dispatches the new IRs to Remote Executors across the cluster.
+Below are the steps that are followed :
+
+1. Users only need to change `Executor` to `RemoteExecutor` to change a local program into a distributed program.
+1. `RemoteExecutor` calls `Distributed Transpiler` to "transpile" user's program to several IRs representing a
+ distributed training program:
+ 1. Parse configurations from `RemoteExecutor`.
+ 1. Determine the type of distributed program, can be DataParallelism, ModelParallelism or Streaming.
+ 1. Partition the `ProgramDesc` according to type and add `send` / `recv` OP pair on the boundaries. Take
+ DataParallelism type for example, it removes the optimization operators and add a `send` OP to the
+ "trainer" role, then add the optimization operators to the parameter server role within the `recv` OP.
+1. Dispatch the partitioned graph to different `RemoteExecutor` in the cluster.
+1. `RemoteExecutor` on each node run the received `ProgramDesc` utill the end.
+
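+A hedged sketch of the transpile step, assuming the `DistributeTranspiler` API that fluid exposes (the `RemoteExecutor` described below would drive this automatically); the endpoints, trainer count and trainer id are illustrative values:
+
+```python
+import paddle.fluid as fluid
+
+# Build the local "Training Program" first (layers, cost, optimizer), then
+# transpile it into per-role programs.
+t = fluid.DistributeTranspiler()
+t.transpile(
+    trainer_id=0,                  # id of this trainer
+    pservers="127.0.0.1:6174",     # comma-separated parameter server endpoints
+    trainers=2)                    # total number of trainers
+
+pserver_prog = t.get_pserver_program("127.0.0.1:6174")  # program for the pserver role
+trainer_prog = t.get_trainer_program()                   # program for the trainer role
+```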
+
+### RemoteExecutor
+
+As shown in the graph, `RemoteExecutor.run` sends the IR to the cluster for execution.
+You can also use the `fetch_list` parameter to interactively fetch variables back to the local machine, e.g.,
+for log printing.
+
+The Python `RemoteExecutor` is derived from the `Executor` class.
+
+```python
+exe = RemoteExecutor(
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost],
+ job_desc=JobDesc(
+ jobname,
+ num_trainer,
+ num_pserver,
+ cpu_per_trainer,
+ gpu_per_trainer,
+ mem_per_trainer,
+ cpu_per_pserver,
+ mem_per_pserver
+ ))
+for data in train_reader():
+    loss = exe.run(trainer_prog,
+                   feed=feeder.feed(data),
+                   fetch_list=[avg_cost])
+```
-6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+The `JobDesc` object describes the distributed job resource specification to run on
+the cluster environment.
-7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python.
+
-The output IRs will be cached to optimize the conversion latency.
+`RemoteExecutor.run` sends the `ProgramDesc` and
+[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
+to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
+for starting the final Kubernetes Jobs that run the different roles of the `ProgramDesc` from the `ConfigMap`.
-#### Placement Algorithm
+### Placement Algorithm
Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
-### PaddlePaddle Runtime
-
-The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
-
-
### Local Training Architecture
The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
@@ -132,9 +176,18 @@ The local training architecture will be the same as the distributed training arc
### Training Data
-In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic.
-
-When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs.
+In PaddlePaddle v0.10.0, training data is typically read
+with a [data reader](../reader/README.md) from Python. This approach is
+no longer efficient when training in a distributed fashion, since the Python
+process no longer runs on the same node as the trainer processes:
+the Python reader would need to read from the distributed filesystem
+(assuming it has access) and send the data to the trainers, doubling the
+network traffic.
+
+When doing distributed training, the user can still use the Python data
+reader: the training data are sent with `Executor.run`. However, this should
+be used for debugging purposes only. Users are encouraged to use
+the read data OPs.
## References:
diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
new file mode 100644
index 0000000000..e543adf0f9
--- /dev/null
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@@ -0,0 +1,128 @@
+## Design Doc: Distributed Lookup Table Operator
+
+This document describes a lookup table operator in PaddlePaddle whose table could be too
+large to fit in the memory of a single computer.
+
+## Background
+
+A lookup table operator is widely used in deep learning for learning the
+representation, or the
+[*embedding*](http://www.cs.toronto.edu/~fritz/absps/ieee-lre.pdf), of
+symbols.
+
+### The Forward Algorithm
+
+The forward algorithm of the lookup table is a multiplication of the
+input vector x and the lookup table matrix W:
+
+$$y = x * W$$
+
+When x is a sparse vector of symbols, the above multiplication
+simplifies into looking up the rows in W that correspond to the symbols in x,
+denoted by W(x). Please be aware that W could be huge and not fit in
+memory, so we need a distributed storage service that supports the
+lookup of rows.
+
+The following figure illustrates the multiplication of x with two
+non-zero elements, or say, two symbols, and a lookup table W:
+
+
+
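+A small numerical check of this equivalence (illustrative numpy code, not Paddle code): when each row of x is one-hot, x * W reduces to selecting rows of W.
+
+```python
+import numpy as np
+
+vocab_size, emb_dim = 5, 3
+W = np.random.rand(vocab_size, emb_dim)        # the lookup table
+
+symbols = [1, 4]                                # the two non-zero symbols in x
+x = np.zeros((len(symbols), vocab_size))
+x[np.arange(len(symbols)), symbols] = 1.0       # two one-hot rows
+
+y_matmul = x.dot(W)                             # y = x * W
+y_lookup = W[symbols]                           # W(x): look up the rows directly
+assert np.allclose(y_matmul, y_lookup)
+```
+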
+### The Backward Algorithm
+
+The backward algorithm computes W'(x) using W(x). W'(x) has the same
+size as W(x) and is much smaller than W.
+
+To optimize W given W', we can do a simple SGD update:
+
+$$W = W - \lambda * W'$$
+
+or use a more sophisticated algorithm where the update relies on both W' and W:
+
+$$W = f(W, W')$$
+
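+Continuing the numpy illustration from the forward section (again illustrative only), the SGD update only needs to touch the rows W(x) that were looked up, because W'(x) is zero everywhere else:
+
+```python
+import numpy as np
+
+lr = 0.01                                      # the learning rate, lambda above
+W = np.random.rand(5, 3)                       # a small, in-memory stand-in for the table
+symbols = [1, 4]                               # the rows that were looked up, W(x)
+grad_rows = np.random.rand(len(symbols), 3)    # W'(x), gradients for those rows only
+
+W[symbols] -= lr * grad_rows                   # W = W - lambda * W', sparse rows only
+```
+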
+The following figure illustrates the backward pass of the lookup
+operator: 
+
+## Distributed Storage Service
+
+The forward algorithm requires a distributed storage service for W.
+The backward algorithm prefers that the storage system can apply the
+optimization algorithm on W. The following two sections describe two
+solutions -- the former doesn't require that the storage service can
+do optimization, the latter does.
+
+### Storage Service Doesn't Optimize
+
+In this design, we use highly-optimized distributed storage, e.g.,
+memcached, as the storage service, and we run the optimization
+algorithm on parameter servers of PaddlePaddle. The following figure
+illustrates the training process.
+
+
+
+
+
+Each trainer runs the forward and backward passes using their local
+data:
+
+1. In the forward pass, when a trainer runs the forward algorithm of a
+ lookup operator, it retrieves W(x) from the storage service.
+1. The trainer computes W'(x) in the backward pass using W(x).
+
+During the global update process:
+
+1. Each trainer uploads its W'(x) to parameter servers.
+1. The parameter server runs the optimization algorithm, e.g., the
+ Adam optimization algorithm, which requires that
+ 1. The parameter server retrieves W(x) from memcached, and
+   1. The parameter server pushes $\Delta W(x)=f(W(x), \lambda \sum_j
+      W'(x))$ to memcached, where $f$ denotes the optimization
+      algorithm.
+
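+A runnable toy sketch of this flow (none of the names below are real Paddle or memcached APIs; a plain dict stands in for the storage service and the gradient computation is a placeholder):
+
+```python
+import numpy as np
+
+# Stand-in for the distributed storage service: one 3-d row of W per symbol id.
+kv = {i: np.random.rand(3) for i in range(5)}
+
+def trainer_step(symbol, upstream_grad):
+    w_row = kv[symbol]                    # forward: retrieve W(x) from storage
+    grad_row = upstream_grad * w_row      # backward: a toy W'(x) computed locally
+    return symbol, grad_row               # uploaded to the parameter server
+
+def pserver_update(symbol, grad_rows, lr=0.01):
+    w_row = kv[symbol]                               # pserver retrieves W(x) from storage
+    kv[symbol] = w_row - lr * np.sum(grad_rows, 0)   # apply f and push the result back
+
+sym, g = trainer_step(2, 0.5)
+pserver_update(sym, [g])
+```
+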
+### Storage Service Does Optimize
+
+This design is very similar to the above one, except that the
+optimization algorithm $f$ runs on the storage service.
+
+- Pro: parameter servers do not retrieve W(x) from the storage
+  service, which saves half of the network communication.
+- Con: the storage service needs to be able to run the optimization
+ algorithm.
+
+## Conclusion
+
+Let us do the "storage service does not optimize" solution first, as a
+baseline at least, because it is easier to use a well-optimized
+distributed storage service like memcached. We can do the "storage
+service does optimize" solution later or at the same time, which, if
+implemented carefully, should have better performance than the former.
diff --git a/doc/fluid/design/dist_train/multi_cpu.md b/doc/fluid/design/dist_train/multi_cpu.md
new file mode 100644
index 0000000000..a8d8ee0422
--- /dev/null
+++ b/doc/fluid/design/dist_train/multi_cpu.md
@@ -0,0 +1,43 @@
+# Design Doc: Execute the Program with Multi CPU
+
+## Abstract
+
+This design doc proposes an approach to run the user-defined Op graph
+on multiple CPUs: we will use an auto transpiler to convert the user-defined
+Op graph to a multi-CPU Op graph, and use the `ParallelDo` Op to run the graph.
+
+## Transpiler
+
+
+
+After conversion:
+
+
+
+## Implementation
+
+- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph
+  which will be executed with multiple threads.
+- `BlockingCounter` will `Init/Decrement` an atomic counter, and block in `Wait`
+  until the atomic counter becomes `0`:
+  ```cpp
+  BlockingCounter bc(thread_count);
+  for (int i = 0; i < thread_count; ++i) {
+    thread_pool->Start([&bc] { bc.DecrementCount(); });
+  }
+  bc.Wait();
+  ```
+- `ParallelDo` Operator (a toy sketch follows this list)
+  - Initialize a thread pool which is a Singleton.
+  - Use a block id as the input, and run the specified Block on an independent scope
+    with multiple threads.
+  - Initialize a `BlockingCounter` instance and wait until all threads are done.
+- `Split` Operator will split the Input Tensor into a TensorArray.
+- `Merge` merges all the gradients calculated in different threads
+  with a `mean/sum/max/min...` method, and then runs the Optimizer Op to optimize `W`.
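+
+A toy Python sketch of the `Split` / `ParallelDo` / `Merge` flow described above (threads and scopes are simulated with plain Python and numpy; this is not the Paddle implementation):
+
+```python
+import threading
+import numpy as np
+
+def parallel_do(block_fn, inputs, thread_count=2):
+    chunks = np.array_split(inputs, thread_count)   # Split: input tensor -> TensorArray
+    grads = [None] * thread_count
+
+    def worker(i):
+        grads[i] = block_fn(chunks[i])              # run the block on its own chunk/scope
+
+    threads = [threading.Thread(target=worker, args=(i,)) for i in range(thread_count)]
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join()                                    # BlockingCounter-style wait
+    return np.mean(grads, axis=0)                   # Merge: mean of the per-thread gradients
+
+grad_W = parallel_do(lambda x: x.sum(axis=0), np.random.rand(8, 3))
+print(grad_W)
+```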
+
+## TODO
+
+- Improve the optimizer stage with multiple threads, since we could
+  assign the parameters to different threads and execute the
+  optimizer with multiple threads.
diff --git a/doc/fluid/design/dist_train/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md
new file mode 100644
index 0000000000..6ce48dfbfc
--- /dev/null
+++ b/doc/fluid/design/dist_train/parameter_server.md
@@ -0,0 +1,107 @@
+# Design Doc: Parameter Server
+
+## Abstract
+
+We propose an approach to implement the parameter server. In this
+approach, there is no fundamental difference between the trainer and
+the parameter server: they both run subgraphs, but subgraphs of
+different purposes.
+
+## Background
+
+The previous implementations of the parameter server do not run a
+fluid sub-program. Parameter initialization, optimizer computation, network
+communication and checkpointing are implemented twice, on both the
+trainer and the parameter server.
+
+It would be great if we could write the code once and use it on both the
+trainer and the parameter server, since this reduces code duplication and
+improves extensibility. Given that after the current refactoring, we are
+representing everything as a computation graph on the
+trainer, representing everything as a computation graph on the parameter
+server becomes a natural extension.
+
+## Design
+
+### Distributed Transpiler
+
+The *Distributed Transpiler* converts the user-defined fluid program
+into sub-programs to be scheduled on different nodes with the following
+steps:
+
+1. OP placement: the OPs will be placed on different nodes according
+ to a heuristic that minimizes the estimated total computation
+ time. Currently we will use a simple heuristic that puts parameter
+ variable on parameter server workers and everything else on trainer
+ workers.
+1. Add communication OPs to enable the communication between nodes.
+
+We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
+
+Below is an example of converting the user defined graph to the
+subgraphs for the trainer and the parameter server:
+
+
+
+After converting:
+
+
+
+1. The parameter variable W and its optimizer program are placed on the parameter server.
+1. Operators are added to the program.
+   - *Send* sends data to the connected *Recv* operator. The
+     scheduler on the receiving node will only schedule the *Recv* operator
+     to run when the *Send* operator has run (the *Send* OP will mark
+     the *Recv* OP runnable automatically).
+   - *Enqueue* enqueues the input variable; it can block until space
+     becomes available in the queue.
+ - *Dequeue* outputs configurable numbers of tensors from the
+ queue. It will block until the queue has the required number of
+ tensors.
+
+### Sparse Update
+
+For embedding layers, the gradient may have many rows containing only 0 during training.
+If the gradient uses a dense tensor to do parameter optimization,
+it could waste memory, slow down the calculations and waste
+bandwidth while doing distributed training.
+In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list of rows containing
+non-zero gradient data. So when we do parameter optimization, both locally and remotely,
+we only need to send those non-zero rows to the optimizer operators:
+
+
+
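+A conceptual sketch of what a SelectedRows-style gradient carries versus a dense gradient for an embedding table (names and sizes are illustrative only):
+
+```python
+import numpy as np
+
+vocab_size, emb_dim = 10000, 8
+dense_grad = np.zeros((vocab_size, emb_dim))    # dense gradient: mostly zero rows
+
+rows = [3, 42, 999]                             # rows that actually received gradient
+values = np.random.rand(len(rows), emb_dim)     # only the non-zero rows are stored
+dense_grad[rows] = values
+
+# Sending (rows, values) instead of dense_grad shrinks the payload from
+# vocab_size rows to len(rows) rows.
+selected_rows_payload = (rows, values)
+```
+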
+### Benefits
+
+- Model parallelism becomes easier to implement: it is an extension to
+ the trainer - parameter server approach. We can have several "Transpilers"
+ to achieve different goals.
+- User-defined optimizer is easier to add - user can now express it as
+ a sub-program.
+- No more duplicated logic inside the trainer and the parameter
+  server, as mentioned in the background section.
+
+### Challenges
+
+- It is important to balance the parameter shards on multiple
+ parameter servers. If a single parameter is very big (for example: some
+ word-embedding, fully connected, softmax layer), we need to
+ automatically partition the single parameter onto different
+ parameter servers when possible (only element-wise optimizer depends
+ on the parameter variable).
+- In the "Async SGD" figure, the "W" variable on the parameter server
+ could be read and written concurrently. See
+ [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
+ details about concurrent program in Fluid.
+
+### Discussion
+
+- Can the Enqueue OP be implemented under our current tensor design
+ (put the input tensor into the queue tensor)?
+- *Dequeue* OP will have variable numbers of output (depending on the
+ `min_count` attribute), does our current design support it? (similar
+ question for the *Add* OP)
+
+### References
+
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
diff --git a/doc/design/refactor/src/compiler.graffle b/doc/fluid/design/dist_train/src/compiler.graffle
similarity index 100%
rename from doc/design/refactor/src/compiler.graffle
rename to doc/fluid/design/dist_train/src/compiler.graffle
diff --git a/doc/design/refactor/src/compiler.png b/doc/fluid/design/dist_train/src/compiler.png
similarity index 100%
rename from doc/design/refactor/src/compiler.png
rename to doc/fluid/design/dist_train/src/compiler.png
diff --git a/doc/design/refactor/src/dist-graph.graffle b/doc/fluid/design/dist_train/src/dist-graph.graffle
similarity index 100%
rename from doc/design/refactor/src/dist-graph.graffle
rename to doc/fluid/design/dist_train/src/dist-graph.graffle
diff --git a/doc/design/refactor/src/dist-graph.png b/doc/fluid/design/dist_train/src/dist-graph.png
similarity index 100%
rename from doc/design/refactor/src/dist-graph.png
rename to doc/fluid/design/dist_train/src/dist-graph.png
diff --git a/doc/fluid/design/dist_train/src/distributed_architecture.graffle b/doc/fluid/design/dist_train/src/distributed_architecture.graffle
new file mode 100644
index 0000000000..d1b6014134
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_architecture.graffle differ
diff --git a/doc/fluid/design/dist_train/src/distributed_architecture.png b/doc/fluid/design/dist_train/src/distributed_architecture.png
new file mode 100644
index 0000000000..29c7b0c078
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_architecture.png differ
diff --git a/doc/design/refactor/src/local-graph.graffle b/doc/fluid/design/dist_train/src/local-graph.graffle
similarity index 100%
rename from doc/design/refactor/src/local-graph.graffle
rename to doc/fluid/design/dist_train/src/local-graph.graffle
diff --git a/doc/design/refactor/src/local-graph.png b/doc/fluid/design/dist_train/src/local-graph.png
similarity index 100%
rename from doc/design/refactor/src/local-graph.png
rename to doc/fluid/design/dist_train/src/local-graph.png
diff --git a/doc/fluid/design/dist_train/src/local_architecture.graffle b/doc/fluid/design/dist_train/src/local_architecture.graffle
new file mode 100644
index 0000000000..49fcc663eb
Binary files /dev/null and b/doc/fluid/design/dist_train/src/local_architecture.graffle differ
diff --git a/doc/fluid/design/dist_train/src/local_architecture.png b/doc/fluid/design/dist_train/src/local_architecture.png
new file mode 100644
index 0000000000..14adc9fd72
Binary files /dev/null and b/doc/fluid/design/dist_train/src/local_architecture.png differ
diff --git a/doc/fluid/design/dist_train/src/lookup_table.png b/doc/fluid/design/dist_train/src/lookup_table.png
new file mode 100644
index 0000000000..72dfe3547f
Binary files /dev/null and b/doc/fluid/design/dist_train/src/lookup_table.png differ
diff --git a/doc/fluid/design/dist_train/src/lookup_table_training.png b/doc/fluid/design/dist_train/src/lookup_table_training.png
new file mode 100644
index 0000000000..cc7cc4aeb3
Binary files /dev/null and b/doc/fluid/design/dist_train/src/lookup_table_training.png differ
diff --git a/doc/fluid/design/dist_train/src/multi-threads.graffle b/doc/fluid/design/dist_train/src/multi-threads.graffle
new file mode 100644
index 0000000000..e71173715f
Binary files /dev/null and b/doc/fluid/design/dist_train/src/multi-threads.graffle differ
diff --git a/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png b/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png
new file mode 100644
index 0000000000..e40a869987
Binary files /dev/null and b/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png differ
diff --git a/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png b/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png
new file mode 100644
index 0000000000..4083aebfdd
Binary files /dev/null and b/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png differ
diff --git a/doc/design/refactor/src/paddle-compile.graffle b/doc/fluid/design/dist_train/src/paddle-compile.graffle
similarity index 100%
rename from doc/design/refactor/src/paddle-compile.graffle
rename to doc/fluid/design/dist_train/src/paddle-compile.graffle
diff --git a/doc/design/refactor/src/paddle-compile.png b/doc/fluid/design/dist_train/src/paddle-compile.png
similarity index 100%
rename from doc/design/refactor/src/paddle-compile.png
rename to doc/fluid/design/dist_train/src/paddle-compile.png
diff --git a/doc/fluid/design/dist_train/src/remote_executor.graffle b/doc/fluid/design/dist_train/src/remote_executor.graffle
new file mode 100644
index 0000000000..41b2067311
Binary files /dev/null and b/doc/fluid/design/dist_train/src/remote_executor.graffle differ
diff --git a/doc/fluid/design/dist_train/src/remote_executor.png b/doc/fluid/design/dist_train/src/remote_executor.png
new file mode 100644
index 0000000000..744e2fb2e0
Binary files /dev/null and b/doc/fluid/design/dist_train/src/remote_executor.png differ
diff --git a/doc/fluid/design/dist_train/src/sparse_update.graffle b/doc/fluid/design/dist_train/src/sparse_update.graffle
new file mode 100644
index 0000000000..08d689a58f
Binary files /dev/null and b/doc/fluid/design/dist_train/src/sparse_update.graffle differ
diff --git a/doc/fluid/design/dist_train/src/sparse_update.png b/doc/fluid/design/dist_train/src/sparse_update.png
new file mode 100644
index 0000000000..8c872e6ac4
Binary files /dev/null and b/doc/fluid/design/dist_train/src/sparse_update.png differ
diff --git a/doc/design/ops/images/2_level_rnn.dot b/doc/fluid/design/dynamic_rnn/2_level_rnn.dot
similarity index 100%
rename from doc/design/ops/images/2_level_rnn.dot
rename to doc/fluid/design/dynamic_rnn/2_level_rnn.dot
diff --git a/doc/design/ops/images/2_level_rnn.png b/doc/fluid/design/dynamic_rnn/2_level_rnn.png
similarity index 100%
rename from doc/design/ops/images/2_level_rnn.png
rename to doc/fluid/design/dynamic_rnn/2_level_rnn.png
diff --git a/doc/design/ops/images/rnn.dot b/doc/fluid/design/dynamic_rnn/rnn.dot
similarity index 100%
rename from doc/design/ops/images/rnn.dot
rename to doc/fluid/design/dynamic_rnn/rnn.dot
diff --git a/doc/design/ops/images/rnn.jpg b/doc/fluid/design/dynamic_rnn/rnn.jpg
similarity index 100%
rename from doc/design/ops/images/rnn.jpg
rename to doc/fluid/design/dynamic_rnn/rnn.jpg
diff --git a/doc/design/ops/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md
similarity index 95%
rename from doc/design/ops/rnn.md
rename to doc/fluid/design/dynamic_rnn/rnn.md
index 2f4854793f..6f414e5549 100644
--- a/doc/design/ops/rnn.md
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@@ -5,7 +5,7 @@ This document describes the RNN (Recurrent Neural Network) operator and how it i
## RNN Algorithm Implementation
-
+
The above diagram shows an RNN unrolled into a full network.
@@ -22,7 +22,7 @@ There are several important concepts here:
There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
-
+
Figure 2 illustrates the RNN's data flow
@@ -49,7 +49,7 @@ or copy the memory value of the previous step to the current ex-memory variable.
### Usage in Python
-For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md).
We can define an RNN's step-net using a Block:
@@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to par
The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
-
+
```python
@@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time st
-
+
diff --git a/doc/design/ops/images/rnn.png b/doc/fluid/design/dynamic_rnn/rnn.png
similarity index 100%
rename from doc/design/ops/images/rnn.png
rename to doc/fluid/design/dynamic_rnn/rnn.png
diff --git a/doc/design/ops/images/rnn_2level_data.dot b/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot
similarity index 100%
rename from doc/design/ops/images/rnn_2level_data.dot
rename to doc/fluid/design/dynamic_rnn/rnn_2level_data.dot
diff --git a/doc/design/ops/images/rnn_2level_data.png b/doc/fluid/design/dynamic_rnn/rnn_2level_data.png
similarity index 100%
rename from doc/design/ops/images/rnn_2level_data.png
rename to doc/fluid/design/dynamic_rnn/rnn_2level_data.png
diff --git a/paddle/operators/rnn_design.md b/doc/fluid/design/dynamic_rnn/rnn_design.md
similarity index 100%
rename from paddle/operators/rnn_design.md
rename to doc/fluid/design/dynamic_rnn/rnn_design.md
diff --git a/doc/design/if_else_op.md b/doc/fluid/design/execution/if_else_op.md
similarity index 100%
rename from doc/design/if_else_op.md
rename to doc/fluid/design/execution/if_else_op.md
diff --git a/doc/fluid/design/execution/switch.md b/doc/fluid/design/execution/switch.md
new file mode 100644
index 0000000000..827d0601c6
--- /dev/null
+++ b/doc/fluid/design/execution/switch.md
@@ -0,0 +1,31 @@
+### Design Doc: Switch
+
+### Background
+
+Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
+
+The following example shows the usage of `fluid.switch`.
+
+```python
+a = fluid.Var(10)
+b = fluid.Var(0)
+
+with switch() as switch:
+ with switch.case(fluid.less_equal(a, 10)):
+ fluid.print("Case 1")
+ with switch.case(fluid.larger(a, 0)):
+ fluid.print("Case 2")
+ with switch.default():
+ fluid.print("Case 3")
+```
+
+### The Semantics
+
+1. A `switch` control-flow checks cases one-by-one.
+1. The condition of each case is a scalar boolean value, unlike the `fluid.if_else` control-flow, whose condition could be a vector of boolean values.
+1. It runs the first matched case, or the default case if there is one.
+1. Once it matches a case, it runs the corresponding branch and only that branch. It is as if there were a C-style `break` at the end of each case.
+
+The above program should print, and only print, "Case 1".
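+
+For reference, the plain-Python equivalent of the example above, which pins down the intended semantics (exactly one branch runs, the first whose condition holds):
+
+```python
+a, b = 10, 0
+
+if a <= 10:
+    print("Case 1")
+elif a > 0:
+    print("Case 2")
+else:
+    print("Case 3")
+```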
+
+The implementation of the backward pass of the `switch` control-flow is easier than the backward pass of `if_else`, because `switch` runs at most one branch, whereas `if_else` could run more than one branch.
diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst
new file mode 100644
index 0000000000..f1887be690
--- /dev/null
+++ b/doc/fluid/design/index_cn.rst
@@ -0,0 +1,2 @@
+设计思想
+------------
diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst
new file mode 100644
index 0000000000..18a4b4122f
--- /dev/null
+++ b/doc/fluid/design/index_en.rst
@@ -0,0 +1,2 @@
+Design
+------------
diff --git a/doc/design/multi_language_interface/00.why_plain_c.md b/doc/fluid/design/interface/00.why_plain_c.md
similarity index 100%
rename from doc/design/multi_language_interface/00.why_plain_c.md
rename to doc/fluid/design/interface/00.why_plain_c.md
diff --git a/doc/design/multi_language_interface/01.inference_implementation.md b/doc/fluid/design/interface/01.inference_implementation.md
similarity index 100%
rename from doc/design/multi_language_interface/01.inference_implementation.md
rename to doc/fluid/design/interface/01.inference_implementation.md
diff --git a/paddle/memory/README.md b/doc/fluid/design/memory/README.md
similarity index 91%
rename from paddle/memory/README.md
rename to doc/fluid/design/memory/README.md
index 6cb003c50b..7cf61d089b 100644
--- a/paddle/memory/README.md
+++ b/doc/fluid/design/memory/README.md
@@ -12,13 +12,13 @@ p = memory::Alloc(platform::CPUPlace(), 4*1024);
To allocate 4KB memory on the 3rd GPU:
```cpp
-p = memory::Alloc(platform::GPUPlace(2), 4*1024);
+p = memory::Alloc(platform::CUDAPlace(2), 4*1024);
```
To free memory and check the so-far used amount of memory on a place:
```cpp
-auto pl = platform::GPUPlace(0);
+auto pl = platform::CUDAPlace(0);
p = memory::Alloc(pl, 4*1024);
cout << memory::Used(pl);
memory::Free(pl, p);
@@ -36,7 +36,7 @@ template size_t Used(Place);
} // namespace memory
```
-These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
+These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`:
```cpp
template<>
@@ -49,7 +49,7 @@ and
```cpp
template<>
-void Alloc(GPUPlace p, size_t size) {
+void Alloc(CUDAPlace p, size_t size) {
return GetGPUBuddyAllocator(p.id)->Alloc(size);
}
```
@@ -122,7 +122,7 @@ There are two implementations of `Context`:
1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
-1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
+1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
### Majel
diff --git a/doc/fluid/design/memory/images/control_flow_graph.png b/doc/fluid/design/memory/images/control_flow_graph.png
new file mode 100644
index 0000000000..3579998e58
Binary files /dev/null and b/doc/fluid/design/memory/images/control_flow_graph.png differ
diff --git a/doc/fluid/design/memory/images/dataflow_equations.png b/doc/fluid/design/memory/images/dataflow_equations.png
new file mode 100644
index 0000000000..c10f7f69f4
Binary files /dev/null and b/doc/fluid/design/memory/images/dataflow_equations.png differ
diff --git a/doc/fluid/design/memory/images/deep_learning.png b/doc/fluid/design/memory/images/deep_learning.png
new file mode 100644
index 0000000000..026becc4d9
Binary files /dev/null and b/doc/fluid/design/memory/images/deep_learning.png differ
diff --git a/doc/fluid/design/memory/memory_optimization.md b/doc/fluid/design/memory/memory_optimization.md
new file mode 100644
index 0000000000..285464ada7
--- /dev/null
+++ b/doc/fluid/design/memory/memory_optimization.md
@@ -0,0 +1,217 @@
+# Memory Optimization
+
+
+## Problem
+
+In a lecture from Andrew Ng, he attributes the recent success of AI to a combination of these factors:
+
+- Availability of Big Data
+- Supercomputing power to process this Big Data over very large neural networks
+- Modern algorithms
+
+The following graph shows the details:
+
+
+
+Larger models usually bring better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary for both online and mobile inference.
+
+## Solution
+
+### Basic Strategy
+
+There are some basic strategies to improve memory usage, including in-place operations and memory sharing.
+
+#### In-place Operation
+In a relu activation operator:
+
+$y = \max(x, 0)$
+
+If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x will be the same. In-place operations will save 50% memory occupancy immediately.
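+
+For illustration, an in-place relu written with numpy (not Paddle code): y reuses x's buffer instead of allocating a new one.
+
+```python
+import numpy as np
+
+x = np.random.randn(4)
+np.maximum(x, 0, out=x)   # y = max(x, 0), written back into x's own memory block
+```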
+
+#### Memory Sharing
+
+Not all operators support in-place operations. Memory sharing is a more general strategy.
+
+Following is an example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a to a memory pool. Then, variable e can share the memory of variable a from the pool.
+
+
+### Live Variable Analysis
+
+It's not enough to only have some basic strategies. The pre-requisite of memory optimization is to know if a variable is still "live" after an operation.
+
+In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation.
+
+In compilers, the front end of the compiler translates programs into an intermediate language with an unbounded number of temporary variables. This program must run on a machine with a bounded number of registers. Two temporary variables a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporary variables can fit in few registers; if they don't all fit, the excess temporary variables can be kept in memory.
+
+Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporary variables are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis.
+
+We can learn these techniques from compilers. There are mainly two stages in live variable analysis:
+
+- construct a control flow graph
+- solve the dataflow equations
+
+
+#### Control Flow Graph
+To perform analysis on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statement x can be followed by statement y, there is an edge from x to y.
+
+The following is the flow graph for a simple loop.
+
+
+
+#### Dataflow Analysis
+
+Liveness of variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
+
+A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
+
+- Flow Graph Terminology
+
+A flow graph node has out-edges that lead to successor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of successors.
+In the control flow graph above, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}.
+
+- Uses and Defs
+
+An assignment to a variable or temporary defines that variable. An occurrence of a variable on the right-hand side of an assignment (or in other expressions) uses the variable. We can define the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and similarly for the *use* of a variable or graph node. In the control flow graph above, *def(3)* = {c}, *use(3)* = {b, c}.
+
+- Liveness
+
+A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node.
+
+
+The calculation of liveness can be solved by iteration until a fixed point is reached. The following is the recursive formula:
+
+
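+For reference, the standard dataflow equations for liveness (cf. the Appel book cited in the references below) are:
+
+$$in[n] = use[n] \cup (out[n] - def[n])$$
+
+$$out[n] = \bigcup_{s \in succ[n]} in[s]$$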
+
+### Memory optimization transpiler
+
+Finally, we take the basic strategies and the liveness analysis techniques learned from compilers to implement our memory optimization transpiler.
+
+#### Add in-place attribute
+
+In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator.
+
+
+#### Construct control flow graph
+
+Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py) example.
+
+- Block0:
+
+```
+lookup_table
+mul
+...
+while(sub-block idx 1)
+...
+array_to_lod_tensor
+cross_entropy
+...
+while_grad(sub-block idx 2)
+read_from_array
+array_to_lod_tensor
+...
+```
+
+- Block1
+
+```
+read_from_array
+read_from_array
+...
+write_to_array
+increment
+write_to_array
+less_than
+```
+
+- Block2
+
+```
+read_from_array
+increment
+...
+write_to_array
+write_to_array
+```
+
+We can traverse all the operators and variables in the ProgramDesc to build a control flow graph.
+
+```python
+from collections import defaultdict
+
+
+class ControlFlowGraph(object):
+    def __init__(self, Program):
+        self._successors = defaultdict(set)
+        self._predecessors = defaultdict(set)
+ self._uses = defaultdict(set)
+ self._defs = defaultdict(set)
+ self._live_in = defaultdict(set)
+ self._live_out = defaultdict(set)
+ self._program = Program
+
+ def build(self):
+ pass
+
+ def dataflow_analysis(self):
+ pass
+
+ def memory_optimization(self):
+ pass
+
+ def get_program(self):
+ return self._program
+```
+
+#### Make dataflow analysis
+
+We follow the guidance from compilers and try to solve the dataflow equations to get the liveness of every variable. If the live-in set of an operator node differs from its live-out set, then we can perform memory sharing.
+
+For example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+The dataflow analysis result is:
+
+```
+live_in(op1) = {b, c, f}
+live_out(op1) = {a, f}
+
+live_in(op2) = {a, f}
+live_out(op2) = {d, f}
+
+live_in(op3) = {d, f}
+live_out(op3) = {}
+```
+
+After op1, we can release the memory of variables b and c; after op2, we can release variable a; after op3, we can release variables d and f.
+
+#### Memory sharing policy
+
+A memory pool will be maintained during the memory optimization stage. Each operator node will be scanned to determine whether memory optimization can be applied. If an operator satisfies the requirement, the following policy will be taken to handle its input/output variables.
+
+```
+if op.support_inplace():
+ i --> pool
+ pool --> o
+else:
+ pool --> o
+ i --> pool
+```
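+
+A toy illustration of the policy above (illustrative Python, not the transpiler code): a dying input's memory block goes into the pool, and an output tries to take a block from the pool; the ordering decides whether the output may reuse the input's own block.
+
+```python
+def assign_output_block(op_supports_inplace, input_block, pool):
+    """Return the memory block to reuse for the output, or None for a fresh one."""
+    if op_supports_inplace:
+        pool.append(input_block)              # i --> pool
+        return pool.pop()                     # pool --> o (may reuse i's block)
+    else:
+        out = pool.pop() if pool else None    # pool --> o (cannot be i's block)
+        pool.append(input_block)              # i --> pool
+        return out
+
+pool = []
+print(assign_output_block(True, "block_of_x", pool))   # relu-style op: y reuses x's block
+print(assign_output_block(False, "block_of_a", pool))  # op2: d gets a fresh block, a's block enters the pool
+print(assign_output_block(False, "block_of_d", pool))  # op3: e reuses a's block from the pool
+```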
+
+
+
+## Reference
+
+- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5)
+- Modern compiler implementation in ML, by Andrew W. Appel
+- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html)
diff --git a/doc/fluid/design/modules/backward.md b/doc/fluid/design/modules/backward.md
new file mode 100644
index 0000000000..20fda7a98f
--- /dev/null
+++ b/doc/fluid/design/modules/backward.md
@@ -0,0 +1,158 @@
+# Backward Building
+
+## Motivation
+
+In neural networks, most models are currently solved by the backpropagation algorithm (known as **BP**). Technically, BP calculates the gradient of the loss function, then propagates it back through the network following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So a mechanism is required by the framework which can complete the model's backward part automatically according to the given forward part.
+
+When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes the gradients of its corresponding `op`'s outputs and calculates the gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op` and then strings them together in the reverse order of the forward part. In this way, gradients spread from the end to the beginning of the model, in other words, from the loss to the parameters.
+
+## Challenges
+
+The motivation of backward building is apparent. However, implementing it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested. It means that the `op`s and `variable`s are scattered across different blocks rather than all gathered in a single graph. Our backward building algorithm shall visit blocks recursively and be able to insert `grad_op`s and newly created `variable`s into the right places.
+
+## Usage
+
+Although the whole algorithm is comprised of many functions, only one is exposed as the API:
+
+```python
+def append_backward(loss, parameter_list=None, no_grad_set=None):
+ """
+ Append backward part to main_program
+
+ Args:
+ loss(Variable): The variable generated by the cost function.
+ parameter_list(list): Parameters that need to be updated by optimizers.
+ If None, it means all parameters need to be updated.
+
+ no_grad_set(set): Variables that have no gradients in Block 0.
+ If None, the set will be generated inside the function and
+ contains all variables with `step_gradient=True` from all blocks.
+
+ Return:
+ (list[Variable]): list of (parameters, gradients) pair.
+ """
+```
+
+By invoking this API, the framework appends the backward part to the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient is generated and backpropagation starts. `parameter_list` marks all the parameters that need updating. If it's `None`, all parameters will be updated by optimizers. `no_grad_set` marks variables without gradients. If all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run.
+
+This API will be invoked automatically before optimizer building.
+As a result, in most cases, users do not need to invoke the API by themselves to append backward part.
+
+## Implementation
+
+The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables.
+
+### Creating `grad_op`s
+
+The creating of `grad_op`s is implemented by:
+
+```python
+def _append_backward_ops_(target,
+ block,
+ target_block,
+ no_grad_dict,
+ grad_to_var):
+ """
+ Create all grad ops, and insert them into given block
+
+ Args:
+ target(Variable): the target variable of forward pass
+ block(Block): the block where forward ops are
+ target_block(Block): the block which is going to hold new generated grad ops
+ no_grad_dict(dict):
+ key(int) block index
+ val(set) a set of varibale names. These varibales have no gradient
+ grad_to_var(dict)(output argument):
+ key(str): grad variable name
+ val(str): corresponding forward variable name
+ """
+```
+
+Given a `block`, the function traverses all `op`s in this block in reverse order, gets the corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, then appends it to `target_block`.
+
+However, some specific `op`s (e.g. `while_op`, `if_else_op`) can hold their own sub-blocks. Since these sub-blocks contain `op`s as well, the `grad_op` creation should be recursive.
+
+During the reverse traversal, we check each `op` whether it has an attribute named `sub_block`. If so, it means there is a sub-block and we need to deal with it first. After creating a new block whose father is the one in `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to parameter `target_block` and the one in `op`'s attribute to `block`. The *pseudo-code* shows this process:
+
+```
+******* pseudo-code ********
+for op in reversed(block.ops):
+ if op has an attribute named 'sub_block':
+ Get the sub-block(`s_block`) from op's attribute.
+ Create a new block(`grad_s_block`), whose father is `s_block`.
+ Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block`
+
+ Invoke `core.get_grad_op_desc()` to get op's grad_op.
+ Insert name correspondings between variables and their gradients of the grad_op to grad_to_var
+ Assign grad_s_block to grad_op as it's 'sub_block' attribute.
+ Append grad_op to current target_block.
+```
+
+The first invoking of `_append_backward_ops_()` is initiated by `append_backward()`, in which parameters `block` and `target_block` are all assigned with root block(the block with index 0).
+
+### Corner Cases of `grad_op` Creating
+
+In the previous section, we showed the regular process of `grad_op` creation. However, in some corner cases, the conventional algorithm is not enough to get the correct result and additional handling is required. These additional processes run after the algorithm mentioned above and make some special adjustments to its output `grad_op`s.
+
+#### Shared Variables
+
+If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op` in the backward pass. To make the gradient result be the sum of all `grad_op`s' outputs instead of the last one that runs, we assign each output a temporary variable and then add a `sum_op` to add them up.
+
+For debugging convenience, if the final gradient name is `w@GRAD`, its corresponding temporary variables will be named `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`...
+
+See function `_addup_repetitive_outputs_` in `backward.py` for implementation details.
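+
+A small numerical illustration (plain Python, not Paddle code) of why the duplicated gradients must be summed: if w feeds two forward ops whose results are added into the loss, dloss/dw is the sum of the two per-use gradients.
+
+```python
+w = 3.0
+loss = lambda v: 2.0 * v + v ** 2       # op1(w) = 2*w, op2(w) = w**2, loss = op1 + op2
+analytic = 2.0 + 2.0 * w                # d(op1)/dw + d(op2)/dw
+
+eps = 1e-6
+numeric = (loss(w + eps) - loss(w - eps)) / (2 * eps)
+assert abs(numeric - analytic) < 1e-4
+```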
+
+#### No Gradient Variables
+
+In our framework, variables can be marked as *no_gradient*, which means that the gradient of this variable is unnecessary and can be considered as zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in the backward pass.
+
+Another situation is when all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered as zeros. Since `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all gradient inputs are zeros. Therefore the `grad_op` can also be skipped.
+
+It should be noted that all these zero gradients still need to be created and initialized by something, otherwise the following `grad_op`s that take these gradients as inputs risk using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros.
+
+These features are implemented in the function `_remove_no_grad_branch_`. It checks newly created `grad_op`s one by one, removes the ones that can be skipped and inserts `fill_zeros_like_op` when necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute (True or False).
+
+### Creating Backward Variables
+
+Up to now, we have completed all creating and adjusting jobs of `grad_op`s. However, backward variables have not been created. Now they are only represented by `grad_op`'s input and output arguments. The backward variable creating job will be done by:
+
+```python
+def _append_backward_vars_(block,
+ start_op_idx,
+ grad_to_var,
+ grad_info_map):
+ """
+ Create new variables required by backward pass.
+
+ Args:
+ block(Block): the block where new variables will be created
+ start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
+ grad_to_var(dict):
+ key(str): grad variable name
+ val(str): corresponding forward variable name
+ In most cases, this dict is generated by _append_backward_ops_()
+ grad_info_map(dict)(output argument):
+ key(str): forward variable name
+ val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
+ """
+```
+
+Given a `block`, this function traverses all the `grad_op`s in it (the argument `start_op_idx` indicates where the grad_op sequence starts) and creates all the uncreated outputs. The *pseudo-code* below shows this process:
+
+```
+for op in block.ops[start_op_idx : ]:
+
+ if op has an attribute named 'sub_block':
+ Get the sub-block(`s_block`) from op's attribute.
+ Invoke _append_backward_vars_(), with `block=s_block`
+
+ for var_name in op.all_output_names():
+ if block.has_var_recursive(var_name) or var_name is the name of empty variable:
+ continue
+ create a new variable named 'var_name' in block
+ if grad_to_var.has_key(var_name):
+            set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name, block)
+
+ do op's var type inference
+ do op's shape inference
+```
diff --git a/paddle/operators/batch_norm_op.md b/doc/fluid/design/modules/batch_norm_op.md
similarity index 98%
rename from paddle/operators/batch_norm_op.md
rename to doc/fluid/design/modules/batch_norm_op.md
index 80948adf2b..d1392619c4 100644
--- a/paddle/operators/batch_norm_op.md
+++ b/doc/fluid/design/modules/batch_norm_op.md
@@ -66,7 +66,7 @@ As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attribu
The following graph showes the training computational process of `batch_norm_op`:
-
+
cudnn provides APIs to finish the whole series of computation, we can use them in our GPU kernel.
@@ -124,7 +124,7 @@ for pass_id in range(PASS_NUM):
`is_infer` is an attribute. Once an operator is created, its attributes can not be changed. It suggests us that we shall maintain two `batch_norm_op` in the model, one's `is_infer` is `True`(we call it `infer_batch_norm_op`) and the other one's is `False`(we call it `train_batch_norm_op`). They share all parameters and variables, but be placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one go through `train_batch_norm_op` and the other one go through `infer_batch_norm_op`:
-
+
Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate.
diff --git a/doc/design/evaluator.md b/doc/fluid/design/modules/evaluator.md
similarity index 100%
rename from doc/design/evaluator.md
rename to doc/fluid/design/modules/evaluator.md
diff --git a/paddle/operators/images/batch_norm_fork.dot b/doc/fluid/design/modules/images/batch_norm_fork.dot
similarity index 100%
rename from paddle/operators/images/batch_norm_fork.dot
rename to doc/fluid/design/modules/images/batch_norm_fork.dot
diff --git a/paddle/operators/images/batch_norm_fork.png b/doc/fluid/design/modules/images/batch_norm_fork.png
similarity index 100%
rename from paddle/operators/images/batch_norm_fork.png
rename to doc/fluid/design/modules/images/batch_norm_fork.png
diff --git a/paddle/operators/images/batch_norm_op_kernel.png b/doc/fluid/design/modules/images/batch_norm_op_kernel.png
similarity index 100%
rename from paddle/operators/images/batch_norm_op_kernel.png
rename to doc/fluid/design/modules/images/batch_norm_op_kernel.png
diff --git a/doc/design/images/feed_forward.png b/doc/fluid/design/modules/images/feed_forward.png
similarity index 100%
rename from doc/design/images/feed_forward.png
rename to doc/fluid/design/modules/images/feed_forward.png
diff --git a/doc/design/images/feed_forward_regularized.png b/doc/fluid/design/modules/images/feed_forward_regularized.png
similarity index 100%
rename from doc/design/images/feed_forward_regularized.png
rename to doc/fluid/design/modules/images/feed_forward_regularized.png
diff --git a/doc/design/images/l1_regularization.png b/doc/fluid/design/modules/images/l1_regularization.png
similarity index 100%
rename from doc/design/images/l1_regularization.png
rename to doc/fluid/design/modules/images/l1_regularization.png
diff --git a/doc/design/images/l2_regularization.png b/doc/fluid/design/modules/images/l2_regularization.png
similarity index 100%
rename from doc/design/images/l2_regularization.png
rename to doc/fluid/design/modules/images/l2_regularization.png
diff --git a/doc/design/images/loss_equation.png b/doc/fluid/design/modules/images/loss_equation.png
similarity index 100%
rename from doc/design/images/loss_equation.png
rename to doc/fluid/design/modules/images/loss_equation.png
diff --git a/doc/design/infer_var_type.md b/doc/fluid/design/modules/infer_var_type.md
similarity index 100%
rename from doc/design/infer_var_type.md
rename to doc/fluid/design/modules/infer_var_type.md
diff --git a/paddle/operators/net_op_design.md b/doc/fluid/design/modules/net_op_design.md
similarity index 100%
rename from paddle/operators/net_op_design.md
rename to doc/fluid/design/modules/net_op_design.md
diff --git a/doc/design/optimizer.md b/doc/fluid/design/modules/optimizer.md
similarity index 97%
rename from doc/design/optimizer.md
rename to doc/fluid/design/modules/optimizer.md
index 202b4b6510..691081c268 100644
--- a/doc/design/optimizer.md
+++ b/doc/fluid/design/modules/optimizer.md
@@ -79,7 +79,7 @@ class Optimizer(object):
def minimize(self, loss, parameter_list):
"""Add operations to minimize `loss` by updating `parameter_list`.
- This method combines interface `append_backward_ops()` and
+ This method combines interface `append_backward()` and
`create_optimization_pass()` into one.
"""
params_grads = self.create_backward_pass(loss, parameter_list)
diff --git a/doc/design/prune.md b/doc/fluid/design/modules/prune.md
similarity index 100%
rename from doc/design/prune.md
rename to doc/fluid/design/modules/prune.md
diff --git a/doc/design/python_api.md b/doc/fluid/design/modules/python_api.md
similarity index 92%
rename from doc/design/python_api.md
rename to doc/fluid/design/modules/python_api.md
index cb5fdc765b..73f6d7b90c 100644
--- a/doc/design/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
@@ -279,6 +279,26 @@ class LayerHelper(object):
return tmp
```
+### Return value of layer functions
+
+The layer will return a Variable, which is also the output of an operator. However, outputs of a layer function have more attributes than an operator: there are parameter variables, and their gradient variables, that need to be returned. Returning them is useful. For example,
+
+1. Users can debug the network by printing parameter gradients.
+2. Users can append attributes to a parameter, such as `param.stop_gradient=True`, which will stop the parameter from generating a gradient. We can fix the parameter value during training by using this attribute.
+
+However, it is good to return a Variable for layers, since all layers and operators use Variables as their parameters. We can just append a `param` field and a `grad` field to the returned Variable of a layer function, since Python is dynamically typed.
+
+The sample usage is
+
+```python
+data = fluid.layers.data(...)
+hidden = fluid.layers.fc(data, ...)
+...
+
+executor.run(fetch_list=[hidden.param, hidden.param.grad], ...)
+```
+
+
## Optimizer
[Optimizer Design Doc](./optimizer.md)
diff --git a/doc/design/register_grad_op.md b/doc/fluid/design/modules/register_grad_op.md
similarity index 100%
rename from doc/design/register_grad_op.md
rename to doc/fluid/design/modules/register_grad_op.md
diff --git a/doc/design/regularization.md b/doc/fluid/design/modules/regularization.md
similarity index 100%
rename from doc/design/regularization.md
rename to doc/fluid/design/modules/regularization.md
diff --git a/doc/design/selected_rows.md b/doc/fluid/design/modules/selected_rows.md
similarity index 100%
rename from doc/design/selected_rows.md
rename to doc/fluid/design/modules/selected_rows.md
diff --git a/doc/design/api.md b/doc/fluid/design/motivation/api.md
similarity index 100%
rename from doc/design/api.md
rename to doc/fluid/design/motivation/api.md
diff --git a/doc/fluid/design/motivation/fluid-compiler.graffle b/doc/fluid/design/motivation/fluid-compiler.graffle
new file mode 100644
index 0000000000..c933df2cb8
Binary files /dev/null and b/doc/fluid/design/motivation/fluid-compiler.graffle differ
diff --git a/doc/fluid/design/motivation/fluid-compiler.png b/doc/fluid/design/motivation/fluid-compiler.png
new file mode 100644
index 0000000000..1b0ffed203
Binary files /dev/null and b/doc/fluid/design/motivation/fluid-compiler.png differ
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
new file mode 100644
index 0000000000..110b7d78bf
--- /dev/null
+++ b/doc/fluid/design/motivation/fluid.md
@@ -0,0 +1,114 @@
+# Design Doc: PaddlePaddle Fluid
+
+## Why Fluid
+
+When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe. However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework?
+
+Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution, which describes the "process" of training or inference using the concept of a model. In fact in PyTorch, TensorFlow Eager Execution and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid is currently more extreme in the above mentioned idea than PyTorch and Eager Execution, and we are trying to push Fluid towards the directions of a compiler and a new programming language for deep learning.
+
+## The Evolution of Deep Learning Systems
+
+Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
+
+| Existed since | model as sequence of layers | model as graph of operators | No model |
+|--|--|--|--|
+| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
+| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
+| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+
+From the above table, we see that deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms*, or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
+
+## Deep Learning Programming Paradigms
+
+With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
+
+```python
+x = layer.data("image")
+l = layer.data("label")
+f = layer.fc(x, W)
+s = layer.softmax(f)
+c = layer.mse(l, s)
+
+for i in xrange(1000): # train for 1000 iterations
+ m = read_minibatch()
+ forward({input=x, data=m}, minimize=c)
+ backward(...)
+
+print W # print the trained model parameters.
+```
+
+The above program includes two parts:
+
+1. The first part describes the model, and
+2. The second part describes the training process (or inference process) for the model.
+
+This paradigm has a well-known problem that limits the productivity of programmers. If the programmer makes a mistake in configuring the model, the error message won't show up until the second part is executed and the `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that lies far away from the actual error prompt.
+
+This problem of being hard to debug and to iterate on a program quickly is the primary reason that programmers, in general, prefer PyTorch over the older systems. Using PyTorch, we would write the above program as follows:
+
+```python
+W = tensor(...)
+
+for i in xrange(1000): # train for 1000 iterations
+ m = read_minibatch()
+ x = m["image"]
+ l = m["label"]
+ f = layer.fc(x, W)
+ s = layer.softmax(f)
+ c = layer.mse(l, s)
+ backward()
+
+print W # print the trained model parameters.
+```
+
+We can see that the main difference is moving the model configuration part (the first part) into the training loop. This change allows mistakes in the model configuration to be reported where they actually appear in the program. This change also represents the model, or its forward pass, better by keeping the configuration process in the training loop.
+
+## Describe Arbitrary Models for the Future
+
+Describing the process instead of the model also gives Fluid the flexibility to define non-standard models that haven't been invented yet.
+
+As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator. A PyTorch example would look like the following:
+
+```python
+for i in xrange(1000):
+ m = read_minibatch()
+ x = m["sentence"]
+  for t in xrange(x.len()):
+    h[t] = the_step(x[t])
+```
+
+With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following:
+
+```python
+train_loop = layers.While(cond)
+with train_loop.block():
+ m = read_minibatch()
+ x = m["sentence"]
+ rnn = layers.While(...)
+ with rnn.block():
+ h[t] = the_step(input[t])
+```
+
+An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/bde090a97564b9c61a6aaa38b72ccc4889d102d9/python/paddle/fluid/tests/unittests/test_while_op.py#L50-L58).
+
+From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
+
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
+
+## Turing Completeness
+
+In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine. A programming language is Turing complete if it provides if-then-else and loop constructs. From the above examples, Fluid seems to be Turing complete; however, it is worth noting that there is a slight difference between the `if-then-else` of Fluid and that of a programming language: the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. It has not been researched in depth whether this is equivalent to the `if-then-else` in programming languages that makes them Turing complete. Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case, but this needs to be looked into in depth.
+
+## The Execution of a Fluid Program
+
+There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
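+
+The following is a minimal sketch of the interpreter-style execution from the Python side. The layer functions `fluid.layers.data` and `fluid.layers.fc` appear elsewhere in this document; the names `paddle.fluid`, `fluid.CPUPlace`, `fluid.Executor`, `fluid.default_startup_program`, and `fluid.default_main_program` are assumptions about the Python package layout:
+
+```python
+import numpy as np
+import paddle.fluid as fluid
+
+# Building the program only constructs the ProgramDesc; nothing runs yet.
+x = fluid.layers.data(name="x", shape=[13], dtype="float32")
+y = fluid.layers.fc(input=x, size=1, act=None)
+
+# The Executor interprets the ProgramDesc, much like an interpreter runs a script.
+exe = fluid.Executor(fluid.CPUPlace())
+exe.run(fluid.default_startup_program())
+out, = exe.run(fluid.default_main_program(),
+               feed={"x": np.random.rand(1, 13).astype("float32")},
+               fetch_list=[y])
+```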
+
+Fluid is moving towards the direction of a compiler, which is explained in [fluid_compiler.md](fluid_compiler.md).
+
+## Backward Compatibility of Fluid
+
+Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so that it would be easier for them to support multiple frameworks all at once and to run a trained model during inference. For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph). Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators. The well-known [ONNX](https://github.com/onnx/onnx) is also a file format for graphs of operators.
+
+For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
diff --git a/doc/fluid/design/motivation/fluid_compiler.md b/doc/fluid/design/motivation/fluid_compiler.md
new file mode 100644
index 0000000000..2a6beafc52
--- /dev/null
+++ b/doc/fluid/design/motivation/fluid_compiler.md
@@ -0,0 +1,110 @@
+# PaddlePaddle Fluid: Towards a Compiled Programming Language
+
+As described in [fluid.md](fluid.md), when a Fluid application program
+runs, it generates a `ProgramDesc` protobuf message as an intermediate
+representation of itself. The C++ class `Executor` can run this
+protobuf message as an interpreter. This article describes the Fluid
+compiler.
+
+
+
+## ProgramDesc
+
+Before we go deeper into the idea of compiled language, let us take a
+look at a simple example Fluid application.
+
+```python
+import "fluid"
+
+func paddlepaddle() {
+ X = fluid.read(...)
+ W = fluid.Tensor(...)
+ Y = fluid.mult(X, W)
+}
+```
+
+This program consists of a [block](block.md) of three operators --
+`read`, `assign`, and `mult`. Its `ProgramDesc` message looks like
+the following
+
+```protobuf
+message ProgramDesc {
+ block[0] = Block {
+ vars = [X, W, Y],
+ ops = [
+ read(output = X)
+ assign(input = ..., output = W)
+ mult(input = {X, W}, output = Y)
+ ],
+ }
+}
+```
+
+## Transpilers
+
+We can write a transpiler program that takes a `ProgramDesc`, e.g.,
+the above one, and outputs another `ProgramDesc`. Let us take some
+examples:
+
+1. *Memory optimization transpiler*: We can write a transpiler that
+   inserts some `FreeMemoryOp`s in the above example `ProgramDesc` so
+   that memory is freed early, before the end of an iteration, keeping
+   the memory footprint small.
+
+1. *Distributed training transpiler*: We can write a transpiler that
+   converts a `ProgramDesc` into its distributed version of two
+   `ProgramDesc`s -- one to be run by the trainer processes and the
+   other by the parameter server.
+
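+As a toy illustration of the transpiler idea, the sketch below rewrites a dictionary-based stand-in for `ProgramDesc` (the real transpilers rewrite the protobuf message itself), inserting a hypothetical `free_memory` op after the last use of each variable:
+
+```python
+import copy
+
+def memory_optimization_transpile(program_desc):
+    """Toy transpiler: insert a hypothetical `free_memory` op after the
+    last use of every input variable in a single-block toy program."""
+    out = copy.deepcopy(program_desc)
+    ops = out["blocks"][0]["ops"]
+    last_use = {}
+    for i, op in enumerate(ops):
+        for var in op["inputs"]:
+            last_use[var] = i
+    # Insert from the back so earlier insertion points stay valid.
+    for var, i in sorted(last_use.items(), key=lambda kv: -kv[1]):
+        ops.insert(i + 1, {"type": "free_memory", "inputs": [var], "outputs": []})
+    return out
+
+toy_program = {
+    "blocks": [{
+        "vars": ["X", "W", "Y"],
+        "ops": [
+            {"type": "read",   "inputs": [],         "outputs": ["X"]},
+            {"type": "assign", "inputs": [],         "outputs": ["W"]},
+            {"type": "mult",   "inputs": ["X", "W"], "outputs": ["Y"]},
+        ],
+    }]
+}
+
+print(memory_optimization_transpile(toy_program))
+```
+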
+In the rest of this article, we talk about a special kind of
+transpiler, *Native code generator*, which takes a `ProgramDesc` and
+generates a `.cu` (or `.cc`) file, which could be built by C++
+compilers (gcc, nvcc, icc) into binaries.
+
+## Native Code Generator
+
+For the above example, the native code generator transpiler, say, the
+CUDA code generator, should generate a `main` function:
+
+```c++
+int main() {
+ auto X = fluid_cuda_read(...);
+ auto W = fluid_cuda_create_tensor(...);
+ auto Y = fluid_cuda_mult(X, W);
+}
+```
+
+and the definitions of functions `fluid_cuda_read`,
+`fluid_cuda_create_tensor`, and `fluid_cuda_mult`. Please be aware
+that each function could just define a C++ instance of an operator and
+run it. For example
+
+```c++
+paddle::Tensor fluid_cuda_read(...) {
+ paddle::Tensor t;
+  paddle::operators::Read r(&t, ...);
+ r.Run();
+ return t;
+}
+```
+
+For computational operators that have multiple *kernels*, each for a
+specific hardware platform, for example, the `mult` operator, the
+generated code should call its CUDA kernel:
+
+```c++
+paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a,
+ const paddle::Tensor& b) {
+ paddle::Tensor t;
+  paddle::operators::Mult m(a, b, ...);
+  m.Run(cuda_context);
+  return t;
+}
+```
+
+where `cuda_context` could be a global variable of type
+`paddle::CUDADeviceContext`.
+
+## Multi-Block Code Generation
+
+Most Fluid application programs may have more than one block. To
+execute them, we need to trace [scopes](scope.md).
diff --git a/doc/design/refactorization.md b/doc/fluid/design/motivation/refactorization.md
similarity index 100%
rename from doc/design/refactorization.md
rename to doc/fluid/design/motivation/refactorization.md
diff --git a/doc/fluid/design/muti_devices/kernel_hint_design.md b/doc/fluid/design/muti_devices/kernel_hint_design.md
new file mode 100644
index 0000000000..a54b7da045
--- /dev/null
+++ b/doc/fluid/design/muti_devices/kernel_hint_design.md
@@ -0,0 +1,57 @@
+## Problem
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have a personal preference for a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel. We need to provide a way for users to do this.
+
+In the current design, we use KernelType to describe one kernel.
+
+```cpp
+struct KernelType {
+ Place place_;
+ DataType data_type_;
+ LayoutType layout_;
+};
+```
+`place_`, `data_type_`, and `layout_` can be obtained from the input tensors of the operator. `GetActualKernelType(inputs)` uses the inputs to infer the proper kernel key that fits the incoming data, but users cannot configure it directly.
+
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that users can overload to choose the KernelType they want to use.
+
+So we should send the user-defined information in the proto to `GetExpectedKernelType` for choosing a kernel.
+
+The problem is: how should we define and send the information for `GetExpectedKernelType` to use?
+
+## Solution
+
+### Potential choice
+1. Do nothing: let the user add the information they want to the operator's attributes and read it inside `GetExpectedKernelType`. This can work properly. But there is a small problem: users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, and `cpu_kernel` to choose a CPU kernel, and `use_cudnn`, `force_cudnn`, and `cudnn_kernel` to choose a CUDNN kernel.
+
+2. Pre-define all the needed options and use a single attr key such as `kernel_hint` for the user. This is not very flexible if the user wants to define more kinds of hints.
+
+### Final choice
+To provide enough flexibility while avoiding confusing definitions, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, and `use_mkldnn`, for a user to choose.
+
+In C++
+
+```cpp
+const std::string kForceCPU = "force_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
+KernelType GetExpectedKernelType() {
+ if (Attr(kForceCPU)) {
+ return KernelType(CPUPlace, ...)
+ } else {
+ ...
+ }
+}
+```
+
+In Python code
+
+```python
+FORCE_CPU = core.kForceCPU()
+
+def xx_layer(..., force_cpu=False):
+ layer_helper = LayerHelper(...)
+ layer_helper.append_op(
+ type="xx",
+ attr={FORCE_CPU: force_cpu})
+```
diff --git a/doc/fluid/design/muti_devices/kernel_selection.md b/doc/fluid/design/muti_devices/kernel_selection.md
new file mode 100644
index 0000000000..9719e031c7
--- /dev/null
+++ b/doc/fluid/design/muti_devices/kernel_selection.md
@@ -0,0 +1,99 @@
+## Background
+Every operator has many kernels because Fluid supports multiple data types, places, data layouts, and library types. We use `OpKernelType` to describe the kernel types that operators can hold.
+
+`OpKernelType` is as follows:
+
+```cpp
+struct OpKernelType {
+ Place place_;
+ DataType data_type_;
+ DataLayout data_layout_;
+ LibraryType library_type_;
+};
+```
+
+- The `place_` is a descriptor of the device, e.g., CPUPlace, CUDAPlace.
+
+- The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, there will be one major `data_type`. For example, `cross_entropy` takes `int64` as its label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float` or `double`.
+
+- The `data_layout_` is useful for some computational libraries. One example is that MKLDNN uses many kinds of layouts, such as `nChw8c`. Each kind of layout will invoke a different kernel.
+
+- The `library_type_` describes the computational library, e.g., `MKLDNN`, `CUDNN`.
+
+## Problem
+
+Ideally, we would register a kernel for every operator and every kernel type. However, this is impractical in the following situations.
+
+1. Some operators, like CRF, are complicated and inefficient to implement on GPU. The CRF operator will only have a CPU kernel.
+2. Some operators take too much memory. It is better to force them onto the CPU. However, the rest of the operators in the neural network will still be performed on GPU, i.e., the model parallelism problem.
+3. Some layouts and places are particular to one library. One example is that MKLDNN uses `nChw8c` and no other library uses `nChw8c`.
+
+Take one situation as a detailed explanation: suppose we have two Operators, OP1 and OP2. OP1 has one output `op1_2_op2`, and `op1_2_op2` is the input of OP2.
+
+If OP1 and OP2 run on the same place (for example, CPUPlace), then `op1_2_op2` can be used directly by OP2.
+
+```
+OP1(CPUPlace)
+ |
+ op1_2_op2
+ |
+OP2(CPUPlace)
+```
+
+If OP1 and OP2 run on different places, then OP2 cannot use `op1_2_op2` directly.
+
+Problems under these situations are similar. We can formalize the problem as follows.
+
+We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The input of this operator arrives with kernel type $kt_{?}$, where $kt_{?} \notin KT$. The question is how to cast the input of this operator from $kt_{?}$ to some kernel type in $KT$.
+
+## Solution: data transform
+
+It is clear that transforming the inputs of an operator to adapt to another kernel type is not related to any particular operator. So we should register these transformation methods as global methods.
+
+We can infer a kernel type for each input of an operator. We call this kernel type the `actual kernel type for var`, which means it is the kernel type that can process this input variable.
+
+We can get a kernel type from 1) the configuration in the operator description (users may want to force the `conv` operator to use `MKL`), and 2) the place of the current executor (the executor is running on GPU). This kernel type is what we expect the operator to be performed on. We call this kernel type the `expected kernel type`.
+
+We transform the input data from `actual` to `expected` if the actual kernel type is not the same as the expected kernel type.
+
+The algorithm is described as follows:
+
+```cpp
+void OperatorWithKernel::Run(
+ const Scope& scope,
+ const platform::Place& place) const {
+ ExecutionContext ctx(...);
+ auto expected_kernel_key = this->GetExpectedKernelType(ctx);
+
+ Scope& new_scope = scope.NewScope();
+
+ for (auto& var_name : this->Inputs()) {
+ auto* tensor_in = GetTensor(var_name);
+ auto kernel_type_for_var = this->GetKernelTypeForVar(...);
+ if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
+ auto* trans_var = new_scope.Var(var_name);
+ auto* out = DataTransform(expected_kernel_key,
+ kernel_type_for_var,
+ *tensor_in);
+ CopyVariableWithTensor(...);
+ }
+ }
+
+ auto kernel = kernels.find(expected_kernel_key);
+ kernel->Compute(ExecutionContext(...));
+}
+```
+
+Then the actual process for the multi-device case above will be:
+
+```
+OP1(CPUPlace)
+ |
+op1_2_op2(on CPU)
+ |
+[transform](from CPU to GPU)
+ |
+op1_2_op2(on GPU)
+ |
+OP2(CUDAPlace)
+```
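+
+The decision above can be modeled in a few lines of Python (a toy illustration, not the framework code): compare the kernel type inferred for each input variable with the operator's expected kernel type, and transform only when they differ.
+
+```python
+from collections import namedtuple
+
+# Toy stand-in for OpKernelType, keeping only the fields discussed above.
+KernelType = namedtuple("KernelType", ["place", "data_type", "layout", "library"])
+
+def prepare_inputs(expected, inputs):
+    """Return inputs transformed (when needed) to match the expected kernel type."""
+    prepared = {}
+    for name, (actual, tensor) in inputs.items():
+        if actual.place != expected.place:
+            # In the real framework this is DataTransform + CopyVariableWithTensor.
+            tensor = {"data": tensor["data"], "place": expected.place}
+        prepared[name] = tensor
+    return prepared
+
+cpu = KernelType("CPUPlace", "float32", "NCHW", "Plain")
+gpu = KernelType("CUDAPlace", "float32", "NCHW", "Plain")
+print(prepare_inputs(gpu, {"op1_2_op2": (cpu, {"data": [1.0, 2.0], "place": "CPUPlace"})}))
+```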
diff --git a/doc/fluid/design/muti_devices/operator_kernel_type.md b/doc/fluid/design/muti_devices/operator_kernel_type.md
new file mode 100644
index 0000000000..f86e6b7a56
--- /dev/null
+++ b/doc/fluid/design/muti_devices/operator_kernel_type.md
@@ -0,0 +1,91 @@
+# Design Doc: The Keys of Operator Kernel Type
+## Problem
+An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
+
+```cpp
+struct OpKernelType {
+ platform::Place place_;
+ proto::DataType data_type_;
+};
+```
+For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github.
+
+It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`.
+
+We often implement a kernel of an operator with some computing library on a certain device (place). Please note that computing libraries and devices do not have a one-to-one correspondence. A device can have many computing libraries, and a computing library can also support different devices.
+
+For example, the Eigen library supports Nvidia GPU/AMD GPU/CPU, and the MKLDNN library supports Intel CPU/Intel FPGA. Both `Place` and `Library` should be keys of `OpKernelType`.
+
+Different DataTypes, such as fp64/fp32/int8, will obviously have different kernels. But different data layout of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration.
+
+## Solution
+
+There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
+
+```cpp
+struct OpKernelType {
+ platform::Place place_;
+ platform::Library library_;
+ proto::DataType data_type_;
+ framework::Layout layout_;
+};
+```
+
+The details are as follows:
+
+### Place
+
+`Place` is defined as:
+
+```cpp
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+```
+
+`Place` represents the device memory where data is located.
+
+
+### Library
+
+One operator kernel is usually implemented based on one library. `Library` is defined as an enum variable:
+
+```cpp
+enum Library { Plain, MKLDNN, CUDNN };
+```
+
+We use the `Plain` enumerator to represent the default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take the `Eigen` library as the `Plain` enumerator.
+A library usually has a corresponding `DeviceContext` which contains the handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle, and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle.
+
+If we want to support a new library, a new enumerator needs to be added to `Library` and a corresponding new `LibraryDeviceContext` needs to be created.
+
+
+### DataType
+
+
+`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
+
+### Layout
+
+Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape (ddim), stride, and layout.
+
+Different layouts lead to different implementations of the operator kernel. There are mainly 4 principles we have to follow to support layout in our Fluid framework.
+
+- We take layout as a data member of Tensor. Layout is actually an enum variable. If Fluid is built with MKLDNN, then the memory formats in MKLDNN will also be added into this enum variable.
+
+- Users have to set the layout for input data. And some operators, like fill_constant/random, also have to set the layout for the data they generate. Of course, we can have a default layout, like NCHW.
+
+- The inference of Layout is at run-time, not at compile-time.
+
+- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators.
+
+`Layout` is also defined as an enum variable:
+
+```cpp
+enum Layout {
+ kNCHW,
+ kNHWC,
+#ifdef PADDLE_WITH_MKLDNN
+ knChw8c
+ ...
+#endif
+};
+```
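+
+To make the role of the four keys concrete, here is a small illustrative sketch in Python (not the C++ implementation) of an operator's kernel map keyed by the tuple (place, library, data_type, layout):
+
+```python
+from collections import namedtuple
+
+# Illustrative stand-in for the C++ OpKernelType described above.
+OpKernelType = namedtuple("OpKernelType", ["place", "library", "data_type", "layout"])
+
+# Each operator keeps a map from kernel type to kernel implementation.
+conv2d_kernels = {
+    OpKernelType("CPUPlace", "Plain", "fp32", "kNCHW"): lambda inputs: "plain fp32 conv on CPU",
+    OpKernelType("CUDAPlace", "CUDNN", "fp32", "kNCHW"): lambda inputs: "cudnn fp32 conv on GPU",
+}
+
+key = OpKernelType("CPUPlace", "Plain", "fp32", "kNCHW")
+print(conv2d_kernels[key](None))  # selects the plain CPU fp32 kernel
+```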
diff --git a/doc/design/speech/README.MD b/doc/fluid/design/network/deep_speech_2.md
similarity index 84%
rename from doc/design/speech/README.MD
rename to doc/fluid/design/network/deep_speech_2.md
index 7304650e62..af0c6ef36f 100644
--- a/doc/design/speech/README.MD
+++ b/doc/fluid/design/network/deep_speech_2.md
@@ -94,7 +94,7 @@ The classical DS2 network contains 15 layers (from bottom to top):
- **One** CTC-loss layer
-
+
Figure 1. Archetecture of Deep Speech 2 Network.
@@ -140,7 +140,19 @@ TODO by Assignees
### Beam Search with CTC and LM
-TODO by Assignees
+
+
+Figure 2. Algorithm for CTC Beam Search Decoder.
+
+
+- The **Beam Search Decoder** for the DS2 CTC-trained network follows a similar approach to \[[3](#references)\], as shown in Figure 2, with two important modifications for the ambiguous parts:
+  - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, because one prefix may come from different paths;
+  - 2) the if condition ```if l^+ not in A_prev then``` after the probabilities' computation is dropped, because it is hard to understand and seems unnecessary.
+- An **external scorer** is passed into the decoder to evaluate a candidate prefix during decoding, whenever a whitespace character is appended in English decoding or any character is appended in Mandarin decoding.
+- Such an external scorer consists of a language model, a word count, or any other custom scorers.
+- The **language model** is built from Task 5, and its parameters should be carefully tuned to achieve a minimum WER/CER (c.f. Task 7).
+- This decoder needs to run with **high efficiency** for the convenience of parameter tuning and real-world speech recognition.
+
## Future Work
@@ -153,3 +165,4 @@ TODO by Assignees
1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595.
+3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
diff --git a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg
similarity index 100%
rename from doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
rename to doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg
diff --git a/doc/fluid/design/network/images/beam_search.png b/doc/fluid/design/network/images/beam_search.png
new file mode 100644
index 0000000000..7f7e35f342
Binary files /dev/null and b/doc/fluid/design/network/images/beam_search.png differ
diff --git a/doc/design/speech/image/ds2_network.png b/doc/fluid/design/network/images/ds2_network.png
similarity index 100%
rename from doc/design/speech/image/ds2_network.png
rename to doc/fluid/design/network/images/ds2_network.png
diff --git a/doc/design/ops/sequence_decoder.md b/doc/fluid/design/network/sequence_decoder.md
similarity index 98%
rename from doc/design/ops/sequence_decoder.md
rename to doc/fluid/design/network/sequence_decoder.md
index 9db5fb8e9a..c4a9bbeeef 100644
--- a/doc/design/ops/sequence_decoder.md
+++ b/doc/fluid/design/network/sequence_decoder.md
@@ -22,7 +22,7 @@ The current `LoDTensor` is designed to store levels of variable-length sequences
The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
let's call this format the **absolute-offset LoD** for clarity.
-The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
+The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
```python
[[0, 3, 9]
[0, 2, 3, 3, 3, 9]]
@@ -119,7 +119,7 @@ def generate():
encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
decoder_input = pd.fc(
act=pd.activation.Linear(),
- input=[target_word, encoder_ctx],
+ input=[target_word, encoder_ctx_expanded],
size=3 * decoder_dim)
gru_out, cur_mem = pd.gru_step(
decoder_input, mem=decoder_mem, size=decoder_dim)
diff --git a/doc/fluid/design/others/auto_gradient_check.md b/doc/fluid/design/others/auto_gradient_check.md
new file mode 100644
index 0000000000..773b7b6a76
--- /dev/null
+++ b/doc/fluid/design/others/auto_gradient_check.md
@@ -0,0 +1,150 @@
+## Auto Gradient Check Design
+
+## Background:
+- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges:
+  1. The backpropagation formula should be correct according to the forward computation.
+  2. The implementation of the above should be correct in C++.
+  3. It is difficult to prepare unbiased test data.
+
+- Auto gradient checking gets a numerical gradient using the forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
+  1. The numerical gradient checker only needs the forward operator.
+  2. The user only needs to prepare the input data for the forward Operator and need not worry about the backward Operator.
+
+## Mathematical Theory
+The following documents from Stanford have a detailed explanation of how to compute the numerical gradient and why it is useful.
+
+- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
+- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
+
+
+## Numerical Gradient Implementation
+### Python Interface
+```python
+def get_numerical_gradient(op,
+ input_values,
+ output_name,
+ input_to_check,
+ delta=0.005,
+ local_scope=None):
+ """
+ Get Numerical Gradient for the input of an operator.
+
+    :param op: C++ operator instance, could be a network.
+    :param input_values: The input variables. Should be a dictionary whose key is
+        the variable name and whose value is a numpy array.
+ :param output_name: The final output variable name.
+ :param input_to_check: The input variable with respect to which the gradient has to be computed.
+ :param delta: The perturbation value for numerical gradient method. The
+ smaller the delta, the more accurate the result. But if the delta is too
+ small, it will suffer from the numerical stability problem.
+ :param local_scope: The local scope used for get_numeric_gradient.
+ :return: The gradient array in numpy format.
+ """
+```
+
+### Explanation:
+
+- Why do we need an `output_name`?
+  - An Operator may have multiple Outputs, and one can compute an independent gradient from each Output. So the caller should specify the name of the output variable.
+
+- Why do we need `input_to_check`?
+  - One operator can have multiple inputs. The gradient Op can calculate the gradients of these inputs at the same time, but the numerical gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need gradients for multiple inputs, you can call `get_numeric_gradient` multiple times, each with a different input.
+
+
+### Core Algorithm Implementation
+
+
+```python
+ # we only compute the gradient of one element a time.
+ # we use a for loop to compute the gradient of each element.
+ for i in xrange(tensor_size):
+ # get one input element using the index i.
+ original = tensor_to_check.get_float_element(i)
+
+ # add delta to it, run the forward op and then
+ # get the new value of the result tensor.
+ x_pos = original + delta
+ tensor_to_check.set_float_element(i, x_pos)
+ y_pos = get_output()
+
+ # Subtract delta from this element, run the op again
+ # and get the new value of the result tensor.
+ x_neg = original - delta
+ tensor_to_check.set_float_element(i, x_neg)
+ y_neg = get_output()
+
+ # restore old value
+ tensor_to_check.set_float_element(i, original)
+
+ # compute the gradient of this element and store
+ # it into a numpy array.
+ gradient_flat[i] = (y_pos - y_neg) / delta / 2
+
+ # reshape the gradient result to the shape of the source tensor.
+ return gradient_flat.reshape(tensor_to_check.get_dims())
+```
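+
+As a self-contained illustration of the central-difference scheme above (plain numpy, not the framework code), the following checks the analytic gradient of a sigmoid-based loss against its numerical gradient:
+
+```python
+import numpy as np
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-x))
+
+def numerical_gradient(f, x, delta=0.005):
+    """Central-difference gradient of a scalar-valued function f at x."""
+    grad = np.zeros_like(x)
+    flat_x, flat_grad = x.reshape(-1), grad.reshape(-1)
+    for i in range(flat_x.size):
+        original = flat_x[i]
+        flat_x[i] = original + delta
+        y_pos = f(x)
+        flat_x[i] = original - delta
+        y_neg = f(x)
+        flat_x[i] = original  # restore the element
+        flat_grad[i] = (y_pos - y_neg) / delta / 2
+    return grad
+
+x = np.random.rand(3)
+loss = lambda v: sigmoid(v).sum()
+analytic = sigmoid(x) * (1.0 - sigmoid(x))   # d(sum(sigmoid(x)))/dx
+numeric = numerical_gradient(loss, x)
+print(np.max(np.abs(analytic - numeric)))    # should be close to zero
+```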
+
+## Auto Gradient Check Framework
+
+Each Operator Kernel has three kinds of Gradient:
+
+1. Numerical gradient
+2. CPU kernel gradient
+3. GPU kernel gradient (if supported by the device)
+
+The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps:
+
+1. Calculate the numerical gradient
+2. Calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient.
+3. Calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient. (if supported)
+
+#### Python Interface
+
+```python
+ def check_grad(self,
+ forward_op,
+ input_vars,
+ inputs_to_check,
+ output_name,
+ no_grad_set=None,
+ only_cpu=False,
+ max_relative_error=0.005):
+ """
+ :param forward_op: used to create backward_op
+ :param input_vars: numpy value of input variable. The following
+ computation will use these variables.
+ :param inputs_to_check: the input variable with respect to which the
+ gradient will be computed.
+ :param output_name: The final output variable name.
+ :param max_relative_error: The relative tolerance parameter.
+ :param no_grad_set: used to create backward ops
+ :param only_cpu: only compute and check gradient on cpu kernel.
+ :return:
+ """
+```
+
+### How to check if two numpy arrays are close enough?
+If `abs_numerical_grad` is nearly zero, then we use the absolute error for the numerical gradient instead of the relative error.
+
+```python
+numerical_grad = ...
+operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
+
+abs_numerical_grad = numpy.abs(numerical_grad)
+# if abs_numerical_grad is nearly zero, then use abs error for
+# numeric_grad, instead of relative error.
+abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
+
+diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad
+max_diff = numpy.max(diff_mat)
+```
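+
+The final acceptance check (sketched below, using `max_relative_error` from the `check_grad` interface above) simply compares `max_diff` against the tolerance:
+
+```python
+# Sketch of the final comparison; max_relative_error comes from check_grad.
+if max_diff > max_relative_error:
+    raise AssertionError("gradient check failed: max diff %f exceeds tolerance %f"
+                         % (max_diff, max_relative_error))
+```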
+
+
+#### Notes:
+The input data for the auto gradient checker should be reasonable in order to avoid numerical stability problems.
+
+
+#### References:
+
+- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
+- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
diff --git a/doc/design/dcgan.png b/doc/fluid/design/others/dcgan.png
similarity index 100%
rename from doc/design/dcgan.png
rename to doc/fluid/design/others/dcgan.png
diff --git a/doc/design/gan_api.md b/doc/fluid/design/others/gan_api.md
similarity index 100%
rename from doc/design/gan_api.md
rename to doc/fluid/design/others/gan_api.md
diff --git a/doc/design/graph.md b/doc/fluid/design/others/graph.md
similarity index 100%
rename from doc/design/graph.md
rename to doc/fluid/design/others/graph.md
diff --git a/doc/design/graph_survey.md b/doc/fluid/design/others/graph_survey.md
similarity index 100%
rename from doc/design/graph_survey.md
rename to doc/fluid/design/others/graph_survey.md
diff --git a/doc/design/images/graph_construction_example.bash b/doc/fluid/design/others/images/graph_construction_example.bash
similarity index 100%
rename from doc/design/images/graph_construction_example.bash
rename to doc/fluid/design/others/images/graph_construction_example.bash
diff --git a/doc/design/images/graph_construction_example.dot b/doc/fluid/design/others/images/graph_construction_example.dot
similarity index 100%
rename from doc/design/images/graph_construction_example.dot
rename to doc/fluid/design/others/images/graph_construction_example.dot
diff --git a/doc/design/images/graph_construction_example_all.png b/doc/fluid/design/others/images/graph_construction_example_all.png
similarity index 100%
rename from doc/design/images/graph_construction_example_all.png
rename to doc/fluid/design/others/images/graph_construction_example_all.png
diff --git a/doc/design/images/graph_construction_example_forward_backward.png b/doc/fluid/design/others/images/graph_construction_example_forward_backward.png
similarity index 100%
rename from doc/design/images/graph_construction_example_forward_backward.png
rename to doc/fluid/design/others/images/graph_construction_example_forward_backward.png
diff --git a/doc/design/images/graph_construction_example_forward_only.png b/doc/fluid/design/others/images/graph_construction_example_forward_only.png
similarity index 100%
rename from doc/design/images/graph_construction_example_forward_only.png
rename to doc/fluid/design/others/images/graph_construction_example_forward_only.png
diff --git a/doc/design/parameters_in_cpp.md b/doc/fluid/design/others/parameters_in_cpp.md
similarity index 100%
rename from doc/design/parameters_in_cpp.md
rename to doc/fluid/design/others/parameters_in_cpp.md
diff --git a/doc/design/simple_op_design.md b/doc/fluid/design/others/simple_op_design.md
similarity index 100%
rename from doc/design/simple_op_design.md
rename to doc/fluid/design/others/simple_op_design.md
diff --git a/doc/design/test.dot b/doc/fluid/design/others/test.dot
similarity index 100%
rename from doc/design/test.dot
rename to doc/fluid/design/others/test.dot
diff --git a/doc/design/test.dot.png b/doc/fluid/design/others/test.dot.png
similarity index 100%
rename from doc/design/test.dot.png
rename to doc/fluid/design/others/test.dot.png
diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md
new file mode 100644
index 0000000000..5596b2653a
--- /dev/null
+++ b/doc/fluid/dev/api_doc_std_cn.md
@@ -0,0 +1,220 @@
+# API注释撰写标准
+
+- [API注释模块](#API注释模块)
+- [格式及示例](#格式及示例)
+- [完整示例](#完整示例)
+
+
+## API注释模块
+
+API文档须包含以下几个模块(排列顺序为文档撰写顺序):
+
+- Python API Definition
+
+ API的代码定义。
+
+- Function Description
+
+ API的功能描述。描述该API的含义、作用或对输入所做的操作,及参考文献和对应链接(如果有),必要时给出公式,并解释公式中关键变量的含义。
+
+- Args Description
+
+ API参数介绍。按代码定义中的参数顺序逐个介绍,介绍内容包含数据类型、默认值(如果有)、含义等。
+
+- Returns
+
+ API返回值介绍。介绍返回值含义,必要时给出对应的形状。若返回值为包含多个参数的tuple,则按顺序逐个介绍各参数。
+
+- Raises(如果有)
+
+ 可能抛出的异常或错误及可能的产生原因,当可能抛出多种异常或错误时应分条列出。
+
+- Note(如果有)
+
+ 注意事项。当有多条注意事项时,应分条列出。
+
+- Examples
+
+ API的使用示例。
+
+
+## 格式及示例
+
+API文档须使用reStructuredText格式撰写,该格式详情请参考[链接](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html)。API文档各模块的内容格式及示例如下(以下以fc为例进行说明):
+
+- Python API Definition
+
+ - 格式:
+
+ [Python API Definition]
+
+ - 示例
+
+ ```
+ fc(input,
+ size,
+ num_flatten_dims=1,
+ param_attr=None,
+ bias_attr=None,
+ act=None,
+ name=None,
+ main_program=None,
+ startup_program=None)
+ ```
+
+- Function Description
+
+ - 格式
+
+ 本模块应包含以下内容(排列顺序为文档撰写顺序):
+
+ [Function Description]
+
+ [Formula]
+
+ [Symbols' Descriptions if necessary]
+
+ [References if necessary]
+
+ - 示例
+
+ [Function Description]
+
+ ```
+ **Fully Connected Layer**
+
+ The fully connected layer can take multiple tensors as its inputs. It
+ creates a variable called weights for each input tensor, which represents
+ a fully connected weight matrix from each input unit to each output unit.
+    The fully connected layer multiplies each input tensor with its corresponding
+    weight to produce an output Tensor. If multiple input tensors are given,
+    the results of multiple multiplications will be summed up. If bias_attr is
+ not None, a bias variable will be created and added to the output. Finally,
+ if activation is not None, it will be applied to the output as well.
+ ```
+
+ [Formula]
+
+ ```
+ This process can be formulated as follows:
+
+ .. math::
+
+ Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+ ```
+
+ [Symbols' Descriptions if necessary]
+
+ ```
+ In the above equation:
+
+ * :math:`N`: Number of the input.
+ * :math:`X_i`: The input tensor.
+ * :math:`W`: The weights created by this layer.
+ * :math:`b`: The bias parameter created by this layer (if needed).
+ * :math:`Act`: The activation function.
+ * :math:`Out`: The output tensor.
+ ```
+
+ [References if necessary]
+
+ 因fc没有必要列出的参考文献,故该内容省略。其他情况下需明确给出对应的参考文献和对应连接,以 layer_norm 为例:
+
+ ```
+    Refer to `Layer Normalization <https://arxiv.org/abs/1607.06450>`_ for more details.
+ ```
+
+
+- Args Description
+
+ - 格式
+
+ \[Arg's Name\][(Data Type, Default Value)][Description]
+
+ - 示例
+
+ fc的部分参数注释如下:
+
+ ```
+ Args:
+ input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+ the input tensor(s) is at least 2.
+ param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+ parameters/weights of this layer.
+ name (str, default None): The name of this layer.
+ ```
+
+- Returns
+
+ - 格式
+
+ [Name][Shape]
+
+ - 示例
+
+ ```
+ Returns:
+ A tensor variable storing the transformation result.
+ ```
+
+ 当返回值为包含多个参数的tuple时,应按顺序逐个介绍各参数,以dynamic_lstm为例:
+
+ ```
+ Returns:
+ A tuple containing:
+ The hidden state of LSTM whose shape is (T X D).
+ The cell state of LSTM whose shape is (T X D).
+ ```
+
+- Raises
+
+ - 格式
+
+ [Exception Type][Condition]
+
+ - 示例
+
+ ```
+ Raises:
+ ValueError: If the rank of the input is less than 2.
+ ```
+
+- Note
+
+ - 格式
+
+ [Note]
+
+ - 示例
+
+ fc没有注意事项,故该模块省略不写。如有注意事项应明确给出,当有多条注意事项,须分条列出,以scaled\_dot\_product\_attention为例:
+
+ ```
+ Note:
+ 1. When num_heads > 1, three linear projections are learned respectively
+ to map input queries, keys and values into queries', keys' and values'.
+ queries', keys' and values' have the same shapes with queries, keys
+ and values.
+ 2. When num_heads == 1, scaled_dot_product_attention has no learnable
+ parameters.
+ ```
+
+- Examples
+
+ - 格式
+
+    \[Python Code Snippet]
+
+ - 示例
+
+ ```
+ Examples:
+ .. code-block:: python
+
+ data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+ fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+ ```
+
+## 完整示例
+
+fc 的完整注释见[示例](src/fc.py)。
diff --git a/doc/fluid/dev/ci_build_whl.png b/doc/fluid/dev/ci_build_whl.png
new file mode 100644
index 0000000000..232762b82a
Binary files /dev/null and b/doc/fluid/dev/ci_build_whl.png differ
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
new file mode 100644
index 0000000000..e1edf079fa
--- /dev/null
+++ b/doc/fluid/dev/index_cn.rst
@@ -0,0 +1,2 @@
+开发标准
+------------
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
new file mode 100644
index 0000000000..faf9dfcd31
--- /dev/null
+++ b/doc/fluid/dev/index_en.rst
@@ -0,0 +1,4 @@
+Development
+------------
+
+This is the Development page
diff --git a/paddle/operators/name_convention.md b/doc/fluid/dev/name_convention.md
similarity index 96%
rename from paddle/operators/name_convention.md
rename to doc/fluid/dev/name_convention.md
index b5cb176e00..a02b356f05 100644
--- a/paddle/operators/name_convention.md
+++ b/doc/fluid/dev/name_convention.md
@@ -35,8 +35,8 @@ Here we give some examples to show how these rules will be used.
```c++
class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker {
public:
- AccumulateOpMaker(framework::OpProto *proto,
- framework::OpAttrChecker *op_checker)
+ AccumulateOpMaker(OpProto *proto,
+ OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor.
If the output size is not the same as input size,
diff --git a/doc/howto/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
similarity index 98%
rename from doc/howto/dev/new_op_cn.md
rename to doc/fluid/dev/new_op_cn.md
index 757a5840bc..9299658567 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
@@ -24,7 +24,7 @@
- `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。
- `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
-依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorBase`,后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下:
+依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下:
内容 | 定义位置
@@ -53,7 +53,7 @@ Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU
```cpp
class MulOpMaker : public framework::OpProtoAndCheckerMaker {
public:
- MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+ MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor), 2D tensor of size (M x K)");
AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
@@ -82,7 +82,7 @@ The equation is: Out = X * Y
template
class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
- ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+ ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input tensor of scale operator.").NotInGradient();
AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
diff --git a/doc/howto/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md
similarity index 93%
rename from doc/howto/dev/new_op_en.md
rename to doc/fluid/dev/new_op_en.md
index fe86936bc1..da8b1bdd10 100644
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/fluid/dev/new_op_en.md
@@ -4,7 +4,8 @@
- [Implementing C++ Types](#implementing-c-types)
- [Defining ProtoMaker](#defining-protomaker)
- [Defining Operator](#defining-operator)
- - [Registering Operator](#registering-operator)
+ - [Defining OpKernel](#defining-opkernel)
+ - [Registering Operator and OpKernel](#registering-operator-and-opkernel)
- [Compilation](#compilation)
- [Python Binding](#python-binding)
- [Unit Tests](#unit-tests)
@@ -16,12 +17,13 @@
Here are the base types needed. For details, please refer to the design docs.
-- `framework::OperatorBase`: Operator (Op)base class.
-- `framework::OpKernel`: Base class for Op computation.
-- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation.
- `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API.
+- `framework::OperatorBase`: Operator (Op)base class.
+- `framework::OpKernel`: Base class for Op computation kernel.
+- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation kernels.
+
-An operator can be differentiated by whether in has kernel methods. An operator with kernel inherits from `OperatorWithKernel` while the ones without inherit from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
+Operators can be categorized into two groups: operator with kernel(s) and operator without kernel(s). An operator with kernel(s) inherits from `OperatorWithKernel` while the one without kernel(s) inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
Information | Where is it defined
@@ -32,7 +34,7 @@ Kernel implementation | The kernel methods shared between CPU and CUDA are
Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
-New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. **
+New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
@@ -50,7 +52,7 @@ First, define `ProtoMaker` to describe the Operator's input, output, and additio
```cpp
class MulOpMaker : public framework::OpProtoAndCheckerMaker {
public:
- MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+ MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor), 2D tensor of size (M x K)");
AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
@@ -79,7 +81,7 @@ An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/de
template
class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
- ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+ ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input tensor of scale operator.").NotInGradient();
AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
@@ -156,7 +158,8 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
- `typename T` denotes data type, such as `float` or `double`.
`MulKernel` types need to rewrite the interface for `Compute`.
-- `Compute` takes one input variable `const framework::ExecutionContext& context`.
+
+- `Compute` takes one input parameter: `const framework::ExecutionContext& context`.
- Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables.
- `Compute` implements the computation logics of an `OpKernel`.
@@ -177,7 +180,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
};
```
-Note that **different devices (CPU, CUDA)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
+Note that **different devices (CPU, CUDA) share one Op definition; whether or not they share the same `OpKernel` depends on whether the functions called by `Compute` can support both devices.**
`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
@@ -188,13 +191,14 @@ This concludes the forward implementation of an operator. Next its operation and
The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
-### Registering Operator
+### Registering Operator and OpKernel
- In `.cc` files, register forward and backward operator classes and the CPU kernel.
```cpp
namespace ops = paddle::operators;
REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+
REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel);
REGISTER_OP_CPU_KERNEL(mul_grad,
ops::MulGradKernel);
@@ -204,6 +208,7 @@ The definition of its corresponding backward operator, if applicable, is similar
- `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
- `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
+
- `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
@@ -225,6 +230,7 @@ The definition of its corresponding backward operator, if applicable, is similar
Run the following commands to compile.
```
+# maybe you need to rerun cmake
make mul_op
```
diff --git a/doc/fluid/dev/new_op_kernel_en.md b/doc/fluid/dev/new_op_kernel_en.md
new file mode 100644
index 0000000000..123df0a7ee
--- /dev/null
+++ b/doc/fluid/dev/new_op_kernel_en.md
@@ -0,0 +1,121 @@
+## Add Kernels for a New Device
+
+### Background
+
+PaddlePaddle Fluid has hundreds of operators. Each operator could have one or more kernels. A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
+
+[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels. The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). An operator chooses the right kernel at runtime. This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
+
+### Write Kernels for A New Device
+
+#### Add A New Device
+
+For some historical reasons, we misuse the word *library* for *device*. For example, we call the device type the *library type*. An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24). We will correct this ASAP.
+
+To register a new device, we need to add an enum value to `LibraryType`:
+
+```
+enum class LibraryType {
+ kPlain = 0,
+ kMKLDNN = 1,
+ kCUDNN = 2,
+};
+```
+
+
+#### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
+
+If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`:
+
+```cpp
+struct CUDAPlace {
+ CUDAPlace() : CUDAPlace(0) {}
+ explicit CUDAPlace(int d) : device(d) {}
+
+ inline int GetDeviceId() const { return device; }
+ // needed for variant equality comparison
+ inline bool operator==(const CUDAPlace &o) const {
+ return device == o.device;
+ }
+ inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
+
+ int device;
+};
+
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+```
+
+#### Add [device context](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37)
+After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it.
+
+```cpp
+class DeviceContext {
+ public:
+ virtual ~DeviceContext() {}
+ virtual Place GetPlace() const = 0;
+
+ virtual void Wait() const {}
+};
+```
+
+#### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
+
+A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md)
+
+```cpp
+class OpKernelBase {
+ public:
+ /**
+ * ExecutionContext is the only parameter of Kernel Run function.
+ * Run will get input/output variables, state such as momentum and
+ * device resource such as CUDA stream, cublas handle, etc. from
+ * ExecutionContext. User should construct it before run the Operator.
+ */
+
+ virtual void Compute(const ExecutionContext& context) const = 0;
+
+ virtual ~OpKernelBase() = default;
+};
+
+template <typename T>
+class OpKernel : public OpKernelBase {
+ public:
+ using ELEMENT_TYPE = T;
+};
+```
+
+
+#### Register the OpKernel to framework
+
+After writing the components described above, we should register the kernel to the framework.
+
+We use `REGISTER_OP_KERNEL` to do the registration.
+
+```cpp
+REGISTER_OP_KERNEL(
+ op_type,
+ library_type,
+ place_type,
+ kernel0, kernel1, ...)
+```
+
+`kernel0` and `kernel1` are kernels that have the same `op_type`, `library_type`, and `place_type` but different `data_types`.
+
+Take [`conv2d`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/conv_cudnn_op.cu.cc#L318) as an example:
+
+ ```cpp
+ REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
+        paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+        paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+
+ REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
+        paddle::operators::CUDNNConvOpKernel<float>,
+        paddle::operators::CUDNNConvOpKernel<double>);
+ ```
+
+In the code above:
+
+ - `conv2d` is the type/name of the operator
+ - `CUDNN/CPU` is `library`
+ - `paddle::platform::CUDAPlace/CPUPlace` is `place`
+ - template parameter `float/double` on `CUDNNConvOpKernel` is `data_type`.
diff --git a/doc/fluid/dev/op_markdown_format.md b/doc/fluid/dev/op_markdown_format.md
new file mode 100644
index 0000000000..0ee804d592
--- /dev/null
+++ b/doc/fluid/dev/op_markdown_format.md
@@ -0,0 +1,64 @@
+# Standard Markdown Format for Operators
+The following should be the standard format for documentation for all the operators that will get rendered in the `html`:
+
+```
+Operator Name (In PaddlePaddle)
+
+Operator Name (Standard)
+
+Operator description.
+
+LaTeX equation of how the operator performs an update.
+
+The signature of the operator.
+```
+
+Each section mentioned above has been covered in further detail in the rest of the document.
+
+# PaddlePaddle Operator Name
+This should be in all lowercase letters; in case of multiple words, we separate them with an underscore. For example:
+`array to lod tensor` should be written as `array_to_lod_tensor`.
+
+This naming convention should be standard across all PaddlePaddle operators.
+
+# Standard Operator Name
+This is the standard name of the operator as used in the community. The general standard is usually:
+- Standard abbreviations like `SGD` are written in all capital letters.
+- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word).
+- Keep numbers inside a word as is, with no boundary delimiters.
+- Follow the name of the operator with the keyword: `Activation Operator.`
+
+# Operator description
+This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section.
+
+# LaTeX equation
+This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Multiple words combined into a single variable name should be separated by an underscore (`_`).
+
+# The signature
+This section describes the signature of the operator: a list of Inputs and Outputs, each of which has a short description of what the variable represents and the type of the variable. The variable names follow the `CamelCase` naming convention. The proposed format is:
+`Section :
+VariableName : (VariableType) VariableDescription
+...
+...
+`
+
+
+The following example for an `sgd` operator covers the sections mentioned above as they would ideally look in the rendered `html`:
+
+```
+sgd
+
+SGD operator
+
+This operator implements one step of the stochastic gradient descent algorithm.
+
+param_out = param - learning_rate * grad
+
+Inputs:
+Param : (Tensor) Input parameter
+LearningRate : (Tensor) Learning rate of SGD
+Grad : (Tensor) Input gradient
+
+Outputs:
+ParamOut : (Tensor) Output parameter
+```
diff --git a/doc/design/releasing_process.md b/doc/fluid/dev/releasing_process.md
similarity index 68%
rename from doc/design/releasing_process.md
rename to doc/fluid/dev/releasing_process.md
index 14c081ea84..b978726109 100644
--- a/doc/design/releasing_process.md
+++ b/doc/fluid/dev/releasing_process.md
@@ -7,11 +7,9 @@ PaddlePaddle每次发新的版本,遵循以下流程:
1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0`
1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。
1. 对这个版本的提交,做如下几个操作:
+ * 使用Regression Test List作为检查列表,测试本次release的正确性。
+ * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步
* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
- * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步
- * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。
- * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性
- * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步
* 编译这个版本的python wheel包,并发布到pypi。
* 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。
* pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`。
@@ -21,8 +19,8 @@ PaddlePaddle每次发新的版本,遵循以下流程:
pip install twine
twine upload dist/[package to upload]
```
+ * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步
1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
-1. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面
1. 协同完成Release Note的书写
@@ -31,6 +29,30 @@ PaddlePaddle每次发新的版本,遵循以下流程:
* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。
* 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
+## 发布wheel包到pypi
+
+使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+完成自动化二进制编译,参考下图,选择需要发布的版本(通常包含一个CPU版本和一个GPU版本),点击"run"右侧的"..."按钮,可以
+弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。等待编译完成后
+可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。然后按照上述的方法
+使用`twine`工具上传即可。
+
+
+
+* 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
+  发行版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
+* pypi不支持覆盖上传,所以一个版本号的wheel包发布之后,不可以更改。下一个wheel包需要更新版本号才可以上传。
+
+## 发布Docker镜像
+
+上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上
+版本号对应的tag即可:
+
+1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间是否在上述wheel包编译完成之后,以确认镜像是最新的。
+1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`,latest tag可以是latest或latest-gpu等。
+1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
+1. 执行 `docker push paddlepaddle/paddle:[version]`
+
## PaddlePaddle 分支规范
PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。
diff --git a/doc/fluid/dev/src/fc.py b/doc/fluid/dev/src/fc.py
new file mode 100644
index 0000000000..3b074821cc
--- /dev/null
+++ b/doc/fluid/dev/src/fc.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def fc(input,
+ size,
+ num_flatten_dims=1,
+ param_attr=None,
+ bias_attr=None,
+ act=None,
+ name=None):
+ """
+ **Fully Connected Layer**
+
+ The fully connected layer can take multiple tensors as its inputs. It
+ creates a variable called weights for each input tensor, which represents
+ a fully connected weight matrix from each input unit to each output unit.
+    The fully connected layer multiplies each input tensor with its corresponding
+    weight to produce an output Tensor. If multiple input tensors are given,
+    the results of multiple multiplications will be summed up. If bias_attr is
+ not None, a bias variable will be created and added to the output. Finally,
+ if activation is not None, it will be applied to the output as well.
+
+ This process can be formulated as follows:
+
+ .. math::
+
+ Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+
+ In the above equation:
+
+ * :math:`N`: Number of the input.
+ * :math:`X_i`: The input tensor.
+ * :math:`W`: The weights created by this layer.
+ * :math:`b`: The bias parameter created by this layer (if needed).
+ * :math:`Act`: The activation function.
+ * :math:`Out`: The output tensor.
+
+ Args:
+ input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+ the input tensor(s) is at least 2.
+ size(int): The number of output units in this layer.
+ num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
+ two dimensions. If this happens, the multidimensional tensor will first be flattened
+ into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
+ tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
+ dimensions will be flatten to form the first dimension of the final matrix (height of
+ the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
+ form the second dimension of the final matrix (width of the matrix). For example, suppose
+            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
+ Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
+ param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+ parameters/weights of this layer.
+ bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
+ of this layer. If it is set to None, no bias will be added to the output units.
+ act (str, default None): Activation to be applied to the output of this layer.
+ name (str, default None): The name of this layer.
+
+ Returns:
+ A tensor variable storing the transformation result.
+
+ Raises:
+ ValueError: If rank of the input tensor is less than 2.
+
+ Examples:
+ .. code-block:: python
+
+ data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+ fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+ """
diff --git a/doc/fluid/dev/support_new_device.md b/doc/fluid/dev/support_new_device.md
new file mode 100644
index 0000000000..8983df9004
--- /dev/null
+++ b/doc/fluid/dev/support_new_device.md
@@ -0,0 +1,240 @@
+# Design Doc: Supporting new Device/Library
+
+## Background
+
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.
+
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
+
+On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
+
+So, how to support a new Device/Library in Fluid becomes a challenge.
+
+
+## Basic: Integrate A New Device/Library
+
+For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md).
+
+There are mainly three parts that we have to consider while integrating a new device/library:
+
+- Place and DeviceContext: indicate the device id and manage hardware resources
+
+- Memory and Tensor: malloc/free data on certain device
+
+- Math Functor and OpKernel: implement computing unit on certain devices/libraries
+
+### Place and DeviceContext
+
+Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
+
+#### Place
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
+
+```
+ | CPUPlace
+Place --| CUDAPlace
+ | FPGAPlace
+```
+
+And `Place` is defined as follows:
+
+```
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+```
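+
+As a usage sketch (assuming the helper predicates in `paddle/platform/place.h`, such as `is_cpu_place`), code that needs to branch on the concrete device can inspect a `Place` value like this:
+
+```
+// Sketch: branching on the concrete place held by the boost::variant.
+paddle::platform::Place place = paddle::platform::CPUPlace();
+if (paddle::platform::is_cpu_place(place)) {
+  // take the CPU code path
+} else {
+  // take the device-specific path (e.g. CUDAPlace)
+}
+```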
+
+#### DeviceContext
+
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as the CUDA stream in `CUDADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+
+
+```
+ /-> CPUDeviceContext
+DeviceContext ----> CUDADeviceContext
+ \-> FPGADeviceContext
+```
+
+An example of Nvidia GPU is as follows:
+
+- DeviceContext
+
+
+```
+class DeviceContext {
+ virtual Place GetPlace() const = 0;
+};
+```
+
+
+- CUDADeviceContext
+
+
+```
+class CUDADeviceContext : public DeviceContext {
+ Place GetPlace() const override { return place_; }
+private:
+ CUDAPlace place_;
+ cudaStream_t stream_;
+ cublasHandle_t cublas_handle_;
+  std::unique_ptr<Eigen::GpuDevice> eigen_device_;  // binds with stream_
+};
+```
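+
+- A hypothetical FPGADeviceContext (sketch only; `FPGAPlace`/`FPGADeviceContext` are taken from the diagram above and are not in the codebase)
+
+
+```
+class FPGADeviceContext : public DeviceContext {
+ public:
+  explicit FPGADeviceContext(FPGAPlace place) : place_(place) {}
+  Place GetPlace() const override { return place_; }
+
+ private:
+  FPGAPlace place_;
+  // handles to the device's own queue/stream and library contexts go here
+};
+```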
+
+### Memory and Tensor
+
+
+#### memory module
+
+Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36):
+
+```
+template <typename Place>
+void* Alloc(Place place, size_t size);
+
+template <typename Place>
+void Free(Place place, void* ptr);
+
+template <typename Place>
+size_t Used(Place place);
+```
+
+To implement these interfaces, we have to implement MemoryAllocator for different Devices.
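+
+A minimal usage sketch of these interfaces on the host (assuming the `paddle::memory` namespace):
+
+```
+// Sketch: allocate and release a raw buffer through the memory module.
+paddle::platform::CPUPlace cpu;
+void* ptr = paddle::memory::Alloc(cpu, 1024);  // 1 KB on the host
+// ... use ptr ...
+paddle::memory::Free(cpu, ptr);
+```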
+
+
+#### Tensor
+
+[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place.
+
+```cpp
+class Tensor {
+ public:
+ /*! Return a pointer to mutable memory block. */
+  template <typename T>
+ inline T* data();
+
+ /**
+ * @brief Return a pointer to mutable memory block.
+ * @note If not exist, then allocation.
+ */
+  template <typename T>
+ inline T* mutable_data(platform::Place place);
+
+ /**
+ * @brief Return a pointer to mutable memory block.
+ *
+ * @param[in] dims The dimensions of the memory block.
+ * @param[in] place The place of the memory block.
+ *
+ * @note If not exist, then allocation.
+ */
+  template <typename T>
+ inline T* mutable_data(DDim dims, platform::Place place);
+
+ /*! Resize the dimensions of the memory block. */
+ inline Tensor& Resize(const DDim& dims);
+
+ /*! Return the dimensions of the memory block. */
+ inline const DDim& dims() const;
+
+ private:
+ /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+ /*! points to dimensions of memory block. */
+ DDim dim_;
+};
+```
+
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, use `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
+
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data<float>(place);
+```
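+
+Equivalently, the `mutable_data(DDim, Place)` overload declared above combines `Resize` and allocation in one call; a short sketch:
+
+```cpp
+paddle::framework::Tensor t2;
+// Resize and allocate in one call, using the (dims, place) overload.
+t2.mutable_data<float>(paddle::framework::make_ddim({2, 3}),
+                       paddle::platform::CPUPlace());
+```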
+
+
+
+### Math Functor and OpKernel
+
+Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part is put in the `operators/math` directory as basic Functors.
+
+Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
+
+The interface is defined in the header file.
+
+```
+template <typename DeviceContext, typename T>
+class MaxOutFunctor {
+ public:
+ void operator()(const DeviceContext& context, const framework::Tensor& input,
+ framework::Tensor* output, int groups);
+};
+```
+
+CPU implementation is in .cc file
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CPUDeviceContext, T> {
+ public:
+ void operator()(const platform::CPUDeviceContext& context,
+ const framework::Tensor& input, framework::Tensor* output,
+ int groups) {
+ ...
+ }
+};
+```
+
+CUDA implementation is in .cu file
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CUDADeviceContext, T> {
+ public:
+ void operator()(const platform::CUDADeviceContext& context,
+ const framework::Tensor& input, framework::Tensor* output,
+ int groups) {
+ ...
+ }
+};
+```
+
+
+We first obtain the computing handle from a concrete DeviceContext and then compute on tensors.
+
+The implementation of an `OpKernel` is similar to that of the math functors (see the sketch below); the extra thing we need to do is to register the OpKernel in a global map.
+
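+A simplified sketch of such a kernel (condensed from the real maxout operator; error checks and some details are omitted):
+
+```
+template <typename DeviceContext, typename T>
+class MaxOutKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* in = ctx.Input<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    int groups = ctx.Attr<int>("groups");
+    // obtain the concrete DeviceContext, then dispatch to the shared functor,
+    // which allocates the output and performs the computation
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    math::MaxOutFunctor<DeviceContext, T> functor;
+    functor(dev_ctx, *in, out, groups);
+  }
+};
+```
+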
+Fluid provides different register interfaces in op_registry.h
+
+
+Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example:
+
+In .cc file:
+
+```
+REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel);
+REGISTER_OP_CPU_KERNEL(
+ crop_grad, ops::CropGradKernel);
+```
+
+In .cu file:
+
+```
+REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel);
+REGISTER_OP_CUDA_KERNEL(
+ crop_grad, ops::CropGradKernel);
+```
+
+
+## Advanced topics: How to switch between different Device/Library
+
+Generally, we will implement an OpKernel for every Device/Library of an Operator, so we can easily train a Convolutional Neural Network on GPU. However, some OpKernels are not suitable for a specific Device. For example, the crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstances, we have to switch between different Devices/Libraries; a sketch of the switching hook is shown below.
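+
+The switching hook is the operator's `GetExpectedKernelType` method. A simplified sketch, modeled on the CPU-only crf case (the input name "X" and the exact signature are assumptions for illustration):
+
+```
+// Sketch: force the CPU kernel for an operator that has no GPU implementation,
+// regardless of where the rest of the program runs.
+framework::OpKernelType GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const override {
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+      platform::CPUPlace());
+}
+```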
+
+
+For more details, please refer to the following docs:
+
+- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md)
+- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md)
diff --git a/doc/howto/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md
similarity index 95%
rename from doc/howto/dev/use_eigen_cn.md
rename to doc/fluid/dev/use_eigen_cn.md
index 1367323b71..f36843b440 100644
--- a/doc/howto/dev/use_eigen_cn.md
+++ b/doc/fluid/dev/use_eigen_cn.md
@@ -107,7 +107,7 @@ void Compute(const framework::ExecutionContext& context) const override {
### paddle::framework::Tensor到EigenTensor的转换
-如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。
+如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。
以EigenTensor为例,做一个介绍
@@ -125,7 +125,7 @@ From是EigenTensor模板提供的一个接口,可以实现从paddle::framework
在Eigen中,不同rank的Tensor是不同类型,Vector是rank为1的Tensor。需要额外注意的是,EigenVector::From方法是把paddle中的一维Tensor转为Eigen的一维Tensor,在这里用EigenVector来表示;而EigenVector::Flatten方法是把paddle中的一个Tensor进行reshape操作,压扁成为Eigen的一维Tensor,类型仍然为EigenVector。
-更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc)。
+更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc)。
diff --git a/doc/howto/dev/use_eigen_en.md b/doc/fluid/dev/use_eigen_en.md
similarity index 95%
rename from doc/howto/dev/use_eigen_en.md
rename to doc/fluid/dev/use_eigen_en.md
index e169106e12..3a466f73d1 100644
--- a/doc/howto/dev/use_eigen_en.md
+++ b/doc/fluid/dev/use_eigen_en.md
@@ -107,7 +107,7 @@ void Compute(const framework::ExecutionContext& context) const override {
### paddle::framework::Tensor到EigenTensor的转换
-As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
+As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
Using EigenTensor as an example:
@@ -125,7 +125,7 @@ EigenTensor::Type et = EigenTensor::From(t);
In Eigen, tensors with different ranks are different types, with `Vector` bring a rank-1 instance. Note that `EigenVector::From` uses a transformation from an 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor while `EigenVector::Flatten` reshapes a paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector.
-For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in the `eigen_test.cc` file.
+For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc) in the `eigen_test.cc` file.
diff --git a/doc/fluid/faq/index_cn.rst b/doc/fluid/faq/index_cn.rst
new file mode 100644
index 0000000000..395c110989
--- /dev/null
+++ b/doc/fluid/faq/index_cn.rst
@@ -0,0 +1,2 @@
+FAQ
+------------
diff --git a/doc/fluid/faq/index_en.rst b/doc/fluid/faq/index_en.rst
new file mode 100644
index 0000000000..395c110989
--- /dev/null
+++ b/doc/fluid/faq/index_en.rst
@@ -0,0 +1,2 @@
+FAQ
+------------
diff --git a/doc/design/reader/README.md b/doc/fluid/getstarted/concepts/reader/README.md
similarity index 100%
rename from doc/design/reader/README.md
rename to doc/fluid/getstarted/concepts/reader/README.md
diff --git a/doc/design/model_format.md b/doc/fluid/getstarted/concepts/save_model/model_format.md
similarity index 100%
rename from doc/design/model_format.md
rename to doc/fluid/getstarted/concepts/save_model/model_format.md
diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst
new file mode 100644
index 0000000000..c4d8525f23
--- /dev/null
+++ b/doc/fluid/getstarted/index_cn.rst
@@ -0,0 +1,4 @@
+新手入门
+------------
+
+新手入门
diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst
new file mode 100644
index 0000000000..a4efd05e2f
--- /dev/null
+++ b/doc/fluid/getstarted/index_en.rst
@@ -0,0 +1,4 @@
+GET STARTED
+------------
+
+This is the get started page
diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
new file mode 100644
index 0000000000..1b6f767869
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -0,0 +1,145 @@
+# Fluid 分布式版本使用指南
+本篇文章将说明如何在PaddlePaddle Fluid版本下进行分布式训练的配置和执行,以及将单机训练脚本改造成支持集群训练的版本
+
+## 准备工作
+* 可用的集群
+
+ 包含一个或多个计算节点的集群,每一个节点都能够执行PaddlePaddle的训练任务且拥有唯一的IP地址,集群内的所有计算节点可以通过网络相互通信。
+* 安装PaddlePaddle Fluid with Distribution版本
+
+  所有的计算节点上均需要安装分布式版本的PaddlePaddle, 在用于GPU等设备的机器上还需要额外安装好相应的驱动程序和CUDA的库。
+
+ **注意:**当前对外提供的PaddlePaddle版本并不支持分布式,需要通过源码重新编译。编译和安装方法参见[编译和安装指南](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html)。
+ cmake编译命令中需要将WITH_DISTRIBUTE设置为ON,下面是一个cmake编译指令示例:
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+
+## 更新训练脚本
+这里,我们以[Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)课程中的第一章 fit a line 为例,描述如何将单机训练脚本改造成支持集群训练的版本。
+### 单机训练脚本示例
+```python
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.uci_housing.train(), buf_size=500),
+ batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+ fluid.io.save_persistables(exe, "./fit_a_line.model/")
+ fluid.io.load_persistables(exe, "./fit_a_line.model/")
+ for data in train_reader():
+ avg_loss_value, = exe.run(fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost])
+
+ if avg_loss_value[0] < 10.0:
+ exit(0) # if avg cost less than 10.0, we think our code is good.
+exit(1)
+```
+
+我们创建了一个简单的全连接神经网络程序,并且通过Fluid的Executor执行了100次迭代,现在我们需要将该单机版本的程序更新为分布式版本的程序。
+### 介绍Parameter Server
+在非分布式版本的训练脚本中,只存在Trainer一种角色,它不仅处理常规的计算任务,也处理参数相关的计算、保存和优化任务。在分布式版本的训练过程中,由于存在多个Trainer节点进行同样的数据计算任务,因此需要有一个中心化的节点来统一处理参数相关的保存和分配。在PaddlePaddle中,我们称这样的节点为[Parameter Server](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/dist_train/parameter_server.md)
+
+**因此,在分布式的Fluid环境中,我们有两个角色需要创建,分别是Parameter Server和Trainer。**
+
+### 分布式训练
+Fluid专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数,将它们分隔为两部分,通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。
+```python
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+```
+将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下:
+```python
+... #define the program, cost, and create sgd optimizer
+
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters
+
+t = fluid.DistributeTranspiler() # create the transpiler instance
+# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+... #create executor
+
+# in pserver, run this
+#current_endpoint here means current pserver IP:PORT you wish to run on
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
+
+# in trainer, run this
+... # define data reader
+exe.run(fluid.default_startup_program())
+for pass_id in range(100):
+ for data in train_reader():
+ exe.run(t.get_trainer_program())
+```
+### 分布式训练脚本运行说明
+分布式任务的运行需要将表格中说明的多个参数进行赋值:
+
+| 参数名 | 值类型 | 说明 | 示例 |
+|:-------------|:------|:---------------------------------------|:-------------|
+| trainer_id | int | 当前训练节点的ID,训练节点ID编号为0 - n-1, n为trainers的值 | 0/1/2/3 |
+| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 |
+| trainers | int | 训练节点的总个数,>0的数字 | 4 |
+| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 |
+| training_role | str | 节点角色, TRAINER/PSERVER | PSERVER |
+
+**注意:** ```training_role```是用来区分当前所起服务的角色的,用于训练程序中,用户可根据需要自行定义,其他参数为fluid.DistributeTranspiler的transpile函数所需要,需要在调用函数前进行定义,样例如下:
+
+```python
+t = fluid.DistributeTranspiler()
+t.transpile(
+ optimize_ops,
+ params_grads,
+ trainer_id,
+ pservers=pserver,
+ trainers=trainers)
+if training_role == "PSERVER":
+ pserver_prog = t.get_pserver_program(server_endpoint)
+ pserver_startup = t.get_startup_program(server_endpoint, pserver_prog)
+```
+
+### Demo
+完整的demo代码位于Fluid的test目录下的[book](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py)中。
+
+第一步,进入demo代码所在目录:
+```bash
+cd /paddle/python/paddle/fluid/tests/book
+```
+
+第二步,启动Parameter Server:
+```bash
+PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
+```
+执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Parameter Server已经正常启动。
+
+第三步,启动Trainer:
+```bash
+PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
+```
+由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。
+
+现在我们就启动了一个包含一个Parameter Server和两个Trainer的分布式训练任务。
diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_en.md b/doc/fluid/howto/cluster/fluid_cluster_train_en.md
new file mode 100644
index 0000000000..b4465e8269
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_en.md
@@ -0,0 +1,153 @@
+# Fluid Distributed Training
+
+## Introduction
+
+In this article, we'll explain how to configure and run distributed training jobs with PaddlePaddle Fluid in a bare metal cluster.
+
+## Preparations
+
+### Getting the cluster ready
+
+Prepare the compute nodes in the cluster. Nodes in this cluster can be of any specification that runs PaddlePaddle, each with a unique IP address assigned to it. Make sure they can communicate with each other.
+
+### Have PaddlePaddle installed
+
+PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes, be sure to properly install drivers and CUDA libraries.
+
+PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html).
+
+In addition to the above, the `cmake` command should be run with the option `WITH_DISTRIBUTE` set to `ON`. A bare-minimum example `cmake` command would look as follows:
+
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+
+### Update the training script
+
+#### Non-cluster training script
+
+Let's take [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)'s first chapter: "fit a line" as an example.
+
+The non-cluster version of this demo with fluid API is as follows:
+
+``` python
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.uci_housing.train(), buf_size=500),
+ batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+ fluid.io.save_persistables(exe, "./fit_a_line.model/")
+ fluid.io.load_persistables(exe, "./fit_a_line.model/")
+ for data in train_reader():
+ avg_loss_value, = exe.run(fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost])
+
+ if avg_loss_value[0] < 10.0:
+ exit(0) # if avg cost less than 10.0, we think our code is good.
+exit(1)
+```
+
+We created a simple fully-connected neural network training program and handed it to the fluid executor to run for 100 passes.
+
+Now let's try to convert it to a distributed version to run on a cluster.
+
+#### Introducing parameter server
+
+As we can see from the non-cluster version of the training script, there is only one role in the script: the trainer, which performs the computation as well as holds the parameters. In cluster training, since multiple trainers are working on the same task, they need one centralized place to hold and distribute parameters. This centralized place is called the Parameter Server in PaddlePaddle.
+
+
+
+Parameter Server in fluid not only holds the parameters but is also assigned with a part of the program. Trainers communicate with parameter servers via send/receive OPs. For more technical details, please refer to [this document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/dist_refactor/distributed_architecture.md).
+
+Now we need to create programs for both: trainers and parameter servers, the question is how?
+
+#### Slice the program
+
+Fluid provides a tool called "Distributed Transpiler" that automatically converts the non-cluster program into cluster program.
+
+The idea behind this tool is to find the optimize OPs and gradient parameters, slice the program into 2 pieces and connect them with send/receive OP.
+
+Optimize OPs and gradient parameters can be found from the return values of optimizer's minimize function.
+
+To put them together:
+
+``` python
+... #define the program, cost, and create sgd optimizer
+
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters
+
+t = fluid.DistributeTranspiler() # create the transpiler instance
+# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+... #create executor
+
+# in pserver, run this
+#current_endpoint here means current pserver IP:PORT you wish to run on
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
+
+# in trainer, run this
+... # define data reader
+exe.run(fluid.default_startup_program())
+for pass_id in range(100):
+ for data in train_reader():
+ exe.run(t.get_trainer_program())
+
+
+```
+
+### E2E demo
+
+Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
+First `cd` into the folder that contains the `python` files. In this case:
+
+```bash
+cd /paddle/python/paddle/fluid/tests/book_distribute
+```
+
+On the parameter server node, run the following in the command line:
+
+``` bash
+PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py
+```
+
+*Please note that we assume your parameter server runs at 192.168.1.2:6174.*
+
+Wait until the prompt `Server listening on 192.168.1.2:6174`
+
+Then in 2 of your trainer nodes run this:
+
+``` bash
+PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=TRAINER python notest_dist_fit_a_line.py
+```
+
+*The reason you need to run this command on 2 nodes is that the script sets the trainer count to 2. You can change this setting on line 50.*
+
+Now you have 2 trainers and 1 parameter server up and running.
diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst
new file mode 100644
index 0000000000..a92abad0c5
--- /dev/null
+++ b/doc/fluid/howto/index_cn.rst
@@ -0,0 +1,2 @@
+进阶使用
+------------
diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst
new file mode 100644
index 0000000000..06036bdce5
--- /dev/null
+++ b/doc/fluid/howto/index_en.rst
@@ -0,0 +1,4 @@
+HOW TO
+------------
+
+This is the how to page
diff --git a/doc/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md
similarity index 90%
rename from doc/howto/optimization/cpu_profiling_cn.md
rename to doc/fluid/howto/optimization/cpu_profiling_cn.md
index 14eba0e2f3..d59be670c2 100644
--- a/doc/howto/optimization/cpu_profiling_cn.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
@@ -35,7 +35,7 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
```
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.284 0.284 29.514 29.514 main.py:1()
- 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+ 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run)
4696 12.040 0.003 12.040 0.003 {built-in method run}
1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14()
```
@@ -61,9 +61,9 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
```text
4696 12.040 0.003 12.040 0.003 {built-in method run}
300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
- 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
- 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
- 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1()
+ 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__)
+ 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)
+ 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1()
```
可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。
@@ -76,9 +76,9 @@ Called By:
Function was called by...
ncalls tottime cumtime
-/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
-/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
- 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone)
+ 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward)
Called:
diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/fluid/howto/optimization/cpu_profiling_en.md
similarity index 88%
rename from doc/howto/optimization/cpu_profiling.md
rename to doc/fluid/howto/optimization/cpu_profiling_en.md
index 1775374cf6..01e5fddf61 100644
--- a/doc/howto/optimization/cpu_profiling.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
@@ -49,7 +49,7 @@ port, we will see the output like the following:
```
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.284 0.284 29.514 29.514 main.py:1()
- 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+ 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run)
4696 12.040 0.003 12.040 0.003 {built-in method run}
1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14()
```
@@ -60,8 +60,7 @@ each column is as follows:
| column | meaning |
| --- | --- |
| ncalls | the number of calls into a function |
-| tottime | the total execution time of the function, not including the
- execution time of other functions called by the function |
+| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
| percall | tottime divided by ncalls |
| cumtime | the total execution time of the function, including the execution time of other functions being called |
| percall | cumtime divided by ncalls |
@@ -75,9 +74,9 @@ focus on. We can sort above profiling file by tottime:
```text
4696 12.040 0.003 12.040 0.003 {built-in method run}
300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
- 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
- 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
- 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1()
+ 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__)
+ 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)
+ 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1()
```
We can see that the most time-consuming function is the `built-in
@@ -94,9 +93,9 @@ Called By:
Function was called by...
ncalls tottime cumtime
-/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
-/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
- 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone)
+ 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward)
Called:
diff --git a/doc/howto/optimization/pprof_1.png b/doc/fluid/howto/optimization/pprof_1.png
similarity index 100%
rename from doc/howto/optimization/pprof_1.png
rename to doc/fluid/howto/optimization/pprof_1.png
diff --git a/doc/howto/optimization/pprof_2.png b/doc/fluid/howto/optimization/pprof_2.png
similarity index 100%
rename from doc/howto/optimization/pprof_2.png
rename to doc/fluid/howto/optimization/pprof_2.png
diff --git a/doc/fluid/howto/optimization/timeline.jpeg b/doc/fluid/howto/optimization/timeline.jpeg
new file mode 100644
index 0000000000..38ec3f80c9
Binary files /dev/null and b/doc/fluid/howto/optimization/timeline.jpeg differ
diff --git a/doc/fluid/howto/optimization/timeline.md b/doc/fluid/howto/optimization/timeline.md
new file mode 100644
index 0000000000..9d9565a3e6
--- /dev/null
+++ b/doc/fluid/howto/optimization/timeline.md
@@ -0,0 +1,27 @@
+## How to use the timeline tool to profile
+
+1. Add `with profiler.profiler(...)` to the main training loop. After the run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, because the profile record grows with the number of batches.
+
+ ```python
+ with profiler.profiler('All', 'total', '/tmp/profile') as prof:
+ for pass_id in range(pass_num):
+ for batch_id, data in enumerate(train_reader()):
+ exe.run(fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[],
+ use_program_cache=True)
+ ...
+ ```
+
+1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`; it will generate another
+file, `/tmp/timeline`, by default. You can change the path via a command-line parameter; please take a look at
+[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
+
+1. Open Chrome and visit `chrome://tracing/`, then use the `load` button to load the generated `timeline` file.
+
+   ![chrome tracing](./tracing.jpeg)
+
+1. The resulting timeline should be like:
+
+
+   ![chrome timeline](./timeline.jpeg)
diff --git a/doc/fluid/howto/optimization/tracing.jpeg b/doc/fluid/howto/optimization/tracing.jpeg
new file mode 100644
index 0000000000..3a49fc4f8a
Binary files /dev/null and b/doc/fluid/howto/optimization/tracing.jpeg differ
diff --git a/doc/fluid/howto/performance/error_clip.md b/doc/fluid/howto/performance/error_clip.md
new file mode 100644
index 0000000000..58aa73b8cd
--- /dev/null
+++ b/doc/fluid/howto/performance/error_clip.md
@@ -0,0 +1,92 @@
+# Error Clip
+
+## Overview
+
+Error clip is widely used in model training to prevent gradient exploding. It applies specific rules to adjust variables' gradients and prevent them from becoming too large. With it, values of a gradient will be checked before they are taken by the next `grad_op` and shrunk if necessary.
+
+## Usage
+
+Users are allowed to assign different error clip methods or attributes to different `Variable`s. Users can specify it as a parameter of `Variable`'s constructor:
+
+```python
+var = framework.Variable(..., error_clip=myErrorClip, ...)
+```
+
+The default value of `error_clip` is `None`, which means no error clip is employed. When it's not `None`, it should take an object of `BaseErrorClipAttr`'s derived class. So far, `BaseErrorClipAttr` has only one derived class: `ErrorClipByValue`, whose constructor is:
+
+```python
+ErrorClipByValue(max, min=None)
+```
+
+`max` and `min` represent the maximal and minimal clip thresholds respectively. In the backward pass, all values of `var`'s gradient greater than `max` or less than `min` will be clipped to `max` and `min` respectively. When `min` is None, the minimal threshold will be set to `-max` automatically.
+
+So we can enable the error clip with threshold `[-5.0, 5.0]` for variable `var` by:
+
+```python
+var = framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
+```
+
+## Implementation
+
+The `BaseErrorClipAttr` and its derived class `ErrorClipByValue` are defined in *clip.py*.
+
+```python
+class BaseErrorClipAttr(object):
+ def append_clip_op(self, block, grad_name):
+ raise NotImplementedError()
+
+
+class ErrorClipByValue(BaseErrorClipAttr):
+ def __init__(self, max, min=None):
+ max = float(max)
+ if min is None:
+ min = -max
+ else:
+ min = float(min)
+ self.max = max
+ self.min = min
+
+ def append_clip_op(self, block, grad_name):
+ clip_op_desc = block.desc.append_op()
+ clip_op_desc.set_type("clip")
+ clip_op_desc.set_input("X", [grad_name])
+ clip_op_desc.set_output("Out", [grad_name])
+ clip_op_desc.set_attr("min", self.min)
+ clip_op_desc.set_attr("max", self.max)
+```
+
+`BaseErrorClipAttr` has one main member function: `append_clip_op(self, block, grad_name)`.
+
+This function is used to create a `clip_op` and append it to the end of the given `block`. Because different error clip algorithms require different `clip_op`s, the function is defined as virtual in the base class. All derived classes must implement their own versions of this function.
+
+These `clip_op`s should be inserted after `grad_op`s whose output gradients need to be clipped. It is equivalent to appending some `clip_op`s to the end of the target block every time a new `grad_op` is added.
+
+```python
+for op_desc in grad_op_descs:
+ new_op_desc = target_block.desc.append_op()
+ new_op_desc.copy_from(op_desc)
+ callback(block=target_block, context=grad_to_var)
+```
+
+Here we employ a callback function to complete this kind of jobs. In `_append_backward_ops_` function, each time after a `grad_op` is added to the `target_block`, a callback function is invoked. The logic of `clip_op` appending can be implemented inside the callback function.
+
+The callback function for `clip_op` appending is defined in *clip.py*:
+
+```python
+def error_clip_callback(block, context):
+ # the context is a grad_to_var map
+ grad_to_var = context
+ op_desc = block.desc.op(block.desc.op_size() - 1)
+ for grad_n in filter(lambda n: grad_to_var.has_key(n),
+ op_desc.output_arg_names()):
+ fwd_var = block.var_recursive(grad_to_var[grad_n])
+ error_clip = getattr(fwd_var, "error_clip", None)
+ if not (error_clip is None or isinstance(error_clip,
+ BaseErrorClipAttr)):
+ raise TypeError(
+ "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
+ )
+ if error_clip is not None:
+ error_clip.append_clip_op(block, grad_n)
+```
+
+This function takes a `block` and a `context`(which is actually a grad\_to\_var map) as inputs. It checks each output of the last `OpDesc` in the `block`. Notice that the last `OpDesc` of the `block` must be a `grad_op` and its outputs must be some forward variables' gradients. If an output gradient's corresponding forward variable has an attribute of `error_clip`, `error_clip_callback` will call the `error_clip`'s `append_clip_op` function to append the required `clip_op` into the `block`.
diff --git a/doc/fluid/howto/performance/images/profiler.png b/doc/fluid/howto/performance/images/profiler.png
new file mode 100644
index 0000000000..d57b71ca88
Binary files /dev/null and b/doc/fluid/howto/performance/images/profiler.png differ
diff --git a/doc/fluid/howto/performance/profiler.md b/doc/fluid/howto/performance/profiler.md
new file mode 100644
index 0000000000..b20b5efdc1
--- /dev/null
+++ b/doc/fluid/howto/performance/profiler.md
@@ -0,0 +1,97 @@
+## Introduction
+
+There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). Like most popular deep learning frameworks, PaddlePaddle uses C++, CUDA and Python as its basic programming languages and runs on both CPU and GPU devices. The [`nvprof` tool](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse CUDA programs, and we have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) that uses [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof) to profile only the CPU and Python parts of a program. But for [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), the operator is the basic computing unit, and developers usually want to collect the time of each operator to locate bottlenecks. `nvprof` collects the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory sets, CUDA API calls, and events or metrics for CUDA kernels, while `yep` and Google's perftools cannot collect the timeline of a CUDA program. None of these tools can collect time at the operator level, so we designed this profiling tool.
+
+## Architecture
+
+The workflow for most tasks is as follows. Each operator runs many times across all iterations, so the profiler must collect the total time of each operator over the iterations. Moreover, developers may sometimes want to collect more detailed time spans inside an operator or record time spans elsewhere, which requires the profiler to support nested time spans. To speed up training, all deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs, so the profiler must be able to collect a timeline for each thread. In addition, the profiler itself occupies certain resources, so it must be easy for developers to enable or disable. Finally, the profiler should present a human-readable report.
+
+```python
+for i in xrange(M): # M is the iteration number
+ for op in operator_lists: # The `operator_lists` contains all the operators in the network.
+ op.run();
+```
+
+In summary, the profiler should have the following features:
+
+- record time spans in loops.
+- support nested time spans.
+- support multiple threads/multiple GPUs.
+- support being enabled and disabled by users.
+
+But how do we record the time for a mixed C++ and CUDA program? There are many C++ APIs to get the current calendar time in the host program. But for the GPU, CUDA kernels may be executed concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams), and CUDA kernels are asynchronous with respect to the host program if there is no synchronization after them. CUDA provides [events](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA events, we also design and apply events to record the timeline, and then summarize and present statistics based on these events.
+
+The overall flow is shown as the following figure.
+
+
+
+### Event
+
+In the above workflow, a pair of events is needed before and after a piece of code to collect its time, so each event has a flag to mark whether it is a starting or an ending event. Besides these two kinds of events, sometimes a plain marker with a text message is needed, for example, a marker to specify the start or end of profiling. So there are three kinds of events:
+
+```c++
+enum EventKind {
+ kMark,
+ kPushRange,
+ kPopRange};
+```
+- kMark: only a marker without time range.
+- kPushRange: mark the starting event for time range.
+- kPopRange: mark the ending event for time range.
+
+For CPU code, the events only need to record the current time. For CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. When there are many pieces of code, event lists are used to record them.
+
+```c++
+class Event {
+ public:
+ // The DeviceContext is used to get current CUDA stream.
+ Event(EventKind kind, std::string name, uint32_t thread_id,
+ const platform::DeviceContext* dev_ctx = nullptr);
+ double CpuElapsedUs(const Event& e) const;
+ double CudaElapsedUs(const Event& e) const;
+
+ private:
+ EventKind kind_;
+ std::string name_;
+ uint32_t thread_id_;
+ int64_t cpu_ns_;
+#ifdef PADDLE_WITH_CUDA
+ cudaEvent_t event_ = nullptr;
+ int device_ = -1;
+#endif
+};
+
+struct EventList {
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+```
+
+As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler.
+
+```c++
+enum ProfilerState {
+ kDisabled,
+ kCPU,
+ kCUDA
+};
+ProfilerState g_state;
+```
+- kDisabled: the disabled state.
+- kCPU: CPU profiling state.
+- kCUDA: GPU profiling state.
+
+A pair of starting and ending events is pushed to the event lists in the constructor and destructor of `RecordEvent`, so the timeline is recorded for the code within the lifecycle of a `RecordEvent` object.
+
+```c++
+struct RecordEvent {
+ explicit RecordEvent(const std::string name,
+ platform::DeviceContext* dev_ctx = nullptr) {
+    if (g_state == ProfilerState::kDisabled) return;
+ // push the starting event to the event lists.
+ }
+ ~RecordEvent() {
+    if (g_state == ProfilerState::kDisabled) return;
+ // push the ending event to the event lists.
+ }
+};
+```
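+
+A usage sketch (the function and range names below are illustrative): wrapping a piece of code in a `RecordEvent` attributes its elapsed time to that named range whenever the profiler is enabled.
+
+```c++
+// Sketch only: time one span of work under the name "elementwise_add".
+void RunOneOperator(platform::DeviceContext* dev_ctx) {
+  platform::RecordEvent record("elementwise_add", dev_ctx);
+  // ... launch the computation; its CPU (and CUDA, if a CUDA DeviceContext
+  // is passed) time is attributed to this range until `record` is destroyed.
+}
+```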
diff --git a/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle b/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle
new file mode 100644
index 0000000000..cb5bc420ce
Binary files /dev/null and b/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle differ
diff --git a/doc/fluid/howto/third_party/images/multigpu_allreduce.png b/doc/fluid/howto/third_party/images/multigpu_allreduce.png
new file mode 100644
index 0000000000..87a1b3e8f6
Binary files /dev/null and b/doc/fluid/howto/third_party/images/multigpu_allreduce.png differ
diff --git a/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle b/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle
new file mode 100644
index 0000000000..6c35ab1b21
Binary files /dev/null and b/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle differ
diff --git a/doc/fluid/howto/third_party/images/multigpu_before_convert.png b/doc/fluid/howto/third_party/images/multigpu_before_convert.png
new file mode 100644
index 0000000000..9c8f771116
Binary files /dev/null and b/doc/fluid/howto/third_party/images/multigpu_before_convert.png differ
diff --git a/doc/fluid/howto/third_party/mkldnn_fluid.md b/doc/fluid/howto/third_party/mkldnn_fluid.md
new file mode 100644
index 0000000000..bef126f3f0
--- /dev/null
+++ b/doc/fluid/howto/third_party/mkldnn_fluid.md
@@ -0,0 +1,149 @@
+# Design Doc: Add MKLDNN Kernel in Fluid Operator
+
+## Principles
+
+First of all, we should follow some basic principles like:
+1. [How to write a new operator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md). We are trying to add a new kind of kernel into operators, so basically we should follow this doc.
+2. [Supporting new Device/Library](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md). Since MKLDNN is a new library to fluid, we should add `MKLDNNDeviceContext` and maybe `mkldnn_helper.h`, just like [cudnn_helper.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/cudnn_helper.h).
+3. [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). Another important point is that we should ensure the data synchronization between different kernel types, which is this [topic](https://github.com/PaddlePaddle/Paddle/issues/6549). So basically we should override `GetExpectedKernelType` and `trans` functions to support switching kernels.
+4. [The Keys of Operator Kernel Type](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). Kernel Type is a pivotal conception which can record the `Place`, `Library`, `DataType` and `Layout`.
+
+## Solution
+
+In general, there are four steps we should follow to run an MKL-DNN primitive:
+- Create a primitive descriptor that describes this operator
+- Create the primitive itself from the primitive descriptor and the engine
+- Create all memory buffers that the primitive needs
+- Launch a stream to execute the created primitive
+
+For more details, please refer to [the MKL-DNN documentation](http://01org.github.io/mkl-dnn).
+
+It's better to avoid re-initializing the primitives and memory handles of the first three stages in every iteration. So we plan to create a map to record all the `primitive`s and `memory` handles, which should not take too much memory as discussed [here](https://github.com/PaddlePaddle/Paddle/issues/6822).
+
+It's assumed that the following three conditions are satisfied:
+1. There is a unique key for each operator instance; it may be the actual name of the `Output Tensor`.
+2. The `Input Tensor` inside the `Compute` function is the one after conversion.
+3. We can get the phase (e.g. `is_test`) inside the `Compute` function; otherwise we need to expose this attribute to users.
+
+### Compute
+The algorithm of `Compute` is described as follows; let's take conv as an example.
+
+```c++
+
+ PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace.");
+ PADDLE_ENFORCE(platform::is_mkldnn_library(ctx.GetLibrary()), "It must use MKLDNN Library.");
+
+ auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
+
+ // find primitive by unique key from mkldnn context
+ // the op_key should be a unique name of this op instance
+ auto& p = dev_ctx.findPrimitive(op_key + "_fwd");
+
+ // assuming the input tensor inside this compute function has already been converted;
+ // this should be guaranteed by another mechanism
+ auto& i = dev_ctx.findMemory(op_key + "_input");
+
+ if (p == nullptr || i == nullptr || inputSizeChanged(p, i)) {
+ auto fwd_primitive_desc = createPrimitiveDesc(ctx);
+ auto* input = ctx.Input<Tensor>("Input");
+ auto* filter = ctx.Input<Tensor>("Filter");
+ auto* output = ctx.Output<Tensor>("Output");
+ shared_ptr<mkldnn::memory> in(new mkldnn::memory(fwd_primitive_desc->src_primitive_desc(), input->data<T>()));
+ shared_ptr<mkldnn::memory> wgt(new mkldnn::memory(fwd_primitive_desc->weights_primitive_desc(), filter->data<T>()));
+ shared_ptr<mkldnn::memory> out(new mkldnn::memory(fwd_primitive_desc->dst_primitive_desc(), output->mutable_data<T>(ctx.GetPlace())));
+ shared_ptr<mkldnn::conv_fwd> fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out));
+
+ dev_ctx.addMemory(op_key+"_input", in);
+ dev_ctx.addMemory(op_key+"_output", out);
+ dev_ctx.addMemory(op_key+"_filter", wgt);
+ dev_ctx.addPrimitive(op_key+"_fwd", fwd_primitive);
+ dev_ctx.addPrimitiveDesc(op_key+"_fwd_PD", fwd_primitive_desc);
+ }
+
+ p = dev_ctx.findPrimitive(op_key + "_fwd");
+
+ PADDLE_ENFORCE(p, "Should have forward Primitive");
+ PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_input"), "Should have input memory");
+ PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_output"), "Should have output memory");
+ PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_filter"), "Should have filter memory");
+ PADDLE_ENFORCE(dev_ctx.findPrimitiveDesc(op_key+"_fwd_PD"), "Should have forward PrimitiveDesc");
+ dev_ctx.submit(p);
+ dev_ctx.execute(); // the convert primitive should already be contained here.
+
+```
+
+The `createPrimitiveDesc` function returns the primitive descriptor of this operator and would look like this:
+```c++
+ auto* input = ctx.Input<Tensor>("Input");
+ auto* filter = ctx.Input<Tensor>("Filter");
+ auto* output = ctx.Output<Tensor>("Output");
+ std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+ std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+ std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+ int groups = ctx.Attr<int>("groups");
+ algorithm algo = static_cast<algorithm>(ctx.Attr<int>("convolution_algorithm_option"));
+ prop_kind pk = ctx.Attr<bool>("is_test") ? prop_kind::forward_inference : prop_kind::forward_training;
+
+ auto fwd_desc = mkldnn::conv_fwd::desc(/* all the settings above */);
+ shared_ptr<mkldnn::conv_fwd::primitive_desc> fwd_primitive_desc(new mkldnn::conv_fwd::primitive_desc(fwd_desc, ctx.getEngine()));
+
+ return fwd_primitive_desc;
+ }
+```
+
+### MKLDNNDeviceContext
+`MKLDNNDeviceContext` is very straightforward. It should contain some basic information such as the `stream`, the `engine` and the maps mentioned above; a minimal sketch is given below.
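+
+The following is only an illustrative sketch: the member and method names mirror the calls used in the pseudo code above, and `std::shared_ptr<void>` is a placeholder for the real MKL-DNN primitive/memory handle types.
+
+```c++
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+// Sketch of MKLDNNDeviceContext (illustrative only). The real class would
+// also hold the mkldnn::engine and mkldnn::stream created once per device.
+class MKLDNNDeviceContext {
+ public:
+  using Handle = std::shared_ptr<void>;  // placeholder for primitive/memory handles
+
+  Handle findPrimitive(const std::string& key) const { return Find(primitives_, key); }
+  Handle findMemory(const std::string& key) const { return Find(memories_, key); }
+
+  void addPrimitive(const std::string& key, Handle p) { primitives_[key] = std::move(p); }
+  void addMemory(const std::string& key, Handle m) { memories_[key] = std::move(m); }
+
+ private:
+  static Handle Find(const std::unordered_map<std::string, Handle>& map,
+                     const std::string& key) {
+    auto it = map.find(key);
+    return it == map.end() ? Handle() : it->second;
+  }
+
+  std::unordered_map<std::string, Handle> primitives_;
+  std::unordered_map<std::string, Handle> memories_;
+};
+```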
+
+
+### mkldnn_helper
+Some helper functions would be put in `paddle/platform/mkldnn_helper.h` (a small example follows the list below):
+- create MKLDNN memories
+- create MKLDNN primitives
+- error check function
+- etc
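+
+For instance, the unique-key helper assumed in the pseudo code above could live here. The function below is purely hypothetical (it is not in the codebase); it only illustrates one way to derive a cache key from the output name and the input shape so that cached primitives are rebuilt when the input size changes.
+
+```c++
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Hypothetical helper: build the unique op_key used to cache primitives and
+// memories in MKLDNNDeviceContext.
+inline std::string GetMKLDNNOpKey(const std::string& output_name,
+                                  const std::vector<int>& input_dims) {
+  std::ostringstream key;
+  key << output_name;
+  for (int d : input_dims) key << "_" << d;
+  return key.str();
+}
+```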
+
+
+### Kernel Switch
+We should `reorder` data when its layout comes from, or goes to, another device or library. The `GetExpectedKernelType` and `trans` functions can help us implement this.
+
+`GetExpectedKernelType` receives the context, from which this operator can return the best `KernelType`; a hedged sketch of what the MKLDNN variant might return is shown below.
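+
+This sketch assumes an `OpKernelType` constructor taking a data type, a place, a layout and a library type, plus `DataLayout::kMKLDNN` / `LibraryType::kMKLDNN` enum values; the exact names and signatures in the codebase may differ.
+
+```c++
+// Illustrative only: prefer the MKLDNN kernel when the input is on CPU.
+framework::OpKernelType GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const override {
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<Tensor>("Input")->type()),  // data type
+      ctx.GetPlace(),                                             // CPUPlace
+      framework::DataLayout::kMKLDNN,                             // layout
+      framework::LibraryType::kMKLDNN);                           // library
+}
+```
+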
+`trans` would be like this:
+
+```c++
+void trans(inputs, ctx) override {
+ if (NoNeedTrans()) {
+ return;
+ }
+ // find reorder primitive by op_key from context
+ auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
+ auto& p = dev_ctx.findPrimitive(op_key + "_reorder_input");
+ auto& i = dev_ctx.findMemory(op_key + "_src_input");
+
+ if (p == nullptr || i == nullptr || changeSized(i, input)) {
+ auto prim = createPrimitiveDesc(ctx);
+ auto src = createMemory(memoryDesc(input->dims(), actual_layout), input->data);
+ auto newbuffer = paddle::memory::Alloc(ctx.GetPlace(), input->size_in_bytes());
+ auto dst = createMemory(p->expected_desc(), newbuffer->data);
+ auto reorder_primitive(new mkldnn::reorder(src, dst));
+
+ dev_ctx.addMemory(op_key+"_src_input", src);
+ dev_ctx.addMemory(op_key+"_input", dst);
+ dev_ctx.addPrimitive(op_key+"_reorder_input", reorder_primitive);
+ }
+
+ p = dev_ctx.findPrimitive(op_key + "_reorder_input");
+ PADDLE_ENFORCE(p, "Should have Reorder Primitive");
+ dev_ctx.submit(p);
+ if (! this->isMKLDNNKernel()) {
+ // execute immediately only if this is not an MKLDNN kernel function;
+ // otherwise, it can be executed together with the operator primitive in Compute
+ dev_ctx.stream();
+ }
+ // after submit, the input tensor in the ExecutionContext should be changed to the converted one;
+ // another mechanism should ensure this
+}
+```
+
+### Unit Test
+All the functions above should be tested accordingly.
+TBD
diff --git a/doc/fluid/howto/third_party/paddle_nccl.md b/doc/fluid/howto/third_party/paddle_nccl.md
new file mode 100644
index 0000000000..c7dac70998
--- /dev/null
+++ b/doc/fluid/howto/third_party/paddle_nccl.md
@@ -0,0 +1,65 @@
+# Design Doc: NCCL support in Paddle Fluid
+
+## Abstract
+
+This design doc covers the NCCL support in Paddle. We propose an approach to support the NCCL library both on a single machine and across multiple machines. We wrap the NCCL primitives `Broadcast`, `Allreduce` and `Reduce` as operators to utilize multi-GPU power from one script.
+
+
+## Motivation
+
+[NCCL](https://developer.nvidia.com/nccl) is an NVIDIA library that supports multi-GPU communication and is optimized for NVIDIA GPUs. It provides routines such as all-gather, all-reduce, broadcast, reduce and reduce-scatter that achieve high bandwidth over PCIe and the NVLink high-speed interconnect. With the NCCL library, we can easily accelerate training in parallel.
+
+- Pros
+1. Easy to plug in the [NCCL2](https://developer.nvidia.com/nccl) library.
+1. High performance on NVIDIA GPUs.
+1. MPI-like primitives with a low learning cost for users.
+
+- Cons
+1. Designed only for NVIDIA GPUs, not a general multi-device solution.
+1. Although NCCL1 is open-sourced under the BSD license, NCCL2 is no longer open source.
+
+At the beginning of training, the framework needs to distribute the same parameters to every GPU, and merge the gradients whenever the user requires.
+
+As a result, during training we need peer-to-peer copies between different GPUs, aggregation of gradients/parameters from GPUs, and broadcasting of parameters to GPUs. Every GPU only needs to run the operator with the correct place information.
+
+Besides, we need interfaces to synchronize model updates across the different GPU cards.
+
+## Implementation
+
+As mentioned above, we wrap the NCCL routines as several kinds of operators. Note that NCCL needs to create a communicator between the GPUs at the beginning, so an `NCCLInit` operator is created; a minimal sketch of what it has to do is shown below.
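+
+The snippet below is only a sketch of the underlying NCCL calls an `NCCLInit`-style operator would issue; the device list and the surrounding operator plumbing are illustrative assumptions.
+
+```c++
+#include <vector>
+#include "nccl.h"
+
+int main() {
+  int devs[] = {0, 1, 2, 3};               // GPU ids used by this trainer (example)
+  const int n = sizeof(devs) / sizeof(int);
+
+  // One communicator per device; later Broadcast/AllReduce operators reuse them.
+  std::vector<ncclComm_t> comms(n);
+  ncclCommInitAll(comms.data(), n, devs);  // error handling omitted for brevity
+
+  // ... run Broadcast / AllReduce operators with these communicators ...
+
+  for (auto& c : comms) ncclCommDestroy(c);
+  return 0;
+}
+```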
+
+### Transpiler
+
+To be compatible with the [parameter server design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md), the transpiler compiles the user-defined operation graph into sub-graphs to be executed on different devices.
+
+1. The user-defined model will be a single device program
+
+2. Broadcast/Reduce operators between GPUs will be inserted into the program; for the multi-node case, `Send` and `Recv` operators may also be inserted.
+
+ *Broadcast, AllReduce in a single machine. And Broadcast, AllReduce, [Send, Recv](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md#graph-converter) in multiple machines*
+
+
+
+After compiling, the graph is as follows:
+
+
+
+Operators are added to the sub-graphs. Every GPU is assigned a role such as `rank0`, `rank1`, etc.
+
+- **Broadcast**. The Broadcast operator distributes the initialized parameters to all the GPUs from the GPU that owns them, e.g. from the `rank0` GPU.
+- **AllReduce**. The AllReduce operator synchronizes parameters/gradients between GPUs. AllReduce is implemented with the ring-based communication method, avoiding the bottleneck of a single GPU.
+
+Note that the AllReduce operator forces the GPUs to synchronize at that point. Whether the whole training process runs in asynchronous or synchronous mode depends on where the AllReduce points are placed in the graph.
+
+As shown in the picture, each GPU computes the gradient of `W`; the following `AllReduce` operator accumulates `dW` over the full batch of data, and then each GPU runs the optimization process individually and applies the gradient to its own `W`.
+
+- **AllReduce**
+ Note that our AllReduce operator is a ring-based AllReduce implementation. If we used the NCCL2 AllReduce primitive, every GPU would optimize the full batch of data, wasting (n-1) GPUs' compute resources. In addition, the NCCL2 built-in AllReduce only utilizes the communication resources during synchronization, and updating the gradients becomes a subsequent phase. In fact, we can amortize the gradient-update time into the communication phase. The process is:
+1. Every parameter has its root card. That card is responsible for aggregating the gradients from the GPUs.
+2. The whole model's parameters are hashed to different root cards to ensure load balance between GPUs.
+3. Each card sends its partial gradient to its logical neighbor. After one round, the parameter's root card has aggregated the full gradient.
+4. Then the root card optimizes the parameter.
+5. The root card sends its optimized result to its neighbor, and the neighbor forwards it to the next card.
+6. The synchronization round finishes.
+
+The total time cost is 2 * (n-1) * per-parameter-send-time, so we reach the goal of amortizing the update time into the communication phase. A small self-contained sketch of this scheme follows.
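+
+The program below is only an illustration of the scheme described above (the parameter names, GPU count and unit send time are made up): it hashes each parameter to a root card for load balance and prints the estimated 2 * (n-1) cost per parameter.
+
+```c++
+#include <cstdio>
+#include <functional>
+#include <string>
+#include <vector>
+
+int main() {
+  const int num_gpus = 4;
+  const double per_param_send_time = 1.0;  // arbitrary time unit
+  std::vector<std::string> params = {"fc_0.w", "fc_0.b", "conv_1.w", "conv_1.b"};
+
+  for (const auto& p : params) {
+    // Hash every parameter to a root card (step 2 above).
+    int root = static_cast<int>(std::hash<std::string>{}(p) % num_gpus);
+    std::printf("parameter %s -> root GPU %d\n", p.c_str(), root);
+  }
+
+  // One ring pass to aggregate gradients plus one ring pass to broadcast the
+  // optimized parameter: 2 * (n - 1) hops per parameter (steps 3-6 above).
+  double cost = 2 * (num_gpus - 1) * per_param_send_time;
+  std::printf("estimated cost per parameter: %.1f units\n", cost);
+  return 0;
+}
+```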
diff --git a/doc/fluid/index_cn.rst b/doc/fluid/index_cn.rst
new file mode 100644
index 0000000000..be3bed4393
--- /dev/null
+++ b/doc/fluid/index_cn.rst
@@ -0,0 +1,12 @@
+ PaddlePaddle Fluid
+==========================
+
+.. toctree::
+ :maxdepth: 1
+
+ getstarted/index_cn.rst
+ design/index_cn.rst
+ build_and_install/index_cn.rst
+ howto/index_cn.rst
+ dev/index_cn.rst
+ faq/index_cn.rst
diff --git a/doc/fluid/index_en.rst b/doc/fluid/index_en.rst
new file mode 100644
index 0000000000..87c831420a
--- /dev/null
+++ b/doc/fluid/index_en.rst
@@ -0,0 +1,12 @@
+ PaddlePaddle Fluid
+==========================
+
+.. toctree::
+ :maxdepth: 1
+
+ getstarted/index_en.rst
+ design/index_en.rst
+ build_and_install/index_en.rst
+ howto/index_en.rst
+ dev/index_en.rst
+ faq/index_en.rst
diff --git a/doc/fluid/read_source.md b/doc/fluid/read_source.md
new file mode 100644
index 0000000000..bb6d4563f5
--- /dev/null
+++ b/doc/fluid/read_source.md
@@ -0,0 +1,67 @@
+# PaddlePaddle Fluid Source Code Overview
+
+Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book
+
+Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework
+
+Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators
+
+Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory
+
+Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform
+
+# Compile Time
+
+The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto).
+
+```python
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+```
+
+- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#)
+- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers)
+ - Every Layer has one or more operators and variables/parameters
+ - All the operators are defined at [`paddle/fluid/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators). Other files worth looking at:
+ - Base class: [`paddle/fluid/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h)
+ - Operator Registration: [`paddle/fluid/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_registry.h)
+ - Operator Lookup: [`paddle/fluid/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_info.h)
+- Optimizer: `fluid.optimizer.SGD`. It does the following
+ - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)]
+ - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)]
+
+# Run Time
+
+The following **evaluates** the NN. It instantiates all the variables and operators.
+
+```python
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+# Allocate memory. Initialize Parameter.
+exe.run(fluid.default_startup_program())
+
+# Allocate memory. Do computation.
+exe.run(fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost])
+```
+
+- Place: `place`, one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h)
+ - The device handles are at [paddle/fluid/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h)
+- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.cc)]
+ - Feeds the data: `feed=feeder.feed(data)`
+ - Evaluates all the operators
+ - Fetches the result: `fetch_list=[avg_cost]`
+- Other files worth looking at:
+ - Scope: [paddle/fluid/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/scope.h). Where all the variables live
+ - Variable: [paddle/fluid/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h). Where all the data (most likely tensors) live
+ - Tensor: [paddle/fluid/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h). Where we allocate memory through [`paddle/fluid/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory)
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst
deleted file mode 100644
index c875c807b8..0000000000
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ /dev/null
@@ -1,141 +0,0 @@
-从源码编译
-======================
-
-.. _build_step:
-
-编译方法
-----------------
-
-PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译工具。
-我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像
-可以在 `这里 `_ 找到。
-
-如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
-
-编译PaddlePaddle,需要执行:
-
-.. code-block:: bash
-
- git clone https://github.com/PaddlePaddle/Paddle.git
- cd Paddle
- # 如果使用Docker编译环境,执行下面的命令编译CPU-Only的二进制
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
- # 如果不使用Docker编译环境,执行下面的命令
- mkdir build
- cd build
- cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
- make
-
-编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装:
-
-.. code-block:: bash
-
- pip install build/python/dist/*.whl
-
-
-.. _run_test:
-
-执行单元测试
-----------------
-
-如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法:
-
-使用Docker的情况下,设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。
-开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
-
-.. code-block:: bash
-
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
-
-如果不使用Docker,可以执行ctest命令即可:
-
-.. code-block:: bash
-
- mkdir build
- cd build
- cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
- make
- ctest
- # 指定执行其中一个单元测试 test_mul_op
- ctest -R test_mul_op
-
-.. _compile_deps:
-
-编译依赖
-----------------
-
-PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。
-
-.. csv-table:: PaddlePaddle编译依赖
- :header: "依赖", "版本", "说明"
- :widths: 10, 15, 30
-
- "CMake", ">=3.5", ""
- "GCC", "4.8.2", "推荐使用CentOS的devtools2"
- "Python", "2.7.x", "依赖libpython2.7.so"
- "pip", ">=9.0", ""
- "numpy", "", ""
- "SWIG", ">=2.0", ""
- "Go", ">=1.8", "可选"
-
-
-.. _build_options:
-
-编译选项
-----------------
-
-PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。
-用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考
-`官方文档 `_ 。
-
-在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如:
-
-.. code-block:: bash
-
- cmake .. -DWITH_GPU=OFF
-
-.. csv-table:: 编译选项说明
- :header: "选项", "说明", "默认值"
- :widths: 1, 7, 2
-
- "WITH_GPU", "是否支持GPU", "ON"
- "WITH_C_API", "是否仅编译CAPI", "OFF"
- "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
- "WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON"
- "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
- "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
- "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
- "WITH_TESTING", "是否开启单元测试", "ON"
- "WITH_DOC", "是否编译中英文文档", "OFF"
- "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto"
- "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
- "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON"
-
-BLAS
-+++++
-
-PaddlePaddle支持 `MKL `_ 和
-`OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集,
-还会下载MKL-DNN数学库,详细参考 `这里 `_ 。
-
-如果关闭MKL,则会使用OpenBLAS作为BLAS库。
-
-CUDA/cuDNN
-+++++++++++
-
-PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
-使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。
-
-PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。
-我们推荐使用最新版本的cuDNN。
-
-编译选项的设置
-++++++++++++++
-
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如
-
-.. code-block:: bash
-
- cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
-
-**注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(** :code:`rm -rf` )**后,再指定。**
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
deleted file mode 100644
index f194f84ce7..0000000000
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ /dev/null
@@ -1,159 +0,0 @@
-Build from Sources
-==========================
-
-.. _build_step:
-
-How To Build
-----------------
-
-PaddlePaddle mainly uses `CMake `_ and GCC, G++ as compile
-tools. We recommend you to use our pre-built Docker image to run the build
-to avoid installing dependencies by yourself. We have several build environment
-Docker images `here `_ .
-
-If you choose not to use Docker image for your build, you need to install the
-below `Compile Dependencies`_ before run the build.
-
-Then run:
-
-.. code-block:: bash
-
- git clone https://github.com/PaddlePaddle/Paddle.git
- cd Paddle
- # run the following command to build a CPU-Only binaries if you are using docker
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
- # else run these commands
- mkdir build
- cd build
- cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
- make
-
-When the compile finishes, you can get the output whl package under
-build/python/dist, then you can choose to install the whl on local
-machine or copy it to the target machine.
-
-.. code-block:: bash
-
- pip install build/python/dist/*.whl
-
-
-.. _run_test:
-
-Run Tests
-----------------
-
-If you wish to run the tests, you may follow the below steps:
-
-When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build.
-Set :code:`WITH_GPU=ON` Can also run tests on GPU.
-
-.. code-block:: bash
-
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
-
-If you don't use Docker, just run ctest will start the tests:
-
-.. code-block:: bash
-
- mkdir build
- cd build
- cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
- make
- ctest
- # run a single test like test_mul_op
- ctest -R test_mul_op
-
-
-.. _compile_deps:
-
-Compile Dependencies
-----------------
-
-PaddlePaddle need the following dependencies when compiling, other dependencies
-will be downloaded automatically.
-
-.. csv-table:: PaddlePaddle Compile Dependencies
- :header: "Dependency", "Version", "Description"
- :widths: 10, 15, 30
-
- "CMake", ">=3.5", ""
- "GCC", "4.8.2", "Recommend devtools2 for CentOS"
- "Python", "2.7.x", "Need libpython2.7.so"
- "pip", ">=9.0", ""
- "numpy", "", ""
- "SWIG", ">=2.0", ""
- "Go", ">=1.8", "Optional"
-
-
-.. _build_options:
-
-Build Options
-----------------
-
-Build options include whether build binaries for CPU or GPU, which BLAS
-library to use etc. You may pass these settings when running cmake.
-For detailed cmake tutorial please refer to `here `_ 。
-
-.. _build_options_bool:
-
-Bool Type Options
-----------------
-
-You can add :code:`-D` argument to pass such options, like:
-
-.. code-block:: bash
-
- cmake .. -DWITH_GPU=OFF
-
-.. csv-table:: Bool Type Options
- :header: "Option", "Description", "Default"
- :widths: 1, 7, 2
-
- "WITH_GPU", "Build with GPU support", "ON"
- "WITH_C_API", "Build only CAPI", "OFF"
- "WITH_DOUBLE", "Build with double precision", "OFF"
- "WITH_DSO", "Dynamically load CUDA libraries", "ON"
- "WITH_AVX", "Build with AVX support", "ON"
- "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
- "WITH_STYLE_CHECK", "Check code style when building", "ON"
- "WITH_TESTING", "Build unit tests", "ON"
- "WITH_DOC", "Build documentations", "OFF"
- "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
- "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
- "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
-
-
-BLAS
-+++++
-
-PaddlePaddle supports `MKL `_ and
-`OpenBlAS `_ as BLAS library。By default it uses MKL.
-If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded
-and used, for more `details `_ .
-
-If you choose not to use MKL, then OpenBlAS will be used.
-
-CUDA/cuDNN
-+++++++++++
-
-PaddlePaddle will automatically find CUDA and cuDNN when compiling and running.
-parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture
-automatically in order to speed up the build.
-
-PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to
-keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN
-you built.
-
-Pass Compile Options
-++++++++++++++
-
-You can pass compile options to use intended BLAS/CUDA/Cudnn libraries.
-When running cmake command, it will search system paths like
-:code:`/usr/lib:/usr/local/lib` and then search paths that you
-passed to cmake, i.e.
-
-.. code-block:: bash
-
- cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
-
-**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.**
diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst
deleted file mode 100644
index c9ba84c842..0000000000
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-安装与编译
-==========
-
-.. _install_steps:
-
-安装流程
-++++++++
-
-PaddlePaddle提供pip和Docker的安装方式:
-
-.. toctree::
- :maxdepth: 1
-
- pip_install_cn.rst
- docker_install_cn.rst
- ../../howto/dev/build_cn.md
-
-编译流程
-++++++++
-
-.. warning::
-
- 建议直接使用上述安装流程,方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。
-
-.. toctree::
- :maxdepth: 1
-
- build_from_source_cn.rst
-
-常见问题解答
-++++++++++
-
-`常见问题解答 `_
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst
deleted file mode 100644
index 9ecab5594c..0000000000
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-RNN相关模型
-===========
-
-.. toctree::
- :maxdepth: 1
-
- rnn_config_cn.rst
- recurrent_group_cn.md
- hierarchical_layer_cn.rst
- hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/deep_model/rnn/index_en.rst
deleted file mode 100644
index 7adc79873d..0000000000
--- a/doc/howto/deep_model/rnn/index_en.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-RNN Models
-==========
-
-.. toctree::
- :maxdepth: 1
-
- rnn_config_en.rst
diff --git a/doc/howto/dev/build_cn.md b/doc/howto/dev/build_cn.md
deleted file mode 100644
index 4a80a52451..0000000000
--- a/doc/howto/dev/build_cn.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# 用Docker编译和测试PaddlePaddle
-
-## 需要的软硬件
-
-为了开发PaddlePaddle,我们需要
-
-1. 一台电脑,可以装的是 Linux, BSD, Windows 或者 MacOS 操作系统,以及
-1. Docker。
-
-不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker image 里。
-
-## 总体流程
-
-1. 获取源码
-
- ```bash
- git clone https://github.com/paddlepaddle/paddle
- ```
-
-2. 安装开发工具到 Docker image 里
-
- ```bash
- cd paddle; docker build -t paddle:dev .
- ```
-
- 请注意这个命令结尾处的 `.`;它表示 `docker build` 应该读取当前目录下的 [`Dockerfile`文件](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile),按照其内容创建一个名为 `paddle:dev` 的 Docker image,并且把各种开发工具安装进去。
-
-3. 编译
-
- 以下命令启动一个 Docker container 来执行 `paddle:dev` 这个 Docker image,同时把当前目录(源码树根目录)映射为 container 里的 `/paddle` 目录,并且运行 `Dockerfile` 描述的默认入口程序 [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `cmake` 和 `make` 来编译 `/paddle` 里的源码,结果输出到 `/paddle/build`,也就是本地的源码树根目录里的 `build` 子目录。
-
- ```bash
- docker run --rm -v $PWD:/paddle paddle:dev
- ```
-
- 上述命令编译出一个 CUDA-enabled 版本。如果我们只需要编译一个只支持 CPU 的版本,可以用
-
- ```bash
- docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
- ```
-
-4. 运行单元测试
-
- 用本机的第一个 GPU 来运行包括 GPU 单元测试在内的所有单元测试:
-
- ```bash
- NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
- ```
-
- 如果编译的时候我们用了 `WITH_GPU=OFF` 选项,那么编译过程只会产生 CPU-based 单元测试,那么我们也就不需要 nvidia-docker 来运行单元测试了。我们只需要:
-
- ```bash
- docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
- ```
-
- 有时候我们只想运行一个特定的单元测试,比如 `memory_test`,我们可以
-
- ```bash
- nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
- ```
-
-5. 清理
-
- 有时候我们会希望清理掉已经下载的第三方依赖以及已经编译的二进制文件。此时只需要:
-
- ```bash
- rm -rf build
- ```
-
-## 为什么要 Docker 呀?
-
-- 什么是 Docker?
-
- 如果您没有听说 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。
-
-- Docker 还是虚拟机?
-
- 有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
-
-- 为什么用 Docker?
-
- 把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。
-
- 另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。
-
-- 我可以选择不用Docker吗?
-
- 当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。
-
-- 学习 Docker 有多难?
-
- 理解 Docker 并不难,大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
-
-- 我可以用 IDE 吗?
-
- 当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
-
- 很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
-
- ```emacs
- (global-set-key "\C-cc" 'compile)
- (setq compile-command
- "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
- ```
-
- 就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
-
-- 可以并行编译吗?
-
- 是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
-
-## 可能碰到的问题
-
-- Docker 需要 sudo
-
- 如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术,比如 rkt。
-
-- 在 Windows/MacOS 上编译很慢
-
- Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
-
-- 磁盘不够
-
- 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
diff --git a/doc/howto/dev/build_en.md b/doc/howto/dev/build_en.md
deleted file mode 100644
index 91c41ef8ce..0000000000
--- a/doc/howto/dev/build_en.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# Build using Docker
-
-## What Developers Need
-
-To contribute to PaddlePaddle, you need
-
-1. A computer -- Linux, BSD, Windows, MacOS, and
-1. Docker.
-
-Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. We run all the tools by running this image.
-
-## General Process
-
-1. Retrieve source code.
-
- ```bash
- git clone https://github.com/paddlepaddle/paddle
- ```
-
-2. Install build tools into a Docker image.
-
- ```bash
- cd paddle; docker build -t paddle:dev .
- ```
-
- Please be aware of the `.` at the end of the command, which refers to the [`./Dockerfile` file](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile). `docker build` follows instructions in this file to create a Docker image named `paddle:dev`, and installs building tools into it.
-
-3. Build from source.
-
- This following command starts a Docker container that executes the Docker image `paddle:dev`, mapping the current directory to `/paddle/` in the container, and runs the default entry-point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockefile. `build.sh` invokes `cmake` and `make` to build PaddlePaddle source code, which had been mapped to `/paddle`, and writes outputs to `/paddle/build`, which maps to `build` in the current source directory on the computer.
-
- ```bash
- docker run -v $PWD:/paddle paddle:dev
- ```
-
- Above command builds a CUDA-enabled version. If we want to build a CPU-only version, we can type
-
- ```bash
- docker run -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
- ```
-
-4. Run unit tests.
-
- To run all unit tests using the first GPU of a node:
-
- ```bash
- NV_GPU=0 nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
- ```
-
- If we used `WITH_GPU=OFF` at build time, it generates only CPU-based unit tests, and we don't need nvidia-docker to run them. We can just run
-
- ```bash
- docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
- ```
-
- Sometimes we want to run a specific unit test, say `memory_test`, we can run
-
- ```bash
- nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
- ```
-
-5. Clean Build.
-
- Sometimes, we might want to clean all thirt-party dependents and built binaries. To do so, just
-
- ```bash
- rm -rf build
- ```
-
-## Docker, Or Not?
-
-- What is Docker?
-
- If you haven't heard of it, consider it something like Python's virtualenv.
-
-- Docker or virtual machine?
-
- Some people compare Docker with VMs, but Docker doesn't virtualize any hardware nor running a guest OS, which means there is no compromise on the performance.
-
-- Why Docker?
-
- Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
-
- Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
-
-- Can I choose not to use Docker?
-
- Sure, you don't have to install build tools into a Docker image; instead, you can install them in your local computer. This document exists because Docker would make the development way easier.
-
-- How difficult is it to learn Docker?
-
- It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools. Not even to mention the time saved when other people trying to reproduce the issue you have.
-
-- Can I use my favorite IDE?
-
- Yes, of course. The source code resides on your local computer, and you can edit it using whatever editor you like.
-
- Many PaddlePaddle developers are using Emacs. They add the following few lines into their `~/.emacs` configure file:
-
- ```emacs
- (global-set-key "\C-cc" 'compile)
- (setq compile-command
- "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
- ```
-
- so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
-
-- Does Docker do parallel building?
-
- Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
-
-## Some Gotchas
-
-- Docker requires sudo
-
- An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly. If you use a shared computer for development, please ask the administrator to install and configure Docker. We will do our best to support rkt, another container technology that doesn't require sudo.
-
-- Docker on Windows/MacOS builds slowly
-
- On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so to make the building efficient. Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
-
-- Not enough disk space
-
- Examples in this article uses option `--rm` with the `docker run` command. This option ensures that stopped containers do not exist on hard disks. We can use `docker ps -a` to list all containers, including stopped. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst
deleted file mode 100644
index 1bc947c260..0000000000
--- a/doc/howto/dev/write_docs_cn.rst
+++ /dev/null
@@ -1,111 +0,0 @@
-##################
-如何贡献/修改文档
-##################
-
-PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
-也可以利用PaddlePaddle 工具来编译文档,这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下
-
-如何构建文档
-============
-
-PaddlePaddle的文档构建有三种方式。
-
-
-使用PaddlePaddle.org工具
---------------
-这个是目前推荐的使用方法。除了可以自动编译文档,也可以直接在网页预览文档。
-
-文件工具是使用Docker,需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后及可用以下命令启动工具
-
-.. code-block:: bash
-
- mkdir paddlepaddle # Create paddlepaddle working directory
- cd paddlepaddle
-
- # Clone the content repositories
- git clone https://github.com/PaddlePaddle/Paddle.git
- git clone https://github.com/PaddlePaddle/book.git
- git clone https://github.com/PaddlePaddle/models.git
- git clone https://github.com/PaddlePaddle/Mobile.git
-
- # Please specify the working directory through -v
- docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
-
-注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
-之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档
-编译后的文件将被存储在工作目录 /.ppo_workspace/content。
-
-如果不想使用 Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
-
-.. code-block:: bash
-
- mkdir paddlepaddle # Create paddlepaddle working directory
- cd paddlepaddle
-
- # Clone the content repositories and PaddlePaddle.org
- git clone https://github.com/PaddlePaddle/Paddle.git
- git clone https://github.com/PaddlePaddle/book.git
- git clone https://github.com/PaddlePaddle/models.git
- git clone https://github.com/PaddlePaddle/Mobile.git
- git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
-
- # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
- export CONTENT_DIR=
- export ENV=''
- cd PaddlePaddle.org/portal/
- pip install -r requirements.txt
- python manage.py runserver
-
-工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。
-之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。
-编译后的文件将被存储在工作目录 /.ppo_workspace/content。
-
-想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 `_ 。
-
-使用Docker构建
---------------
-
-使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即
-
-.. code-block:: bash
-
- cd TO_YOUR_PADDLE_CLONE_PATH
- cd paddle/scripts/tools/build_docs
- sh build_docs.sh
-
-编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。
-打开浏览器访问对应目录下的index.html即可访问本地文档。
-
-直接构建
---------
-
-如果提示正确,可以执行以下命令编译生成文档,即
-
-.. code-block:: bash
-
- cd TO_YOUR_PADDLE_CLONE_PATH
- mkdir -p build
- cd build
- cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
- make gen_proto_py
- make paddle_docs paddle_docs_cn
-
-编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。
-打开浏览器访问对应目录下的index.html即可访问本地文档。
-
-
-如何书写文档
-============
-
-PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程进行书写。
-
-如何更新www.paddlepaddle.org
-============================
-
-更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 `_ 。
-目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和
-`英文文档 `_ 。
-
-
-.. _cmake: https://cmake.org/
-.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/howto/dev/write_docs_en.rst b/doc/howto/dev/write_docs_en.rst
deleted file mode 100644
index b3ef07eb1d..0000000000
--- a/doc/howto/dev/write_docs_en.rst
+++ /dev/null
@@ -1,80 +0,0 @@
-##################
-Contribute Documentation
-##################
-
-PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
-Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
-When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content
-
-How to Build Documentations
-============
-
-We recommend using PaddlePaddle.org tool to build documentation
-
-
-Use PaddlePaddle.org tool
---------------
-This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser.
-
-The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool
-
-.. code-block:: bash
-
- mkdir paddlepaddle # Create paddlepaddle working directory
- cd paddlepaddle
-
- # Clone the content repositories. You may only clone the contents you need
- git clone https://github.com/PaddlePaddle/Paddle.git
- git clone https://github.com/PaddlePaddle/book.git
- git clone https://github.com/PaddlePaddle/models.git
- git clone https://github.com/PaddlePaddle/Mobile.git
-
- # Please specify the working directory through -v
- docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
-
-Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command
-Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
-The compiled documentations will be stored in /.ppo_workspace/content
-
-
-If you don't wish to use Docker, you can also activate the tool through Django. Use the following the commands to set up
-
-.. code-block:: bash
-
- mkdir paddlepaddle # Create paddlepaddle working directory
- cd paddlepaddle
-
- # Clone the content repositories and PaddlePaddle.org
- git clone https://github.com/PaddlePaddle/Paddle.git
- git clone https://github.com/PaddlePaddle/book.git
- git clone https://github.com/PaddlePaddle/models.git
- git clone https://github.com/PaddlePaddle/Mobile.git
- git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
-
- # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
- export CONTENT_DIR=
- export ENV=''
- cd PaddlePaddle.org/portal/
- pip install -r requirements.txt
- python manage.py runserver
-
-Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
-The compiled documentations will be stored in /.ppo_workspace/content
-
-If you want to learn more on the PaddlePaddle.org, please `click here `_ 。
-
-How to write Documentations
-============
-
-PaddlePaddle uses `sphinx`_ to compile documentations,Please check sphinx official website for more detail.
-
-
-How to update www.paddlepaddle.org
-============================
-
-Please create PRs and submit them to github, please check `Contribute Code `_ 。
-PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs `_ and
-`English Docs `_ 。
-
-.. _cmake: https://cmake.org/
-.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
deleted file mode 100644
index 991b9e2596..0000000000
--- a/doc/howto/index_cn.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-进阶指南
-========
-
-使用说明
---------
-
-.. toctree::
- :maxdepth: 1
-
- usage/cmd_parameter/index_cn.rst
- usage/cluster/cluster_train_cn.md
- usage/k8s/k8s_basis_cn.md
- usage/k8s/k8s_cn.md
- usage/k8s/k8s_distributed_cn.md
-
-开发标准
---------
-
-.. toctree::
- :maxdepth: 1
-
- dev/contribute_to_paddle_cn.md
- dev/write_docs_cn.rst
-
-模型配置
---------
-
-.. toctree::
- :maxdepth: 1
-
- deep_model/rnn/index_cn.rst
-
-性能优化
---------
-
-.. toctree::
- :maxdepth: 1
-
- optimization/gpu_profiling_cn.rst
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
deleted file mode 100644
index 61bf25ccd1..0000000000
--- a/doc/howto/index_en.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-HOW TO
-=======
-
-Usage
--------
-
-.. toctree::
- :maxdepth: 1
-
- usage/cmd_parameter/index_en.rst
- usage/cluster/cluster_train_en.md
- usage/k8s/k8s_en.md
- usage/k8s/k8s_aws_en.md
-
-Development
-------------
-
-.. toctree::
- :maxdepth: 1
-
- dev/new_layer_en.rst
- dev/contribute_to_paddle_en.md
- dev/write_docs_en.rst
-
-Configuration
--------------
-
-.. toctree::
- :maxdepth: 1
-
- deep_model/rnn/index_en.rst
-
-Optimization
--------------
-
-.. toctree::
- :maxdepth: 1
-
- optimization/gpu_profiling_en.rst
diff --git a/doc/howto/read_source.md b/doc/howto/read_source.md
deleted file mode 100644
index 383acb0c82..0000000000
--- a/doc/howto/read_source.md
+++ /dev/null
@@ -1,67 +0,0 @@
-# PaddlePaddle Fluid Source Code Overview
-
-Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book
-
-Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
-
-Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators
-
-Optimizer: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer
-
-Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory
-
-# Compile Time
-
-The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).
-
-```python
-x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-y_predict = fluid.layers.fc(input=x, size=1, act=None)
-cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-avg_cost = fluid.layers.mean(x=cost)
-
-sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-sgd_optimizer.minimize(avg_cost)
-```
-
-- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#L93)
-- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/layers.py)
- - Every Layer has one or more operators and variables/parameters
- - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files:
- - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
- - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h)
- - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
-- Optimizer: `fluid.optimizer.SGD`. It does the following
- - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/backward.cc)]
- - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py), [C++](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer)]
-
-# Run Time
-
-The following **evaluates** the NN. Instantiates all the variables, operators.
-
-```python
-place = fluid.CPUPlace()
-feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-exe = fluid.Executor(place)
-
-# Allocate memory. Initialize Parameter.
-exe.run(fluid.default_startup_program())
-
-# Allocate memory. Do computation.
-exe.run(fluid.default_main_program(),
- feed=feeder.feed(data),
- fetch_list=[avg_cost])
-```
-
-- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h)
- - The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h)
-- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
- - Feeds the data: `feed=feeder.feed(data)`
- - Evaluates all the operators
- - Fetches the result: `fetch_list=[avg_cost]`
-- Other worth looking files:
- - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live
- - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live
- - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory)
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
deleted file mode 100644
index 2e98b3de3f..0000000000
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ /dev/null
@@ -1,285 +0,0 @@
-# PaddlePaddle分布式训练
-
-* [概述](#概述)
-* [环境准备](#环境准备)
-* [启动参数说明](#启动参数说明)
- * [启动参数服务器](#启动参数服务器)
- * [启动计算节点](#启动计算节点)
- * [准备数据集](#准备数据集)
- * [准备训练程序](#准备训练程序)
-* [使用分布式计算平台或工具](#使用分布式计算平台或工具)
- * [使用Fabric启动集群作业](#使用fabric启动集群作业)
- * [准备一个Linux集群](#准备一个linux集群)
- * [启动集群作业](#启动集群作业)
- * [终止集群作业](#终止集群作业)
- * [检查集群训练结果](#检查集群训练结果)
- * [检查模型输出](#检查模型输出)
- * [在OpenMPI集群中提交训练作业](#在openmpi集群中提交训练作业)
- * [准备OpenMPI集群](#准备OpenMPI集群)
- * [启动集群作业](#启动集群作业-1)
- * [在Kubernetes集群中提交训练作业](#在kubernetes集群中提交训练作业)
-
-## 概述
-本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示:
-
-
-
-- 数据分片(Data shard): 用于训练神经网络的数据,被切分成多个部分,每个部分分别给每个trainer使用。
-- 计算节点(Trainer): 每个trainer启动后读取切分好的一部分数据,开始神经网络的“前馈”和“后馈”计算,并和参数服务器通信。在完成一定量数据的训练后,上传计算得出的梯度(gradients),然后下载优化更新后的神经网络参数(parameters)。
-- 参数服务器(Parameter server):每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度,并完成参数优化更新,再将更新后的参数下发到每个计算节点。
-
-这样,通过计算节点和参数服务器的分布式协作,可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降(SGD)和异步随机梯度下降。
-
-在使用同步SGD训练神经网络时,PaddlePaddle使用同步屏障(barrier),使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中,则并不会等待所有trainer提交梯度才更新参数,这样极大地提高了计算的并行性:参数服务器之间不相互依赖,并行地接收梯度和更新参数,参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步,计算节点之间也不会相互依赖,并行地执行模型的训练。可以看出,虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新,在任意时间某一台参数服务器上保存的参数可能比另一台要更新,与同步SGD相比,梯度会有噪声。
-
-## 环境准备
-
-1. 准备您的计算集群。计算集群通常由一组(几台到几千台规模)的Linux服务器组成。服务器之间可以通过局域网(LAN)联通,每台服务器具有集群中唯一的IP地址(或者可被DNS解析的主机名)。集群中的每台计算机通常被成为一个“节点”。
-1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install)的多种安装方式。我们推荐使用[Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)安装方式来快速安装PaddlePaddle。
-
-安装完成之后,执行下面的命令可以查看已经安装的版本(docker安装方式可以进入docker容器执行:`docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
-```bash
-$ paddle version
-PaddlePaddle 0.10.0, compiled with
- with_avx: ON
- with_gpu: OFF
- with_double: OFF
- with_python: ON
- with_rdma: OFF
- with_timer: OFF
-```
-
-下面以`doc/howto/usage/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。
-
-## 启动参数说明
-### 启动参数服务器
-执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
-```bash
-$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
-```
-
-如果希望可以在后台运行pserver程序,并保存输出到一个日志文件,可以运行:
-```bash
-$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
-```
-
-| 参数 | 是否必选 | 默认值 | 说明 |
-| ------------- | ------------- | ------------- | ------------- |
-| port | 必选 | 7164 | pserver监听的起始端口,根据ports_num决定 总端口个数,从起始端口监听多个端口用于通信 |
-| ports_num | 必选 | 1 | 监听的端口个数 |
-| ports_num_for_sparse | 必选 | 1 | 用于稀疏类型参数通信的端口个数 |
-| num_gradient_servers | 必选 | 1 | 当前训练任务pserver总数 |
-
-### 启动计算节点
-执行以下命令启动使用python编写的trainer程序(文件名为任意文件名,如train.py)
-```bash
-$ python train.py
-```
-
-trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过环境变量(https://zh.wikipedia.org/wiki/环境变量 )或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。
-
-使用环境变量:
-
-```bash
-export PADDLE_INIT_USE_GPU=False
-export PADDLE_INIT_TRAINER_COUNT=1
-export PADDLE_INIT_PORT=7164
-export PADDLE_INIT_PORTS_NUM=1
-export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
-export PADDLE_INIT_NUM_GRADIENT_SERVERS=1
-export PADDLE_INIT_TRAINER_ID=0
-export PADDLE_INIT_PSERVERS=127.0.0.1
-```
-
-使用参数:
-
-```python
-paddle.init(
- use_gpu=False,
- trainer_count=1,
- port=7164,
- ports_num=1,
- ports_num_for_sparse=1,
- num_gradient_servers=1,
- trainer_id=0,
- pservers="127.0.0.1")
-```
-
-| 参数 | 是否必选 | 默认 | 说明 |
-| ------------- | ------------- | ------------- | ------------- |
-| use_gpu | 可选 | False | 是否启用GPU训练 |
-| trainer_count | 必选 | 1 | 当前训练任务trainer总个数 |
-| port | 必选 | 7164 | 连接到pserver的端口 |
-| ports_num | 必选 | 1 | 连接到pserver的端口个数 |
-| ports_num_for_sparse | 必选 | 1 | 和pserver之间用于稀疏类型参数通信的端口个数 |
-| num_gradient_servers | 必选 | 1 | 当前训练任务pserver总数 |
-| trainer_id | 必选 | 0 | 每个trainer的唯一ID,从0开始的整数 |
-| pservers | 必选 | 127.0.0.1 | 当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 |
-
-
-### 准备数据集
-
-参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py),准备训练数据和验证数据集,我们使用paddle.dataset.imikolov数据集,并根据分布式训练并发数(trainer节点个数),在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。
-
-在线上系统中,通常会使用MapReduce任务的输出结果作为训练结果,这样训练文件的个数会比较多,而且个数并不确定。在trainer中可以使用下面取模的方法为每个trainer分配训练数据文件:
-
-```python
-import os
-train_list = []
-flist = os.listdir("/train_data/")
-for f in flist:
- suffix = int(f.split("-")[1])
- if suffix % TRAINER_COUNT == TRAINER_ID:
- train_list.append(f)
-```
-
-示例程序`prepare.py`会把训练集和测试集分别分割成多个文件(例子中为3个,后缀为`-00000`、`-00001`和`-00002`):
-```
-train.txt
-train.txt-00000
-train.txt-00001
-train.txt-00002
-test.txt
-test.txt-00000
-test.txt-00001
-test.txt-00002
-```
-
-在进行分布式训练时,每个trainer进程需要能够读取属于自己的一份数据。在一些分布式系统中,系统会提供一个分布式存储服务,这样保存在分布式存储中的数据可以被集群中的每个节点读取到。如果不使用分布式存储,则需要手动拷贝属于每个trainer节点的训练数据到对应的节点上。
-
-对于不同的训练任务,训练数据格式和训练程序的`reader()`会大不相同,所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。
-
-### 准备训练程序
-
-我们会对每个训练任务都会在每个节点上创建一个工作空间(workspace),其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。
-
-最后,工作空间应如下所示:
-```
-.
-|-- my_lib.py
-|-- word_dict.pickle
-|-- train.py
-|-- train_data_dir/
-| |-- train.txt-00000
-| |-- train.txt-00001
-| |-- train.txt-00002
-`-- test_data_dir/
- |-- test.txt-00000
- |-- test.txt-00001
- `-- test.txt-00002
-```
-
-- `my_lib.py`:会被`train.py`调用的一些用户定义的库函数,比如PIL库等。
-- `word_dict.pickle`:在`train.py`中会使用到的字典数据文件。
-- `train.py`:训练程序,代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py)。***注意:*** 对于本样例代码,在使用不同的分布式计算平台时,您可能需要修改`train.py`开头的部分(如下),以便获得训练数据的位置和获取环境变量配置:
-
- ```python
- cluster_train_file = "./train_data_dir/train/train.txt"
- cluster_test_file = "./test_data_dir/test/test.txt"
- node_id = os.getenv("OMPI_COMM_WORLD_RANK")
- if not node_id:
- raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK")
- ```
-
-- `train_data_dir`:包含训练数据的目录,可以是从分布式存储挂载过来的,也可以是在任务启动前下载到本地的。
-- `test_data_dir`:包含测试数据集的目录。
-
-## 使用分布式计算平台或工具
-
-PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务,包括:
-- [Kubernetes](http://kubernetes.io) Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。
-- [OpenMPI](https://www.open-mpi.org) 成熟的高性能并行计算框架。
-- [Fabric](http://www.fabfile.org) 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
-
-对于不同的集群平台,会分别介绍集群作业的启动和停止方法。这些例子都可以在[cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2)找到。
-
-在使用分布式计算平台进行训练时,任务被调度在集群中时,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。
-
-### 使用Fabric启动集群作业
-
-#### 准备一个Linux集群
-可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下,执行`kubectl -f ssh_servers.yaml`启动一个测试集群,并使用`kubectl get po -o wide`获得这些节点的IP地址。
-
-#### 启动集群作业
-
-`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下,所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
-
-`paddle.py` 为方便作业启动提供了两个独特的命令选项。
-
-- `job_dispatch_package` 设为本地 `workspace` 目录,它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担,否则频繁的多节点工作空间部署可能会很麻烦。
-- `job_workspace` 设为已部署的工作空间目录,`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
-
-`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务,只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`,然后:
-```
-sh run.sh
-```
-
-集群作业将会在几秒后启动。
-
-#### 终止集群作业
-`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
-
-#### 检查集群训练结果
-详细信息请检查 $workspace/log 里的日志,每一个节点都有相同的日志结构。
-
-`paddle_trainer.INFO`
-提供几乎所有训练的内部输出日志,与本地训练相同。这里检验运行时间模型的收敛。
-
-`paddle_pserver2.INFO`
-提供 pserver 运行日志,有助于诊断分布式错误。
-
-`server.log`
-提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
-
-`train.log`
-提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
-
-#### 检查模型输出
-运行完成后,模型文件将被写入节点 0 的 `output` 目录中。
-工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
-
-### 在OpenMPI集群中提交训练作业
-
-#### 准备OpenMPI集群
-
-执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点:
-
-```bash
-paddle/scripts/cluster_train_v2/openmpi/docker_cluster
-kubectl create -f head.yaml
-kubectl create -f mpi-nodes.yaml
-```
-
-然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
-
-#### 启动集群作业
-
-您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务:
-
-```bash
-# 获得head和node节点的IP地址
-kubectl get po -o wide
-# 将node节点的IP地址保存到machines文件中
-kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
-# 拷贝必要的文件到head节点
-scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
-# ssh 登录到head节点
-ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
-# --------------- 以下操作均在head节点中执行 ---------------
-# 准备训练数据
-python prepare.py
-# 拷贝训练程序和字典文件到每台MPI节点
-cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
-# 创建日志目录
-mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
-# 拷贝训练数据到各自的节点
-scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
-scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
-scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
-# 启动训练任务
-mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh
-```
-
-### 在Kubernetes集群中提交训练作业
-
-此部分的使用方法可以参考[here](../k8s/k8s_distributed_cn.md)。
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
deleted file mode 100644
index baa97c0c02..0000000000
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ /dev/null
@@ -1,293 +0,0 @@
-# PaddlePaddle Distributed Training
-
-* [Introduction](#introduction)
-* [Preparations](#preparations)
-* [Command-line arguments](#command-line-arguments)
- * [Starting parameter server](#starting-parameter-server)
- * [Starting trainer](#starting-trainer)
- * [Prepare Training Dataset](#prepare-training-dataset)
- * [Prepare Training program](#prepare-training-program)
-* [Use cluster platforms or cluster management tools](#use-cluster-platforms-or-cluster-management-tools)
- * [Cluster Training Using Fabric](#cluster-training-using-fabric)
- * [Prepare a Linux cluster](#prepare-a-linux-cluster)
- * [Launching Cluster Job](#launching-cluster-job)
- * [Kill Cluster Job](#kill-cluster-job)
- * [Check Cluster Training Result](#check-cluster-training-result)
- * [Check Model Output](#check-model-output)
- * [Cluster Training Using OpenMPI](#cluster-training-using-openmpi)
- * [Prepare an OpenMPI cluster](#prepare-an-openmpi-cluster)
- * [Launching Cluster Job](#launching-cluster-job-1)
- * [Cluster Training Using Kubernetes](#cluster-training-using-kubernetes)
-
-## Introduction
-
-In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed training job:
-
-
-
-- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
-- Trainer: each trainer reads the data shard and trains the neural network. The trainer then uploads the calculated "gradients" to the parameter servers and waits for the parameters to be optimized on the parameter server side. When that finishes, the trainer downloads the optimized parameters and continues training.
-- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
-
-PaddlePaddle supports both synchronous stochastic gradient descent (SGD) and asynchronous SGD.
-
-When training with synchronous SGD, PaddlePaddle uses an internal "synchronization barrier" so that gradient upload and parameter download happen in strict order. Asynchronous SGD, on the other hand, does not wait for all trainers to finish uploading at each step, which increases the parallelism of distributed training: parameter servers do not depend on each other and optimize parameters concurrently, and since parameter servers do not wait for trainers, trainers also do their work concurrently. However, asynchronous SGD introduces more randomness and noise into the gradients.
-
-## Preparations
-1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
-2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install) document. We strongly recommend using [Docker installation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).
-
-After installation, you can check the version by typing the below command (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
-
-```bash
-$ paddle version
-PaddlePaddle 0.10.0rc, compiled with
- with_avx: ON
- with_gpu: OFF
- with_double: OFF
- with_python: ON
- with_rdma: OFF
- with_timer: OFF
-```
-
-We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
-
-## Command-line arguments
-
-### Starting parameter server
-
-Type the below command to start a parameter server which will wait for trainers to connect:
-
-```bash
-$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
-```
-
-If you wish to run parameter servers in the background and save their output to a log file, you can type:
-```bash
-$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
-```
-
-| param | required | default | description |
-| ------------- | ------------- | ------------- | ------------- |
-| port | required | 7164 | port which the parameter server listens on. If ports_num is greater than 1, the parameter server will listen on multiple ports for higher network throughput |
-| ports_num | required | 1 | total number of ports to listen on |
-| ports_num_for_sparse | required | 1 | number of ports that serve sparse parameter updates |
-| num_gradient_servers | required | 1 | total number of gradient servers |
-
-### Starting trainer
-Type the command below to start the trainer (name the file whatever you want, e.g. "train.py"):
-
-```bash
-$ python train.py
-```
-
-The trainers' network needs to be connected with the parameter servers' network to finish the job. Trainers need to know the ports and IP addresses to locate the parameter servers. You can pass these arguments to trainers through [environment variables](https://en.wikipedia.org/wiki/Environment_variable) or through the `paddle.init()` function. Arguments passed to `paddle.init()` overwrite the environment variables.
-
-Use environment variables:
-
-```bash
-export PADDLE_INIT_USE_GPU=False
-export PADDLE_INIT_TRAINER_COUNT=1
-export PADDLE_INIT_PORT=7164
-export PADDLE_INIT_PORTS_NUM=1
-export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
-export PADDLE_INIT_NUM_GRADIENT_SERVERS=1
-export PADDLE_INIT_TRAINER_ID=0
-export PADDLE_INIT_PSERVERS=127.0.0.1
-python train.py
-```
-
-Pass arguments:
-
-```python
-paddle.init(
- use_gpu=False,
- trainer_count=1,
- port=7164,
- ports_num=1,
- ports_num_for_sparse=1,
- num_gradient_servers=1,
- trainer_id=0,
- pservers="127.0.0.1")
-```
-
-| param | required | default | description |
-| ------------- | ------------- | ------------- | ------------- |
-| use_gpu | optional | False | set to "True" to enable GPU training |
-| trainer_count | required | 1 | total count of trainers in the training job |
-| port | required | 7164 | port to connect to parameter server |
-| ports_num | required | 1 | number of ports for communication |
-| ports_num_for_sparse | required | 1 | number of ports for sparse type calculation |
-| num_gradient_servers | required | 1 | total number of gradient servers |
-| trainer_id | required | 0 | ID of each trainer, starting from 0 |
-| pservers | required | 127.0.0.1 | list of IPs of parameter servers, separated by "," |
-
-### Prepare Training Dataset
-
-Here's some example code, [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py); it downloads the public `imikolov` dataset and splits it into multiple files according to the job parallelism (trainer count). Modify `SPLIT_COUNT` at the beginning of `prepare.py` to change the number of output files.
-
-In the real world, we often use the output of a `MapReduce` job as training data, so there will be lots of files. You can use `mod` to assign training files to trainers:
-
-```python
-import os
-
-# TRAINER_COUNT and TRAINER_ID are typically provided by the cluster
-# platform, e.g. via environment variables (see the sections below).
-train_list = []
-flist = os.listdir("/train_data/")
-for f in flist:
-    suffix = int(f.split("-")[1])
-    if suffix % TRAINER_COUNT == TRAINER_ID:
-        train_list.append(f)
-```
-
-Example code `prepare.py` will split the training data and testing data into 3 files with numeric suffixes like `-00000`, `-00001` and `-00002`:
-
-```
-train.txt
-train.txt-00000
-train.txt-00001
-train.txt-00002
-test.txt
-test.txt-00000
-test.txt-00001
-test.txt-00002
-```
-
-When the job starts, every trainer needs to get its own part of the data. On some distributed systems a storage service is provided, so the data under that path can be accessed by all trainer nodes. Without such a storage service, you must copy the training data to each trainer node, for example as sketched below.
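-
-For example, assuming each trainer's ID matches the numeric file suffix and the nodes are reachable over ssh (hypothetical host names `node0`..`node2` and a hypothetical target path), a minimal copy step could look like:
-
-```bash
-# copy shard i to trainer node i; adjust host names and paths to your cluster
-for i in 0 1 2; do
-    scp train.txt-0000${i} test.txt-0000${i} node${i}:/home/work/train_data_dir/
-done
-```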
-
-Different training jobs may have different data formats and `reader()` functions, so developers may need to write different data preparation scripts and `reader()` functions for their jobs.
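-
-As an illustration only, a `reader()` for plain-text shards like the ones above might look like the following sketch (the per-line parsing is job-specific):
-
-```python
-def make_cluster_reader(file_list):
-    def reader():
-        for path in file_list:
-            with open(path) as f:
-                for line in f:
-                    # one training sample per line; adapt the parsing to your format
-                    yield line.strip().split()
-    return reader
-```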
-
-### Prepare Training program
-
-We'll create a *workspace* directory on each node, storing your training program, dependencies, and the mounted or downloaded dataset directory.
-
-
-Your workspace may look like:
-```
-.
-|-- my_lib.py
-|-- word_dict.pickle
-|-- train.py
-|-- train_data_dir/
-| |-- train.txt-00000
-| |-- train.txt-00001
-| |-- train.txt-00002
-`-- test_data_dir/
- |-- test.txt-00000
- |-- test.txt-00001
- `-- test.txt-00002
-```
-
-- `my_lib.py`: user defined libraries, like PIL libs. This is optional.
-- `word_dict.pickle`: dict file for training word embedding.
-- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py). ***NOTE:*** You may need to modify the head part of `train.py` when using a different cluster platform, in order to retrieve configuration environment variables:
-
- ```python
- cluster_train_file = "./train_data_dir/train/train.txt"
- cluster_test_file = "./test_data_dir/test/test.txt"
- node_id = os.getenv("OMPI_COMM_WORLD_RANK")
- if not node_id:
- raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK")
- ```
-
-- `train_data_dir`: containing the training data. Mount it from a storage service or copy the training data here.
-- `test_data_dir`: containing testing data.
-
-## Use cluster platforms or cluster management tools
-
-PaddlePaddle supports running jobs on several platforms including:
-- [Kubernetes](http://kubernetes.io) open-source system for automating deployment, scaling, and management of containerized applications from Google.
-- [OpenMPI](https://www.open-mpi.org) Mature high performance parallel computing framework.
-- [Fabric](http://www.fabfile.org) A cluster management tool. Write scripts to submit jobs or manage the cluster.
-
-We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2).
-
-These cluster platforms provide an API or environment variables for the training processes when the job is dispatched to different nodes, such as the node ID, IP address, or total number of nodes.
-
-### Cluster Training Using Fabric
-
-#### Prepare a Linux cluster
-
-Running `kubectl create -f ssh_servers.yaml` under the directory `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get the IP addresses of these nodes.
-
-#### Launching Cluster Job
-`paddle.py` provides automated scripts to start all PaddlePaddle cluster processes on the different nodes. By default, all command line options can be set as `paddle.py` command options, and `paddle.py` will transparently and automatically pass them to the underlying PaddlePaddle processes.
-
-`paddle.py` provides two dedicated command options for easy job launching.
-
-- `job_dispatch_package`: set it to a local `workspace` directory; it will be dispatched to all nodes configured in `conf.py`. This is helpful for users who frequently modify and access workspace files, since otherwise frequent multi-node workspace deployment can be very annoying.
-- `job_workspace`: set it to an already deployed workspace directory; `paddle.py` will skip the dispatch stage and directly launch the cluster job on all nodes. This helps to reduce heavy dispatch latency.
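-
-For illustration, a launch command could look like the following hypothetical invocation (check `paddle.py --help` and `conf.py` under `cluster_train_v2/fabric` for the exact option names supported by your version):
-
-```bash
-# dispatch the local workspace to all nodes listed in conf.py, then start the job
-python paddle.py --job_dispatch_package=/path/to/workspace
-```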
-
-`cluster_train/run.sh` provides a command line sample to run the `demo/recommendation` cluster job. Just modify `job_dispatch_package` and `job_workspace` to your own directories, then:
-```
-sh run.sh
-```
-
-The cluster job will start in several seconds.
-
-#### Kill Cluster Job
-`paddle.py` can capture the `Ctrl + C` SIGINT signal to automatically kill all processes launched by it, so just interrupt `paddle.py` to kill the cluster job. You should kill the job manually if the program crashes.
-
-#### Check Cluster Training Result
-Check the logs in `$workspace/log` for details; each node has the same log structure.
-
-`paddle_trainer.INFO`
-It provides almost all of the internal output logs of training, the same as in local training. Check runtime model convergence here.
-
-`paddle_pserver2.INFO`
-It provides the parameter server running log, which helps to diagnose distributed errors.
-
-`server.log`
-It provides stderr and stdout of parameter server process. Check error log if training crashes.
-
-`train.log`
-It provides stderr and stdout of trainer process. Check error log if training crashes.
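-
-For example, you can follow training progress on a node with standard shell tools (the exact log content depends on your training program):
-
-```bash
-# follow the trainer log on the current node
-tail -f $workspace/log/paddle_trainer.INFO
-```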
-
-#### Check Model Output
-After one pass finishes, model files will be written to the `output` directory on node 0.
-`nodefile` in the workspace indicates the node ID of the current cluster job.
-
-### Cluster Training Using OpenMPI
-
-#### Prepare an OpenMPI cluster
-
-Run the following commands to start a 3-node MPI cluster and one "head" node.
-
-```bash
-cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
-kubectl create -f head.yaml
-kubectl create -f mpi-nodes.yaml
-```
-
-Then you can log in to every OpenMPI node over ssh without entering any password.
-
-#### Launching Cluster Job
-
-Follow the steps below to launch a PaddlePaddle training job on the OpenMPI cluster:
-
-```bash
-# find out node IP addresses
-kubectl get po -o wide
-# generate a "machines" file containing node IP addresses
-kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
-# copy necessary files onto "head" node
-scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
-# login to head node using ssh
-ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
-# --------------- in head node ---------------
-# prepare training data
-python prepare.py
-# copy training data and dict file to MPI nodes
-cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
-# create a directory for storing log files
-mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
-# copy training data to every node
-scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
-scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
-scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
-# start the job
-mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh
-```
-
-### Cluster Training Using Kubernetes
-
-The details can be found [here](../k8s/k8s_cn.md).
diff --git a/doc/howto/usage/cmd_parameter/index_cn.rst b/doc/howto/usage/cmd_parameter/index_cn.rst
deleted file mode 100644
index 4c87298211..0000000000
--- a/doc/howto/usage/cmd_parameter/index_cn.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-.. _cmd_line_index:
-
-设置命令行参数
-===============
-
-.. toctree::
- :maxdepth: 1
-
- use_case_cn.md
- arguments_cn.md
- detail_introduction_cn.md
diff --git a/doc/howto/usage/k8s/k8s_basis_cn.md b/doc/howto/usage/k8s/k8s_basis_cn.md
deleted file mode 100644
index 4c3dc81ed3..0000000000
--- a/doc/howto/usage/k8s/k8s_basis_cn.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# Kubernetes 简介
-
-[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统,其提供应用部署、维护、扩展机制等功能,利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行,且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws),[Azure](http://kubernetes.io/docs/getting-started-guides/azure/),[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前,需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识,下面先简要介绍一下本文用到的几个Kubernetes概念。
-
-- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点,这个节点可以是物理机或者虚拟机,Kubernetes集群就是由node节点与master节点组成的。
-
-- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器,pod是Kubernetes的最小调度单元,一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET,PID,IPC,UTS等Linux namespace。由于容器之间共享NET namespace,所以它们使用同一个IP地址,可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
-
-- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 描述Kubernetes上运行的作业,一次作业称为一个job,通常每个job包括一个或者多个pods,job启动后会创建这些pod并开始执行一个程序,等待这个程序执行成功并返回0则成功退出,如果执行失败,也可以配置不同的重试机制。
-
-- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷,是pod内的容器都可以访问的共享目录,也是容器与node之间共享文件的方式,因为容器内的文件都是暂时存在的,当容器因为各种原因被销毁时,其内部的文件也会随之消失。通过volume,就可以将这些文件持久化存储。Kubernetes支持多种volume,例如hostPath(宿主机目录),gcePersistentDisk,awsElasticBlockStore等。
-
-- [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/) 命名空间,在kubernetes中创建的所有资源对象(例如上文的pod,job)等都属于一个命名空间,在同一个命名空间中,资源对象的名字是唯一的,不同空间的资源名可以重复,命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。
-
-- [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): 和[*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims)结合,将外部的存储服务在Kubernetes中描述成为统一的资源形式,便于存储资源管理和Pod引用。
-
-## 部署Kubernetes集群
-
-Kubernetes提供了多种集群部署的方案,本文档内不重复介绍。这里给出几种常见的部署方法:
-
-- [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): 快速在本地启动一个单机的kubernetes服务器,便于本地验证和测试。
-- [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): 在不同操作系统,不同主机(Bare-Metal, AWS, GCE)条件下,快速部署集群。
-- [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): 在aws上快速部署集群。
-- [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): 在物理机上手动部署。
-
-可以参考[这个表格](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions)选择适合您的场景的合适方案。
-
-## 选择存储方案
-
-容器不会保留在运行时生成的数据,job或者应用程序在容器中运行时生成的数据会在容器销毁时消失。为了完成分布式机器学习训练任务,需要有一个外部的存储服务来保存训练所需数据和训练输出。
-常见的可选存储服务包括:
-
-- [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): 可以将磁盘上某个目录共享给网络中其他机器访问。部署和配置比较简单,可以用于小量数据的验证。不提供分布式存储,高可用,冗余等功能。NFS的部署方法可以参考[这里](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/)。
-- [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): 网络分布式文件系统,可以在Kubernetes中按照[这个](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs)例子使用。
-- [*Ceph*](http://docs.ceph.com/docs/master/): 分布式文件系统,支持rbd,POSIX API接口(ceph fs)和对象存储API,参考[这里](https://kubernetes.io/docs/user-guide/volumes/#rbd)。
-- [*MooseFS*](https://moosefs.com/documentation.html): 一个分布式的存储系统。需要先挂载到服务器Node上再通过kubernetes hostPath Volume挂载到容器中。
-
-## 配置kubectl
-
-### 安装kubectl
-```
-# OS X
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl
-
-# Linux
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
-
-# Windows
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe
-```
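-
-下载完成后,通常还需要为 kubectl 增加可执行权限并将其放入 `PATH` 中,例如(以 Linux 为例):
-
-```bash
-chmod +x ./kubectl
-sudo mv ./kubectl /usr/local/bin/kubectl
-```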
-
-### 配置kubectl访问你的kubernetes集群
-
-编辑`~/.kube/config`这个配置文件,修改`Master-IP`的地址。如果使用SSL认证,则需要配置`certificate-authority`和`users`中的用户证书。如果是使用非SSL方式访问(比如通过8080端口),也可以去掉这些证书的配置。
-```
-apiVersion: v1
-clusters:
-- cluster:
- certificate-authority: /path/to/ca.crt
- server: https://[Master-IP]:443
- name: minikube
-contexts:
-- context:
- cluster: minikube
- user: minikube
- name: minikube
-current-context: minikube
-kind: Config
-preferences: {}
-users:
-- name: minikube
- user:
- client-certificate: /path/to/apiserver.crt
- client-key: /Users/wuyi/.minikube/apiserver.key
-```
diff --git a/doc/howto/usage/k8s/src/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
deleted file mode 100644
index 2183a232ad..0000000000
Binary files a/doc/howto/usage/k8s/src/k8s-paddle-arch.png and /dev/null differ
diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
index 424d7718c6..cdd6917239 100644
--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -1,8 +1,9 @@
# Android平台编译指南
用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库:
-- 基于Docker容器的编译方式
-- 基于Linux交叉编译环境的编译方式
+
+- [基于Docker容器的编译方式](#基于docker容器的编译方式)
+- [基于Linux交叉编译环境的编译方式](#基于linux交叉编译环境的编译方式)
## 基于Docker容器的编译方式
Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行,因此,使用基于Docker容器的编译方式,用户可在自己熟悉的开发平台上编译Android平台上适用的PaddlePaddle库。
@@ -16,6 +17,18 @@ $ cd Paddle
$ docker build -t username/paddle-android:dev . -f Dockerfile.android
```
+用户也可以使用PaddlePaddle提供的官方开发镜像:
+
+```bash
+$ docker pull paddlepaddle/paddle:latest-dev-android
+```
+
+对于国内用户,我们提供了加速访问的镜像源:
+
+```bash
+$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
+```
+
### 编译PaddlePaddle C-API库
构建好开发镜像后,即可使用开发镜像来编译Android版PaddlePaddle C-API库。
Android的Docker开发镜像向用户提供两个可配置的参数:
@@ -41,23 +54,25 @@ Android的Docker开发镜像向用户提供两个可配置的参数:
ANDROID_API
->= 21
+>= 16
21
- 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库
- ```bash
- $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
- ```
+
+```bash
+$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
+```
- 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库
- ```bash
- $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
- ```
-执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文**配置交叉编译参数**章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
+```bash
+$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
+```
+
+执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21,因此当`ANDROID_ABI=arm64-v8a`、`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
## 基于Linux交叉编译环境的编译方式
本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。
@@ -83,6 +98,7 @@ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain
此命令将在`your/path/to/arm_standalone_toolchain`目录生成一套独立编译工具链,面向架构为32位ARM架构,支持的最小的Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。
- 构建`arm64-v8a`、 `Android API 21`的独立工具链:
+
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
@@ -90,14 +106,12 @@ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain
此命令将在`your/path/to/arm64_standalone_toolchain`目录生成一套独立编译工具链,面向架构为64位ARM64架构,支持的最小Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。
-注意:**PaddlePaddle要求使用的编译工具链所支持的Android API级别不小于21**。
-
### 配置交叉编译参数
CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake),以提供一些默认的编译器和编译参数相关配置。注意,从CMake 3.7版本开始,CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时,将会将用户传进来的配置参数传递CMake系统,交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。
交叉编译Android版本的PaddlePaddle库时,有一些必须配置的参数:
-- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及Android所需`arm_soft_fp_abi`分支的目标机版OpenBLAS库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。
+- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译PaddlePaddle所需的所有第三方库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`、`WITH_MKL=OFF`、`WITH_GOLANG=OFF`)。
- `WITH_C_API`,必须设置为`ON`。在Android平台上只支持使用C-API来预测。
- `WITH_SWIG_PY`,必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。
@@ -119,7 +133,7 @@ Android平台可选配置参数:
其他配置参数:
- `USE_EIGEN_FOR_BLAS`,是否使用Eigen库进行矩阵计算。可设置`ON/OFF`,默认值为`OFF`。
-- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。
+- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC/CXX`的值;若环境变量`CC/CXX`没有设置,则设置成`cc/c++`编译器。
常用的cmake配置如下:
@@ -147,9 +161,14 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
..
```
-用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。
+用户还可根据自己的需求设置其他编译参数。
+
+- 设置`CMAKE_BUILD_TYPE`为`MinSizeRel`,最小化生成的库的大小。
+- 设置`CMAKE_BUILD_TYPE`为`Release`,获得最快的执行速度。
+- 用户亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。
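+
+例如,希望最小化生成库的体积时,可以在上文的 cmake 命令中加入如下参数(仅为示意,其余参数与上文相同):
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DCMAKE_BUILD_TYPE=MinSizeRel \
+      ..
+```
+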
**性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议:
+
- 设置`CMAKE_BUILD_TYPE`为`Release`
- 使用`clang`编译工具链
- `armeabi-v7a`时,设置`USE_EIGEN_BLAS=ON`,使用Eigen进行矩阵计算;`arm64-v8a`时,设置`USE_EIGEN_FOR_BLAS=OFF`,使用OpenBLAS进行矩阵计算
diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md
index 26858581fc..6af16fc114 100644
--- a/doc/mobile/cross_compiling_for_android_en.md
+++ b/doc/mobile/cross_compiling_for_android_en.md
@@ -1,6 +1,9 @@
# Build PaddlePaddle for Android
-There are two approaches to build PaddlePaddle for Android: using Docker and on Linux without Docker.
+There are two approaches to build PaddlePaddle for Android:
+
+- [Cross-Compiling Using Docker](#cross-compiling-using-docker)
+- [Cross-Compiling on Linux](#cross-compiling-on-linux)
## Cross-Compiling Using Docker
@@ -16,6 +19,18 @@ $ cd Paddle
$ docker build -t paddle:dev-android . -f Dockerfile.android
```
+Users can directly use the published Docker image.
+
+```bash
+$ docker pull paddlepaddle/paddle:latest-dev-android
+```
+
+For users in China, we provide a faster mirror.
+
+```bash
+$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
+```
+
### Build the Inference Library
We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
@@ -47,7 +62,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
ANDROID_API
->= 21
+>= 16
21
@@ -77,31 +92,29 @@ Android NDK includes everything we need to build the [*standalone toolchain*](ht
- To build the standalone toolchain for `armeabi-v7a` and Android API level 21:
- ```bash
- your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
- --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
- ```
+```bash
+your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+ --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
+```
The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`.
- To build the standalone toolchain for `arm64-v8a` and Android API level 21:
- ```bash
- your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
- --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
- ```
+```bash
+your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+ --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
+```
The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.
-**Please be aware that the minimum level of Android API required by PaddlePaddle is 21.**
-
### Cross-Compiling Arguments
CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which support Android cross-compiling. PaddlePaddle detects the CMake version, for those newer than 3.7, it uses [the official version](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling).
Some other CMake arguments you need to know:
-- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, and `WITH_RDMA=OFF`.
+- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF` and `WITH_GOLANG=OFF`.
- `WITH_C_API` must be `ON`, to build the C-based inference library for Android.
- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API.
@@ -123,7 +136,7 @@ Some Android-specific arguments:
Other useful arguments:
- `USE_EIGEN_FOR_BLAS`: indicates if using Eigen. Could be `ON` or `OFF`, defaults to `OFF`.
-- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC`, or `cc`.
+- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC/CXX`, or `cc/c++` if not set.
Some frequent configurations for your reference:
@@ -158,6 +171,7 @@ There are some other arguments you might want to configure.
- `CMAKE_BUILD_TYPE=Release` optimizes the runtime performance.
Our own tips for performance optimization: use clang, and Eigen or OpenBLAS:
+
- `CMAKE_BUILD_TYPE=Release`
- `ANDROID_TOOLCHAIN=clang`
- `USE_EIGEN_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`.
diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md
index 9da48e7f21..d5196d9a4c 100644
--- a/doc/mobile/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
@@ -18,11 +18,11 @@ PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/
- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`iOS`。在设置`CMAKE_SYSTEM_NAME=iOS`后,PaddlePaddle的CMake系统会自动编译所有的第三方依赖库,并且强制设置一些PaddlePaddle参数的值(`WITH_C_API=ON`、`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。
- `WITH_C_API`,是否编译C-API预测库,必须设置为ON。在iOS平台上只支持使用C-API来预测。
-- `WITH_SWIG_PY`,必须设置为ON。在iOS平台上不支持通过swig调用来训练或者预测。
+- `WITH_SWIG_PY`,必须设置为`OFF`。在iOS平台上不支持通过swig调用来训练或者预测。
iOS平台可选配置参数:
-- `IOS_PLATFORM`,可设置为`OS/SIMULATOR`,默认值为`OS`。
+- `IOS_PLATFORM`,可设置为`OS`(默认值)或`SIMULATOR`。
- `OS`,构建目标为`arm`架构的iPhone或者iPad等物理设备。
- `SIMULATOR`,构建目标为`x86`架构的模拟器平台。
- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示,默认编译所有架构:
diff --git a/doc/mobile/cross_compiling_for_ios_en.md b/doc/mobile/cross_compiling_for_ios_en.md
new file mode 100644
index 0000000000..19bfe86c51
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_ios_en.md
@@ -0,0 +1,120 @@
+# Build PaddlePaddle for iOS
+
+This tutorial will walk you through cross-compiling the PaddlePaddle library for iOS from source on macOS.
+
+## Preparation
+
+Apple provides Xcode as the cross-compiling toolchain and IDE for iOS development. Download it from the App Store or [here](https://developer.apple.com/cn/xcode/). To verify your installation, run the following command:
+
+```bash
+$ xcodebuild -version
+Xcode 9.0
+Build version 9A235
+```
+
+## Cross-compiling configurations
+
+PaddlePaddle provides the cross-compiling toolchain configuration file [cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake), which has some default settings for frequently used compilers.
+
+There are some mandatory CMake arguments that need to be set before cross-compiling PaddlePaddle for iOS:
+
+- `CMAKE_SYSTEM_NAME`, CMake's target platform name, has to be `iOS`. When this variable is set to `iOS`, PaddlePaddle's CMake will compile all the third-party dependencies and enforce some parameters (`WITH_C_API=ON`, `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`).
+
+- `WITH_C_API`, whether to compile the inference C-API library, has to be `ON`, since the C-API is the only supported interface for inference on iOS.
+- `WITH_SWIG_PY`, has to be `OFF`. Training and inference via SWIG are not supported on iOS.
+
+Optional CMake arguments for iOS are:
+
+- `IOS_PLATFORM`, either `OS` (default) or `SIMULATOR`.
+ - `OS`, build targets ARM-based physical devices like iPhone or iPad.
+ - `SIMULATOR`, build targets x86 architecture simulators.
+- `IOS_ARCH`, target architecture. By default, all architecture types will be compiled. If you need to specify the architecture to compile for, please find valid values for different `IOS_PLATFORM` settings from the table below:
+
+| IOS_PLATFORM | IOS_ARCH |
+| ------------- | ------------- |
+| OS | armv7, armv7s, arm64 |
+| SIMULATOR | i386, x86_64 |
+
+- `IOS_DEPLOYMENT_TARGET`, the minimum iOS version for deployment, `7.0` by default.
+- `IOS_ENABLE_BITCODE`, whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3), values can be `ON/OFF`, `ON` by default.
+- `IOS_USE_VECLIB_FOR_BLAS`, whether to use the [vecLib](https://developer.apple.com/documentation/accelerate/veclib) framework for BLAS computation. Values can be `ON/OFF`, `OFF` by default.
+- `IOS_DEVELOPMENT_ROOT`, the path to the `Developer` directory; it can be explicitly set to your `/path/to/platform/Developer`. If left blank, PaddlePaddle will automatically pick the `Developer` directory of the Xcode platform corresponding to your `IOS_PLATFORM` value.
+- `IOS_SDK_ROOT`, the path to the SDK root; it can be explicitly set to your `/path/to/platform/Developer/SDKs/SDK`. If left blank, PaddlePaddle will pick the latest SDK under `IOS_DEVELOPMENT_ROOT`.
+
+Other settings:
+
+- `USE_EIGEN_FOR_BLAS`, whether to use Eigen for matrix computation; effective only when `IOS_USE_VECLIB_FOR_BLAS=OFF`. Values can be `ON/OFF`, `OFF` by default.
+- `HOST_C/CXX_COMPILER`, the host C/C++ compiler. Defaults to the value of the environment variable `CC/CXX`, or to `cc/c++` if `CC/CXX` is not set.
+
+Some typical CMake configurations:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+ -DIOS_PLATFORM=OS \
+ -DIOS_ARCH="armv7;arm64" \
+ -DIOS_ENABLE_BITCODE=ON \
+ -DIOS_USE_VECLIB_FOR_BLAS=ON \
+ -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+ -DWITH_C_API=ON \
+ -DWITH_TESTING=OFF \
+ -DWITH_SWIG_PY=OFF \
+ ..
+```
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+ -DIOS_PLATFORM=SIMULATOR \
+ -DIOS_ARCH="x86_64" \
+ -DIOS_USE_VECLIB_FOR_BLAS=ON \
+ -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+ -DWITH_C_API=ON \
+ -DWITH_TESTING=OFF \
+ -DWITH_SWIG_PY=OFF \
+ ..
+```
+
+You can set other compiling parameters for your own needs. For example, if you are trying to minimize the library size, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; if performance is your concern, set `CMAKE_BUILD_TYPE` to `Release`. You can also influence the PaddlePaddle compiling procedure by manually setting `CMAKE_C/CXX_FLAGS`.
+
+**TIPS for better performance**:
+
+- set `CMAKE_BUILD_TYPE` to `Release`
+- set `IOS_USE_VECLIB_FOR_BLAS` to `ON`
+
+## Build and install
+
+After the CMake configuration, run the following commands; PaddlePaddle will download and compile the third-party dependencies, then compile and install the PaddlePaddle inference library.
+
+```bash
+$ make
+$ make install
+```
+
+Please note: if you have compiled PaddlePaddle in the same source directory for other platforms, remove the `third_party` and `build` directories inside the source tree with `rm -rf` to ensure that all third-party dependencies and PaddlePaddle itself are freshly compiled with the current CMake configuration.
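+
+For example, from the PaddlePaddle source root:
+
+```bash
+rm -rf build/ third_party/
+```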
+
+The `your/path/to/install` directory will contain the following directories after `make install`:
+
+- `include`, contains all the C-API header files.
+- `lib`, contains the PaddlePaddle C-API static library.
+- `third_party`, contains all the third-party libraries.
+
+Please note: if the PaddlePaddle library needs to support both physical devices and simulators, you will need to compile for each target separately and then merge the resulting libraries into a fat library with `lipo`.
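+
+For example, assuming the device build was installed to `install_ios_os` and the simulator build to `install_ios_simulator` (hypothetical paths; the library name under `lib` may differ), the two static libraries could be merged like this:
+
+```bash
+lipo -create \
+    install_ios_os/lib/libpaddle_capi_whole.a \
+    install_ios_simulator/lib/libpaddle_capi_whole.a \
+    -output libpaddle_capi_whole_fat.a
+```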
+
+Now you have the PaddlePaddle library compiled and installed; the fat library can be used in deep learning related iOS apps. Please refer to the C-API documentation for usage guides.
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
deleted file mode 100644
index 1d99666e58..0000000000
--- a/doc/mobile/index_cn.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-MOBILE
-======
-
-.. toctree::
- :maxdepth: 1
-
- cross_compiling_for_android_cn.md
- cross_compiling_for_ios_cn.md
- cross_compiling_for_raspberry_cn.md
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
deleted file mode 100644
index 3c08d73671..0000000000
--- a/doc/mobile/index_en.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-MOBILE
-======
-
-.. toctree::
- :maxdepth: 1
-
- cross_compiling_for_android_en.md
- cross_compiling_for_raspberry_en.md
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 41b35b5b23..260b6c9fd1 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -21,10 +21,11 @@ import paddle.v2
MarkdownParser = parser.CommonMarkParser
AutoStructify = transform.AutoStructify
+
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
-templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"]
+templates_path = ["@PADDLE_SOURCE_DIR@/doc/templates"]
# -- General configuration ------------------------------------------------
@@ -82,7 +83,7 @@ language = 'zh_CN'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
-exclude_patterns = ['_build', '**/*_en*', '*_en*']
+exclude_patterns = ['_build', '**/*_en*', '*_en*', 'api/*']
# The reST default role (used for this markup: `text`) to use for all
# documents.
@@ -120,7 +121,7 @@ html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static']
+#html_static_path = []
# Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc'
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index 5822c2481d..e5757b86b4 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -22,10 +22,11 @@ import paddle.v2
MarkdownParser = parser.CommonMarkParser
AutoStructify = transform.AutoStructify
+
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
-templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"]
+templates_path = ["@PADDLE_SOURCE_DIR@/doc/templates"]
# -- General configuration ------------------------------------------------
@@ -82,7 +83,7 @@ language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
-exclude_patterns = ['_build', '**/*_cn*', '*_cn*']
+exclude_patterns = ['_build', '**/*_cn*', '*_cn*', 'api/*']
# The reST default role (used for this markup: `text`) to use for all
# documents.
@@ -120,7 +121,7 @@ html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static']
+#html_static_path = []
# Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc'
diff --git a/doc/templates/layout.html b/doc/templates/layout.html
index 47329c2a92..5091eb32ea 100644
--- a/doc/templates/layout.html
+++ b/doc/templates/layout.html
@@ -2,6 +2,13 @@
{# Import the theme's layout. #}
{% extends "!layout.html" %}
+{# SIDE NAV, TOGGLES ON MOBILE #}
+{% block menu %}
+
+{% endblock %}
{%- block extrahead %}
-
- {% endblock %}
-
- {# Keep modernizr in head - http://modernizr.com/docs/#installing #}
-
-
-
-
-
-
- {% block extrabody %}
-
-