Merge pull request #2081 from PaddlePaddle/release/0.10.0

Release/0.10.0
9 years ago · 2c98becba4
parent 492730b8d6 1b83092db1
commit 2c98becba4
1305 changed files with 67651 additions and 25544 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1,15 @@
+*.DS_Store
+build/
+*.user
+.vscode
+.idea
+.project
+.cproject
+.pydevproject
+Makefile
+.test_env/
+third_party/
+*~
+bazel-*
+
+!build/*.deb
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,6 @@
 *.DS_Store
 build/
+build_doc/
 *.user

 .vscode
@ -7,4 +8,11 @@ build/
 .project
 .cproject
 .pydevproject
+.settings/
 Makefile
+.test_env/
+third_party/
+
+*~
+bazel-*
+third_party/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,24 +1,23 @@
 -   repo: https://github.com/Lucas-C/pre-commit-hooks.git
-    sha: c25201a00e6b0514370501050cf2a8538ac12270
+    sha: v1.0.1
    hooks:
    -   id: remove-crlf
+        files: (?!.*third_party)^.*$ | (?!.*book)^.*$
 -   repo: https://github.com/reyoung/mirrors-yapf.git
    sha: v0.13.2
    hooks:
    -   id: yapf
+        files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    sha: 4ef03c4223ad322c7adaa6c6c0efb26b57df3b71
+    sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
    hooks:
    -   id: check-added-large-files
    -   id: check-merge-conflict
    -   id: check-symlinks
    -   id: detect-private-key
+        files: (?!.*third_party)^.*$ | (?!.*book)^.*$
    -   id: end-of-file-fixer
-# TODO(yuyang): trailing whitespace has some bugs on markdown 
-# files now, please not add it to pre-commit hook now
-#    -   id: trailing-whitespace
-#
-# TODO(yuyang): debug-statements not fit for Paddle, because
-# not all of our python code is runnable. Some are used for 
-# documenation
-#    -   id: debug-statements
+-   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
+    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
+    hooks:
+    -   id: clang-formater
--- a/.travis.yml
+++ b/.travis.yml
@ -1,58 +1,59 @@
 language: cpp
-cache: ccache
+cache:
+  directories:
+    - $HOME/third_party
+    - $HOME/.ccache
+    - $HOME/.cache/pip
 sudo: required
 dist: trusty
 os:
  - linux
-  - osx
 env:
  - JOB=DOCS
  - JOB=BUILD_AND_TEST
-matrix:
-  exclude:
-    - os: osx
-      env: JOB=DOCS  # Only generate documentation in linux
-
+  - JOB=PRE_COMMIT
 addons:
  apt:
    packages:
      - gcc-4.8
      - g++-4.8
-      - wget
+      - gfortran-4.8
      - git
      - build-essential
-      - libatlas-base-dev
      - python
      - python-pip
      - python2.7-dev
-      - m4
-      - libprotobuf-dev
-      - doxygen
-      - protobuf-compiler
-      - python-protobuf
      - python-numpy
      - python-wheel
-      - libgoogle-glog-dev
-      - libgflags-dev
-      - libgtest-dev
      - curl
-      - lcov
-      - graphviz
      - swig
+      - graphviz
+      - clang-format-3.8
+      - automake
+      - libtool
+      - ccache
 before_install:
  - |
    if [ ${JOB} == "BUILD_AND_TEST" ]; then
-      if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)'
-      then
-        echo "Only markdown docs were updated, stopping build process."
-        exit
+      local change_list=`git diff --name-only $TRAVIS_COMMIT_RANGE`
+      if [ $? -eq 0 ]; then  # if git diff return no zero, then rerun unit test.
+        if ! echo ${change_list} | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
+        then
+          echo "Only markdown docs were updated, stopping build process."
+          exit
+        fi
      fi
    fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
-  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
-  - pip install wheel protobuf 'sphinx==1.4.9' breathe recommonmark virtualenv numpy
+  - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
+  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python 
+  # protobuf version.
+  - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
+  - |
+    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
-  - paddle/scripts/travis/main.sh
+  - | 
+    timeout 2580 paddle/scripts/travis/main.sh  # 43min timeout
+    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
 notifications:
  email:
    on_success: change
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1 @@
+./doc/howto/dev/contribute_to_paddle_en.md
--- a/63
+++ b/63
@ -0,0 +1,63 @@
+# A image for building paddle binaries
+# Use cuda devel base image for both cpu and gpu environment
+FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+
+ARG UBUNTU_MIRROR
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
+# ENV variables
+ARG WITH_GPU
+ARG WITH_AVX
+ARG WITH_DOC
+ARG WITH_STYLE_CHECK
+
+ENV WOBOQ OFF
+ENV WITH_GPU=${WITH_GPU:-OFF}
+ENV WITH_AVX=${WITH_AVX:-ON}
+ENV WITH_DOC=${WITH_DOC:-OFF}
+ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
+
+ENV HOME /root
+# Add bash enhancements
+COPY ./paddle/scripts/docker/root/ /root/
+
+RUN apt-get update && \
+    apt-get install -y git python-pip python-dev openssh-server bison && \
+    apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
+    apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
+    apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
+    apt-get install -y automake locales clang-format-3.8 swig doxygen && \
+    apt-get clean -y
+
+# git credential to skip password typing
+RUN git config --global credential.helper store
+
+# Fix locales to en_US.UTF-8
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
+# version util jupyter fixes this issue.
+RUN pip install --upgrade pip && \
+    pip install -U 'protobuf==3.1.0' && \
+    pip install -U wheel pillow BeautifulSoup && \
+    pip install -U docopt PyYAML sphinx && \
+    pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
+
+RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
+    cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
+    cd .. && rm -rf cmake-3.4.1
+
+VOLUME ["/woboq_out"]
+
+# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
+RUN mkdir /var/run/sshd
+RUN echo 'root:root' | chpasswd
+RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+EXPOSE 22
+
+# development image default do build work
+CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
--- a/4
+++ b/4
@ -1,4 +1,4 @@
-Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved

                                 Apache License
                           Version 2.0, January 2004
@ -188,7 +188,7 @@ Copyright (c) 2016 Baidu, Inc. All Rights Reserved
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

-   Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+   Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
--- a/README.md
+++ b/README.md
@ -1,10 +1,13 @@
 # PaddlePaddle


-[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
-[![Coverage Status](https://coveralls.io/repos/github/baidu/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/baidu/Paddle?branch=develop)
-[![Join the chat at https://gitter.im/PaddlePaddle/Deep_Learning](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
-[![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)
+[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/)
+[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
+[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
+[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
+

 Welcome to the PaddlePaddle GitHub.

@ -14,7 +17,7 @@ developed by Baidu scientists and engineers for the purpose of applying deep
 learning to many products at Baidu.

 Our vision is to enable deep learning for everyone via PaddlePaddle.
-Please refer to our [release announcement](https://github.com/baidu/Paddle/releases) to track the latest feature of PaddlePaddle. 
+Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.

 ## Features

@ -26,15 +29,15 @@ Please refer to our [release announcement](https://github.com/baidu/Paddle/relea
    connection.

 -  **Efficiency**
-  
+
    In order to unleash the power of heterogeneous computing resource,
    optimization occurs at different levels of PaddlePaddle, including
    computing, memory, architecture and communication. The following are some
    examples:

      - Optimized math operations through SSE/AVX intrinsics, BLAS libraries
-      (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels. 
-      - Highly optimized recurrent networks which can handle **variable-length** 
+      (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
+      - Highly optimized recurrent networks which can handle **variable-length**
      sequence without padding.
      - Optimized local and distributed training for models with high dimensional
      sparse data.
@ -56,42 +59,40 @@ Please refer to our [release announcement](https://github.com/baidu/Paddle/relea
    the capability of PaddlePaddle to make a huge impact for your product.

 ## Installation
-Check out the [Install Guide](http://paddlepaddle.org/doc/build/) to install from
-pre-built packages (**docker image**, **deb package**) or 
-directly build on **Linux** and **Mac OS X** from the source code.
- 
+
+It is recommended to check out the
+[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+before looking into the
+[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
+
 ## Documentation
-Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://paddlepaddle.org/doc_cn/) are provided for our users and developers.
-
- [Quick Start](http://paddlepaddle.org/doc/demo/quick_start/index_en) <br>
-   You can follow the quick start tutorial to learn how use PaddlePaddle
-   step-by-step.
-    
- [Example and Demo](http://paddlepaddle.org/doc/demo/) <br>
-   We provide five demos, including: image classification, sentiment analysis,
-   sequence to sequence model, recommendation, semantic role labeling. 
-   
- [Distributed Training](http://paddlepaddle.org/doc/cluster) <br>
-  This system supports training deep learning models on multiple machines
-  with data parallelism.
-   
- [Python API](http://paddlepaddle.org/doc/ui/) <br>
-   PaddlePaddle supports using either Python interface or C++ to build your
-   system. We also use SWIG to wrap C++ source code to create a user friendly
-   interface for Python. You can also use SWIG to create interface for your
-   favorite programming language.
- 
- [How to Contribute](http://paddlepaddle.org/doc/build/contribute_to_paddle.html) <br>
-   We sincerely appreciate your interest and contributions. If you would like to
-   contribute, please read the contribution guide.   
-
- [Source Code Documents](http://paddlepaddle.org/doc/source/) <br>
+
+We provide [English](http://www.paddlepaddle.org/develop/doc/) and
+[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation.
+
+- [Deep Learning 101](http://book.paddlepaddle.org/index.en.html)
+
+  You might want to start from the this online interactive book that can run in Jupyter Notebook.
+
+- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+
+  You can run distributed training jobs on MPI clusters.
+
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
+
+   You can also run distributed training jobs on Kubernetes clusters.
+
+- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html)
+
+   Our new API enables much shorter programs.
+
+- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
+
+   We appreciate your contributions!

 ## Ask Questions
-Please join the [**gitter chat**](https://gitter.im/PaddlePaddle/Deep_Learning) or send email to
-**paddle-dev@baidu.com** to ask questions and talk about methods and models.
-Framework development discussions and
-bug reports are collected on [Issues](https://github.com/baidu/paddle/issues).
+
+You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).

 ## Copyright and License
 PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
--- a/RELEASE.cn.md
+++ b/RELEASE.cn.md
@ -0,0 +1,80 @@
+# v0.10.0版本
+
+我们非常高兴发布了PaddlePaddle V0.10.0版，并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。
+
+- 旧的Python API由于难以学习和使用已经过时了。使用旧版本的API至少需要两份python文件，分别是定义数据生成器和定义网络拓扑结构的文件。用户通过运行`paddle_trainer`的C++程序来启动PaddlePaddle任务，该程序调用Python解释器来运行定义网络拓扑结构的文件，然后通过迭代加载数据生成器提供的小批量数据启动训练循环。这与Python的现代编辑方式不符，比如Jupyter Notebook。
+
+- 新版的API被称为 *V2 API*，允许我们在单个.py文件中，通过编辑更短的Python程序来定义网络结构和数据。此外，该Python程序也可以在Jupyter Notebook中运行，因为PaddlePaddle可以作为共享库来被Python程序加载和使用。
+
+基于新的API，我们提供了一个在线的学习文档 [Deep Learning 101](http://book.paddlepaddle.org/index.en.html) 及其[中文版本](http://book.paddlepaddle.org/)。
+
+我们还致力于迭代更新新版API的在线文档，并将新版API引入分布式集群（包括MPI和Kubernetes）训练中。我们将在下一个版本中发布更多的内容。
+
+## 新特点
+
+* 发布新版[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。
+* 发布深度学习系列课程 [Deep Learning 101](http://book.paddlepaddle.org/index.en.html) 及其[中文版本](http://book.paddlepaddle.org/)。
+* 支持矩形输入的CNN。
+* 为seqlastin和seqfirstin提供stride pooling。
+* 在`trainer_config_helpers`中暴露`seq_concat_layer/seq_reshape_layer`。
+* 添加公共数据集包：CIFAR，MNIST，IMDB，WMT14，CONLL05，movielens，imikolov。
+* 针对Single Shot Multibox Detection增加 Prior box layer。
+* 增加光滑的L1损失。
+* 在V2 API中增加 data reader 创建器和修饰器。
+* 增加cmrnorm投影的CPU实现。
+
+
+## 改进
+
+* 提供`paddle_trainer`的Python virtualenv支持。
+* 增加代码自动格式化的pre-commit hooks。
+* 升级protobuf到3.x版本。
+* 在Python数据生成器中提供一个检测数据类型的选项。
+* 加速GPU中average层的后向反馈计算。
+* 细化文档。
+* 使用Travis-CI检查文档中的死链接。
+* 增加解释`sparse_vector`的示例。
+* 在layer_math.py中添加ReLU。
+* 简化Quick Start示例中的数据处理流程。
+* 支持CUDNN Deconv。
+* 在v2 API中增加数据feeder。
+* 在情感分析示例的演示中增加对标准输入流中样本的预测。
+* 提供图像预处理的多进程接口。
+* 增加V1 API的基准文档。
+* 在`layer_math.py`中增加ReLU。
+* 提供公共数据集的自动下载包。
+* 将`Argument::sumCost`重新命名为`Argument::sum`，并暴露给python。
+* 为矩阵相关的表达式评估增加一个新的`TensorExpression`实现。
+* 增加延迟分配来优化批处理多表达式计算。
+* 增加抽象的类函数及其实现：
+  * `PadFunc` 和 `PadGradFunc`。
+  * `ContextProjectionForwardFunc` 和 `ContextProjectionBackwardFunc`。
+  * `CosSimBackward` 和 `CosSimBackwardFunc`。
+  * `CrossMapNormalFunc` 和 `CrossMapNormalGradFunc`。
+  * `MulFunc`。
+* 增加`AutoCompare`和`FunctionCompare`类，使得编写比较gpu和cpu版本函数的单元测试更容易。
+* 生成`libpaddle_test_main.a`并删除测试文件内的主函数。
+* 支持PyDataProvider2中numpy的稠密向量。
+* 清理代码库，删除一些复制粘贴的代码片段：
+  * 增加`SparseRowMatrix`的抽样类`RowBuffer`。
+  * 清理`GradientMachine`的接口。
+  * 在layer中增加`override`关键字。
+  * 简化`Evaluator::create`，使用`ClassRegister`来创建`Evaluator`。
+* 下载演示的数据集时检查MD5校验。
+* 添加`paddle::Error`，用于替代Paddle中的`LOG(FATAL)`。
+
+
+## 错误修复
+
+* 检查`recurrent_group`的layer输入类型。
+* 不要用.cu源文件运行`clang-format`。
+* 修复`LogActivation`的使用错误。
+* 修复运行`test_layerHelpers`多次的错误。
+* 修复seq2seq示例超出消息大小限制的错误。
+* 修复在GPU模式下dataprovider转换的错误。
+* 修复`GatedRecurrentLayer`中的错误。
+* 修复在测试多个模型时`BatchNorm`的错误。
+* 修复paramRelu在单元测试时崩溃的错误。
+* 修复`CpuSparseMatrix`编译时相关的警告。
+* 修复`MultiGradientMachine`在`trainer_count > batch_size`时的错误。
+* 修复`PyDataProvider2`阻止异步加载数据的错误。
--- a/RELEASE.md
+++ b/RELEASE.md
@ -1,3 +1,106 @@
+# Release v0.10.0
+
+We are glad to release version 0.10.0.  In this version, we are happy to
+release the
+new
+[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/).
+
+- Our old Python API is kind of out of date.  It's hard to learn and hard to
+  use.  To write a PaddlePaddle program using the old API, we'd have to write
+  at least two Python files: one `data provider` and another one that defines
+  the network topology.  Users start a PaddlePaddle job by running the
+  `paddle_trainer` C++ program, which calls Python interpreter to run the
+  network topology configuration script and then start the training loop,
+  which iteratively calls the data provider function to load minibatches.
+  This prevents us from writing a Python program in a modern way, e.g., in the
+  Jupyter Notebook.
+  
+- The new API, which we often refer to as the *v2 API*, allows us to write
+  much shorter Python programs to define the network and the data in a single
+  .py file.  Also, this program can run in Jupyter Notebook, since the entry
+  point is in Python program and PaddlePaddle runs as a shared library loaded
+  and invoked by this Python program.
+  
+Basing on the new API, we delivered an online interative
+book, [Deep Learning 101](http://book.paddlepaddle.org/index.en.html)
+and [its Chinese version](http://book.paddlepaddle.org/).
+
+We also worked on updating our online documentation to describe the new API.
+But this is an ongoing work.  We will release more documentation improvements
+in the next version.
+
+We also worked on bring the new API to distributed model training (via MPI and
+Kubernetes).  This work is ongoing. We will release more about it in the next
+version.
+
+## New Features
+
+* We release [new Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/).
+* Deep Learning 101 book in [English](http://book.paddlepaddle.org/index.en.html) and [Chinese](http://book.paddlepaddle.org/).
+* Support rectangle input for CNN.
+* Support stride pooling for seqlastin and seqfirstin.
+* Expose `seq_concat_layer/seq_reshape_layer` in `trainer_config_helpers`.
+* Add dataset package: CIFAR, MNIST, IMDB, WMT14, CONLL05, movielens, imikolov.
+* Add Priorbox layer for Single Shot Multibox Detection. 
+* Add smooth L1 cost.
+* Add data reader creator and data reader decorator for v2 API.
+* Add the CPU implementation of cmrnorm projection.
+
+## Improvements
+
+* Support Python virtualenv for `paddle_trainer`.
+* Add pre-commit hooks, used for automatically format our code.
+* Upgrade protobuf to version 3.x.
+* Add an option to check data type in Python data provider.
+* Speedup the backward of average layer on GPU.
+* Documentation refinement.
+* Check dead links in documents using Travis-CI.
+* Add a example for explaining `sparse_vector`.
+* Add ReLU in layer_math.py
+* Simplify data processing flow for Quick Start.
+* Support CUDNN Deconv.
+* Add data feeder in v2 API.
+* Support predicting the samples from sys.stdin for sentiment demo.
+* Provide multi-proccess interface for image preprocessing. 
+* Add benchmark document for v1 API.
+* Add ReLU in `layer_math.py`.
+* Add packages for automatically downloading public datasets.
+* Rename `Argument::sumCost` to `Argument::sum` since class `Argument` is nothing with cost.
+* Expose Argument::sum to Python
+* Add a new `TensorExpression` implementation for matrix-related expression evaluations.
+* Add lazy assignment for optimizing the calculation of a batch of multiple expressions.
+* Add abstract calss `Function` and its implementation:
+  * `PadFunc` and `PadGradFunc`.
+  * `ContextProjectionForwardFunc` and `ContextProjectionBackwardFunc`.
+  * `CosSimBackward` and `CosSimBackwardFunc`.
+  * `CrossMapNormalFunc` and `CrossMapNormalGradFunc`.
+  * `MulFunc`.
+* Add class `AutoCompare` and `FunctionCompare`, which make it easier to write unit tests for comparing gpu and cpu version of a function.
+* Generate `libpaddle_test_main.a` and remove the main function inside the test file.
+* Support dense numpy vector in PyDataProvider2.
+* Clean code base, remove some copy-n-pasted code snippets:
+  * Extract `RowBuffer` class for `SparseRowMatrix`.
+  * Clean the  interface of `GradientMachine`.
+  * Use `override` keyword in layer.
+  * Simplify `Evaluator::create`, use `ClassRegister` to create `Evaluator`s.
+* Check MD5 checksum when downloading demo's dataset.
+* Add `paddle::Error` which intentially replace `LOG(FATAL)` in Paddle.
+
+## Bug Fixes
+
+* Check layer input types for `recurrent_group`.
+* Don't run `clang-format` with .cu source files.
+* Fix bugs with `LogActivation`.
+* Fix the bug that runs `test_layerHelpers` multiple times.
+* Fix the bug that the seq2seq demo exceeds protobuf message size limit.
+* Fix the bug in dataprovider converter in GPU mode.
+* Fix a bug in `GatedRecurrentLayer`.
+* Fix bug for `BatchNorm` when testing more than one models.
+* Fix broken unit test of paramRelu.
+* Fix some compile-time warnings about `CpuSparseMatrix`.
+* Fix `MultiGradientMachine` error when `trainer_count > batch_size`.
+* Fix bugs that prevents from asynchronous data loading in `PyDataProvider2`.
+
 # Release v0.9.0

 ## New Features:
--- a/3
+++ b/3
@ -29,13 +29,16 @@ Luo, Tao
 Lyu, Qin
 Mao, Hongyue
 Qian, Xiaojun
+Qiao, Longfei
 Qi, Jun
 Qin, Duohao
 Shen, Guolong
 Shi, Guangchuan
 Song, Xiang
+Wang, Helin
 Wang, Jiang
 Wang, Yanfei
+Wang, Yi
 Wang, Yong
 Weng, Renliang
 Xu, Tianbing
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@ -0,0 +1,9 @@
+paddle/image/logs
+paddle/image/*.pyc
+paddle/image/train.list
+paddle/rnn/logs
+paddle/rnn/*.pyc
+paddle/rnn/imdb.pkl
+caffe/image/logs
+tensorflow/image/logs
+tensorflow/rnn/logs
--- a/benchmark/README.md
+++ b/benchmark/README.md
@ -0,0 +1,168 @@
+# Benchmark
+
+Machine: 
+
+- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz
+- GPU: Tesla K40m
+- cuDNN: v5.1
+- system: Docker 1.12.1, all platforms are tested in docker environment.
+
+Platforms: 
+
+- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0 
+- Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu 
+- Caffe: kaixhin/cuda-caffe
+
+Several convolutional neural networks and recurrent neural networks are used to test.
+
+## Image
+
+### Benchmark Model
+
+AlexNet, GoogleNet and a small network used in Caffe.
+
+- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but the group size is one.
+
+- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): but remove loss1 and loss2 when testing benchmark.
+
+- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10\_quick\_train\_test.prototxt)
+
+
+### Single-GPU
+
+- AlexNet:  input - 3 * 227 * 227,  Time: ms/batch
+
+| BatchSize    | 64  | 128  | 256   | 512  |
+|--------------|-----| -----| ------| -----|
+| PaddlePaddle | 195 | 334  | 602   | 1629 |
+| TensorFlow   | 223 | 364  | 645   | 1235 |
+| Caffe        | 324 | 627  | 1232  | 2513 |
+ 
+**Notation**
+
+All platforms use cuDNN-v5.1. We see that caffe is slower in this experiment, because its workspace limit size of cuDNN-conv interface is 8 * 1024 * 1024, which is smaller in PaddlePaddle and TensorFlow. Note that Caffe will be faster if increasing the workspace limit size.
+ 
+- GoogletNet:  input - 3 * 224 * 224, Time: ms/batch
+
+
+| BatchSize    | 64    |   128  | 256     |
+|--------------|-------| -------| --------|
+| PaddlePaddle | 613   | 1149   | 2348    |
+| TensorFlow   | 644   | 1176   | 2219    |
+| Caffe        | 694   | 1364   | out of memory   |
+
+- SmallNet: input - 3 * 32 * 32, Time ms/batch
+
+| BatchSize    | 64     |   128    | 256     | 512     |
+|--------------|--------| -------- | --------|---------|
+| PaddlePaddle | 10.463 | 18.184   | 33.113  |  63.039 |
+| TensorFlow   | 9     | 15       | 28      | 59       |
+| Caffe        | 9.373  | 16.6606  | 31.4797 | 59.719  |
+
+**Notation**
+
+All the single-GPU experiments in caffe use `caffe time` to calculate elapsed time, which does not include parameter updating time. However, both PaddlePaddle and TensorFlow experiments contain the parameter updating time. As compared with the total time, this part is relatively little on single machine, we can ignore it.
+
+In Tensorflow, they implement algorithm searching method instead of using the algorithm searching interface in cuDNN.
+
+### Multi-GPU: 4 GPUs
+
+- AlexNet,  ms / batch
+
+| total-BatchSize | 128 * 4  | 256 * 4    |
+|------------------|----------| -----------|
+| PaddlePaddle     | 347      | 622        |
+| TensorFlow       | 377      | 675        |
+| Caffe            | 1229     | 2435       |
+
+For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by 
+
+```
+  time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512 
+= (334 * 4)/347 
+= 3.85
+``` 
+
+<img src="figs/alexnet-4gpu.png" width="420">
+
+
+- GoogleNet, ms / batch
+
+| total-BatchSize  | 128 * 4      |  256 * 4    |
+|-------------------|--------------| ----------- |
+| PaddlePaddle      | 1178         | 2367        |
+| TensorFlow        | 1210         | 2292        |
+| Caffe             | 2007         | out of memory  |
+
+<img src="figs/googlenet-4gpu.png" width="420">
+
+
+## RNN
+We use lstm network for text classfication to test benchmark.
+
+### Dataset
+-  [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
+- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequence, but TensorFlow needs to pad. Thus, we also pad sequence length to 100 in PaddlePaddle in order to compare.
+- Dictionary size=30000 
+- Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow.
+
+### Single-GPU
+
+#### LSTM in Text Classification
+
+Testing `2 lstm layer + fc` network with different hidden size and batch size.
+  
+- Batch size = 64, ms / batch
+ 
+| hidden_size  | 256   | 512    |  1280   |
+|--------------|-------| -------| --------|
+| PaddlePaddle | 83    | 184    | 641     |
+| TensorFlow   | 175   | 280    | 818     |
+
+- Batch size = 128, ms / batch
+ 
+| hidden_size  | 256    | 512    |  1280   |
+|--------------|------- | -------| --------|
+| PaddlePaddle | 110    | 261    | 1007    |
+| TensorFlow   | 181    | 361    | 1237    |
+
+
+- Batch size = 256, ms / batch
+ 
+| hidden_size  | 256   | 512    |  1280   |
+|--------------|-------| -------| --------|
+| PaddlePaddle | 170   | 414    | 1655    |
+| TensorFlow   | 238   | 536    | 1905    |
+
+<img src="figs/rnn_lstm_cls.png" width="600">
+
+#### Seq2Seq
+
+The benchmark of sequence-to-sequence network will be added later.
+ 
+
+### Multi GPU: 4 GPUs
+
+#### LSTM in Text Classification
+
+- hidden_size = 256, ms / batch
+ 
+| batch_size   | 256    |  512    |
+|--------------| -------| --------|
+| PaddlePaddle | 90     | 118     |
+| TensorFlow   | 226    | 118     |
+
+
+- hidden_size = 512, ms / batch
+ 
+| batch_size   | 256    |  512    |
+|--------------| -------| --------|
+| PaddlePaddle | 189    | 268     |
+| TensorFlow   | 297    | 383     |
+
+
+<img src="figs/rnn_lstm_4gpus.png" width="420">
+
+#### Seq2Seq
+
+The benchmark of sequence-to-sequence network will be added later.
--- a/benchmark/caffe/image/alexnet.prototxt
+++ b/benchmark/caffe/image/alexnet.prototxt
--- a/benchmark/caffe/image/googlenet.prototxt
+++ b/benchmark/caffe/image/googlenet.prototxt
--- a/benchmark/caffe/image/run.sh
+++ b/benchmark/caffe/image/run.sh
@ -0,0 +1,30 @@
+set -e
+
+function test() {
+  cfg=$1
+  batch=$2
+  prefix=$3
+  sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg 
+  sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
+  caffe time --model=$cfg --iterations=50 --gpu 0 > logs/$prefix-1gpu-batch${batch}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# alexnet
+test alexnet.prototxt 64 alexnet 
+test alexnet.prototxt 128 alexnet 
+test alexnet.prototxt 256 alexnet 
+test alexnet.prototxt 512 alexnet 
+
+# googlenet
+test googlenet.prototxt 64 googlenet 
+test googlenet.prototxt 128 googlenet 
+
+# small net 
+test smallnet_mnist_cifar.prototxt 64 smallnet 
+test smallnet_mnist_cifar.prototxt 128 smallnet 
+test smallnet_mnist_cifar.prototxt 256 smallnet 
+test smallnet_mnist_cifar.prototxt 512 smallnet 
--- a/benchmark/caffe/image/run_multi.sh
+++ b/benchmark/caffe/image/run_multi.sh
@ -0,0 +1,24 @@
+#!/bin/bash
+set -e
+
+function test() {
+  cfg=$1
+  batch=$2
+  prefix=$3
+  batch_per_gpu=`expr ${batch} / 4`
+  sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg 
+  sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg 
+  sed -i "1c\net : \"${cfg}\"" solver.prototxt
+  caffe train --solver=solver.prototxt -gpu 0,1,2,3 > logs/${prefix}-4gpu-batch${batch}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# alexnet
+test alexnet.prototxt 512 alexnet 
+test alexnet.prototxt 1024 alexnet 
+
+# googlnet 
+test googlenet.prototxt 512 googlenet 
--- a/benchmark/caffe/image/smallnet_mnist_cifar.prototxt
+++ b/benchmark/caffe/image/smallnet_mnist_cifar.prototxt
@ -0,0 +1,198 @@
+name: "mnist/cifar"
+input: "data"
+input_dim: 128 
+input_dim: 3
+input_dim: 32 
+input_dim: 32 
+input: "label"
+input_dim: 128 
+input_dim: 1
+input_dim: 1
+input_dim: 1 
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 32
+    pad: 2
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.0001
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "pool1"
+  top: "pool1"
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 32
+    pad: 2
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "conv2"
+  top: "pool2"
+  pooling_param {
+    pool: AVE
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 64
+    pad: 2
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "pool3"
+  type: "Pooling"
+  bottom: "conv3"
+  top: "pool3"
+  pooling_param {
+    pool: AVE
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "ip1"
+  type: "InnerProduct"
+  bottom: "pool3"
+  top: "ip1"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 64
+    weight_filler {
+      type: "gaussian"
+      std: 0.1
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "ip2"
+  type: "InnerProduct"
+  bottom: "ip1"
+  top: "ip2"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 10
+    weight_filler {
+      type: "gaussian"
+      std: 0.1
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "accuracy"
+  type: "Accuracy"
+  bottom: "ip2"
+  bottom: "label"
+  top: "accuracy"
+  include {
+    phase: TEST
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "ip2"
+  bottom: "label"
+  top: "loss"
+}
--- a/benchmark/caffe/image/solver.prototxt
+++ b/benchmark/caffe/image/solver.prototxt
@ -0,0 +1,10 @@
+net: "alexnet.prototxt"
+base_lr: 0.01
+lr_policy: "fixed"
+display: 20
+max_iter: 200
+momentum: 0.9
+weight_decay: 0.0005
+snapshot: 10000
+snapshot_prefix: "models/caffe_alexnet_train"
+solver_mode: GPU
--- a/benchmark/figs/alexnet-4gpu.png
+++ b/benchmark/figs/alexnet-4gpu.png
--- a/benchmark/figs/googlenet-4gpu.png
+++ b/benchmark/figs/googlenet-4gpu.png
--- a/benchmark/figs/rnn_lstm_4gpus.png
+++ b/benchmark/figs/rnn_lstm_4gpus.png
--- a/benchmark/figs/rnn_lstm_cls.png
+++ b/benchmark/figs/rnn_lstm_cls.png
--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
@ -0,0 +1,64 @@
+#!/usr/bin/env python
+
+from paddle.trainer_config_helpers import *
+
+height = 227
+width = 227
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 128)
+
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+    "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+    batch_size=batch_size,
+    learning_rate=0.01 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+
+# conv1
+net = data_layer('data', size=height * width * 3)
+net = img_conv_layer(
+    input=net,
+    filter_size=11,
+    num_channels=3,
+    num_filters=96,
+    stride=4,
+    padding=1)
+net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
+net = img_pool_layer(input=net, pool_size=3, stride=2)
+
+# conv2
+net = img_conv_layer(
+    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
+net = img_pool_layer(input=net, pool_size=3, stride=2)
+
+# conv3
+net = img_conv_layer(
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1)
+# conv4
+net = img_conv_layer(
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+
+# conv5
+net = img_conv_layer(
+    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+net = img_pool_layer(input=net, pool_size=3, stride=2)
+
+net = fc_layer(
+    input=net,
+    size=4096,
+    act=ReluActivation(),
+    layer_attr=ExtraAttr(drop_rate=0.5))
+net = fc_layer(
+    input=net,
+    size=4096,
+    act=ReluActivation(),
+    layer_attr=ExtraAttr(drop_rate=0.5))
+net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
+
+lab = data_layer('label', num_class)
+loss = cross_entropy(input=net, label=lab)
+outputs(loss)
--- a/Show More
+++ b/Show More