Merge commit '12b619343b0e8e35cda6335f8230c4168277b8dd'

master
Yu Yang, 8 years ago
commit d539e78012

.gitignore (vendored)

@@ -1,2 +1,6 @@
*.DS_Store
build/
*.user
.vscode
.idea

@@ -0,0 +1,38 @@
language: cpp
cache: ccache
sudo: required
dist: trusty
env:
- JOB=DOCS
- JOB=BUILD_AND_TEST
addons:
apt:
packages:
- gcc-4.8
- g++-4.8
- wget
- git
- build-essential
- libatlas-base-dev
- python
- python-pip
- python2.7-dev
- m4
- libprotobuf-dev
- doxygen
- protobuf-compiler
- python-protobuf
- python-numpy
- python-wheel
- libgoogle-glog-dev
- libgflags-dev
- libgtest-dev
before_install:
- pip install wheel protobuf sphinx breathe recommonmark
- sudo paddle/scripts/travis/before_install.sh
script:
- paddle/scripts/travis/main.sh
notifications:
email:
on_success: change
on_failure: always

@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 8)
set(PADDLE_PATCH_VERSION 0b)
set(PADDLE_PATCH_VERSION 0b1)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
@@ -14,8 +14,10 @@ find_package(CUDA QUIET)
find_package(Protobuf REQUIRED)
find_package(PythonLibs 2.7 REQUIRED)
find_package(PythonInterp 2.7 REQUIRED)
find_package(NumPy)
find_package(ZLIB REQUIRED)
find_package(NumPy REQUIRED)
find_package(Threads REQUIRED)
find_package(AVX QUIET)
find_package(Glog)
find_package(Gflags QUIET)
find_package(GTest)
@@ -27,7 +29,7 @@ find_program(M4_EXECUTABLE m4)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked libraries" ON)
option(WITH_GPU "Compile PaddlePaddle with gpu" ${CUDA_FOUND})
option(WITH_DOUBLE "Compile PaddlePaddle with double precision, otherwise use single precision" OFF)
option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ON) # TODO(yuyang18): Check AVX is supported or not as default value
option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ${AVX_FOUND})
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ${PYTHONINTERP_FOUND})
option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
@@ -37,6 +39,7 @@ option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
option(ON_TRAVIS "Running test on travis-ci or not." OFF)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
@@ -99,8 +102,8 @@ if(NOT WITH_TIMER)
endif(NOT WITH_TIMER)
if(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}")
else(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3")

@@ -1,4 +1,11 @@
# PaddlePaddle
[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
Welcome to the PaddlePaddle GitHub.
The software will be released on Sept. 30 with full documentation and installation support.
A pre-release version is available now for those who are eager to take a look.
PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use,
efficient, flexible and scalable deep learning platform, which is originally

@@ -0,0 +1,65 @@
# This file is used to check the level of AVX support on your machine,
# so that PaddlePaddle can unleash the vectorization power of multicore CPUs.
INCLUDE(CheckCXXSourceRuns)
SET(FIND_AVX_10)
SET(FIND_AVX_20)
SET(AVX_FLAGS)
SET(AVX_FOUND)
# Check AVX 2
SET(CMAKE_REQUIRED_FLAGS)
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(CMAKE_REQUIRED_FLAGS "-mavx2")
ELSEIF(MSVC AND NOT CMAKE_CL_64) # reserve for WINDOWS
SET(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
ENDIF()
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
return 0;
}" FIND_AVX_20)
# Check AVX
SET(CMAKE_REQUIRED_FLAGS)
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(CMAKE_REQUIRED_FLAGS "-mavx")
ELSEIF(MSVC AND NOT CMAKE_CL_64)
SET(CMAKE_REQUIRED_FLAGS "/arch:AVX")
endif()
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b);
return 0;
}" FIND_AVX_10)
IF(${FIND_AVX_20})
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(AVX_FLAGS "${AVX_FLAGS} -mavx2")
ELSEIF(MSVC)
SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX2")
ENDIF()
ENDIF()
IF(${FIND_AVX_10})
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(AVX_FLAGS "${AVX_FLAGS} -mavx")
ELSEIF(MSVC)
SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX")
ENDIF()
ENDIF()
IF("${FIND_AVX_10}" OR "${FIND_AVX_20}")
SET(AVX_FOUND TRUE)
MESSAGE(STATUS "Found CPU AVX support; compiler flags:${AVX_FLAGS}")
ENDIF()
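A side note for readers: what these `CHECK_CXX_SOURCE_RUNS` probes establish at configure time can also be inspected at runtime. Below is a hypothetical Python sketch (not part of this commit) that reads `/proc/cpuinfo` on Linux to derive the same compiler flags:

```python
# Minimal sketch: map the CPU feature flags in /proc/cpuinfo to the AVX
# compiler flags chosen above. Illustration only; non-Linux systems fall
# back to an empty list.
def detect_avx_flags(cpuinfo_path="/proc/cpuinfo"):
    try:
        with open(cpuinfo_path) as f:
            text = f.read()
    except OSError:
        return []
    for line in text.splitlines():
        if line.startswith("flags"):
            features = set(line.split(":", 1)[1].split())
            flags = []
            if "avx2" in features:
                flags.append("-mavx2")
            if "avx" in features:
                flags.append("-mavx")
            return flags
    return []

if __name__ == "__main__":
    print("AVX compiler flags:", " ".join(detect_avx_flags()) or "(none)")
```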

@@ -65,12 +65,14 @@ set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
set(OPENBLAS_INCLUDE_SEARCH_PATHS
${OPENBLAS_ROOT}/include
/usr/include
/usr/include/openblas)
/usr/include/openblas
/usr/local/opt/openblas/include)
set(OPENBLAS_LIB_SEARCH_PATHS
${OPENBLAS_ROOT}/lib
/usr/lib
/usr/lib/blas/openblas
/usr/lib/openblas)
/usr/lib/openblas
/usr/local/opt/openblas/lib)
find_path(OPENBLAS_INC_DIR NAMES cblas.h
PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})

@@ -15,7 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
$ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib
/usr/lib)
find_library(CUDNN_LIBRARY NAMES libcudnn.so # libcudnn_static.a
find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a
PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
NO_DEFAULT_PATH
DOC "Path to cuDNN library.")

@@ -8,7 +8,7 @@ include(CheckCXXSymbolExists)
# is_c: is C flag or C++ flag, bool type.
# src_list: The list name which the flag name will be append to.
# flag_name: the flag name for compiler, such as '-Werror' '-Wall' etc
# rest arguments: not used.
function(safe_set_flag is_c src_list flag_name)
string(REPLACE "-" "_" safe_name ${flag_name})
string(REPLACE "=" "_" safe_name ${safe_name})
@@ -44,7 +44,7 @@ CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
if(NOT UINT64_MAX_EXISTS)
set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS)
CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS_HERE)
if(UINT64_MAX_EXISTS_HERE)
set(CMAKE_REQUIRED_DEFINITIONS)
add_definitions(-D__STDC_LIMIT_MACROS)
else()
@@ -74,13 +74,37 @@ endforeach()
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
function(specify_cuda_arch cuda_version cuda_arch)
if(${cuda_version} VERSION_GREATER "8.0")
foreach(capability 61 62)
if(${cuda_arch} STREQUAL ${capability})
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endforeach()
elseif(${cuda_version} VERSION_GREATER "7.0" AND ${cuda_arch} STREQUAL "53")
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endfunction()
# Common gpu architectures: Kepler, Maxwell
foreach(capability 30 35 50)
list(APPEND __arch_flags "-gencode arch=compute_${capability},code=sm_${capability}")
list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
endforeach()
if (CUDA_VERSION VERSION_GREATER "7.0")
list(APPEND __arch_flags "-gencode arch=compute_52,code=sm_52")
list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
endif()
set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
# Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
endif()
# Custom gpu architecture
set(CUDA_ARCH)
if(CUDA_ARCH)
specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
endif()
set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
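To make the `-gencode` logic above easier to follow, here is a hypothetical Python sketch (not part of the repository) that mirrors the same rules: Kepler/Maxwell architectures are always added, sm_52 requires CUDA newer than 7.0, sm_60 (Pascal) requires CUDA newer than 8.0, and a custom architecture goes through the `specify_cuda_arch` version checks:

```python
# Illustrative re-statement of the CMake logic above. cuda_version is a
# (major, minor) tuple, e.g. (7, 5) for CUDA 7.5.
def nvcc_arch_flags(cuda_version, custom_arch=None):
    def gencode(cap):
        return "-gencode arch=compute_{0},code=sm_{0}".format(cap)

    # Common GPU architectures: Kepler, Maxwell.
    flags = [gencode(cap) for cap in (30, 35, 50)]
    if cuda_version > (7, 0):
        flags.append(gencode(52))
    # Modern GPU architectures: Pascal.
    if cuda_version > (8, 0):
        flags.append(gencode(60))
    # Custom GPU architecture, mirroring specify_cuda_arch above:
    # 61/62 need CUDA > 8.0, 53 needs CUDA > 7.0.
    if custom_arch in (61, 62) and cuda_version > (8, 0):
        flags.append(gencode(custom_arch))
    elif custom_arch == 53 and cuda_version > (7, 0):
        flags.append(gencode(custom_arch))
    return flags

print(nvcc_arch_flags((8, 5), custom_arch=61))
```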

@@ -1,16 +1,55 @@
# Some common routine for paddle compile.
# target_circle_link_libraries
# Link libraries to target which has circle dependencies.
#
# First argument: the target name to be linked with the libraries
# Rest arguments: the libraries to link together.
function(target_circle_link_libraries TARGET_NAME)
target_link_libraries(${TARGET_NAME}
-Wl,--start-group
${ARGN}
-Wl,--end-group)
if(APPLE)
set(LIBS)
set(inArchive OFF)
set(libsInArgn)
foreach(arg ${ARGN})
if(${arg} STREQUAL "ARCHIVE_START")
set(inArchive ON)
elseif(${arg} STREQUAL "ARCHIVE_END")
set(inArchive OFF)
else()
if(inArchive)
list(APPEND LIBS "-Wl,-force_load")
endif()
list(APPEND LIBS ${arg})
list(APPEND libsInArgn ${arg})
endif()
endforeach()
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
list(APPEND LIBS "-undefined dynamic_lookup")
endif()
list(REVERSE libsInArgn)
target_link_libraries(${TARGET_NAME}
${LIBS}
${libsInArgn})
else() # LINUX
set(LIBS)
foreach(arg ${ARGN})
if(${arg} STREQUAL "ARCHIVE_START")
list(APPEND LIBS "-Wl,--whole-archive")
elseif(${arg} STREQUAL "ARCHIVE_END")
list(APPEND LIBS "-Wl,--no-whole-archive")
else()
list(APPEND LIBS ${arg})
endif()
endforeach()
target_link_libraries(${TARGET_NAME}
"-Wl,--start-group"
${LIBS}
"-Wl,--end-group")
endif()
endfunction()
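As a rough illustration of what `target_circle_link_libraries` does with the `ARCHIVE_START`/`ARCHIVE_END` markers, here is a simplified, hypothetical Python sketch of the flag expansion. (The real macOS branch above also re-appends a reversed copy of the library list and may add `-undefined dynamic_lookup` for Clang; both are omitted here for brevity.)

```python
# Sketch: on Linux the markers become --whole-archive/--no-whole-archive and
# everything is wrapped in a --start-group/--end-group pair to resolve
# circular dependencies; on macOS each library inside the markers is
# prefixed with -force_load instead.
def expand_archive_markers(args, apple=False):
    flags = []
    if apple:
        in_archive = False
        for arg in args:
            if arg == "ARCHIVE_START":
                in_archive = True
            elif arg == "ARCHIVE_END":
                in_archive = False
            else:
                if in_archive:
                    flags.append("-Wl,-force_load")
                flags.append(arg)
    else:
        flags.append("-Wl,--start-group")
        for arg in args:
            if arg == "ARCHIVE_START":
                flags.append("-Wl,--whole-archive")
            elif arg == "ARCHIVE_END":
                flags.append("-Wl,--no-whole-archive")
            else:
                flags.append(arg)
        flags.append("-Wl,--end-group")
    return flags

# Example mirroring the link_paddle_exe usage below.
print(expand_archive_markers(
    ["ARCHIVE_START", "paddle_gserver", "ARCHIVE_END", "paddle_pserver"]))
```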
# compile_cu_as_cpp
@@ -41,20 +80,20 @@ function(link_paddle_exe TARGET_NAME)
if(PADDLE_WITH_INTERNAL)
set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter)
target_circle_link_libraries(${TARGET_NAME}
-Wl,--whole-archive
ARCHIVE_START
paddle_internal_gserver
paddle_internal_owlqn
-Wl,--no-whole-archive
ARCHIVE_END
paddle_internal_parameter)
else()
set(INTERAL_LIBS "")
endif()
target_circle_link_libraries(${TARGET_NAME}
-Wl,--whole-archive
ARCHIVE_START
paddle_gserver
${METRIC_LIBS}
-Wl,--no-whole-archive
ARCHIVE_END
paddle_pserver
paddle_trainer_lib
paddle_network
@@ -67,9 +106,9 @@ function(link_paddle_exe TARGET_NAME)
${PROTOBUF_LIBRARY}
${CMAKE_THREAD_LIBS_INIT}
${CBLAS_LIBS}
${CMAKE_DL_LIBS}
${ZLIB_LIBRARIES}
${INTERAL_LIBS}
-lz)
${CMAKE_DL_LIBS})
if(WITH_PYTHON)
target_link_libraries(${TARGET_NAME}

@@ -20,9 +20,8 @@ from optparse import OptionParser
import paddle.utils.image_util as image_util
from py_paddle import swig_paddle, util
from py_paddle import DataProviderWrapperConverter
from paddle.trainer.PyDataProviderWrapper import DenseSlot
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config
logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
@@ -75,8 +74,8 @@ class ImageClassifier():
self.network.loadParameters(self.model_dir)
data_size = 3 * self.crop_dims[0] * self.crop_dims[1]
slots = [DenseSlot(data_size)]
self.converter = util.DataProviderWrapperConverter(False, slots)
slots = [dense_vector(data_size)]
self.converter = DataProviderConverter(slots)
def get_data(self, img_path):
"""

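For readers tracking the API migration in these demo changes: `DenseSlot` plus `DataProviderWrapperConverter` gives way to PyDataProvider2 input types plus `DataProviderConverter`. A hedged usage sketch of the new path, with made-up values and assuming a built PaddlePaddle where `py_paddle` is importable:

```python
# Sketch of the new conversion path shown in this diff: declare input types
# with PyDataProvider2 helpers, then feed plain Python data through
# DataProviderConverter. Values are made up for illustration.
from py_paddle import DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector

data_size = 3 * 32 * 32  # e.g. a 3x32x32 cropped image, as in the demo
converter = DataProviderConverter([dense_vector(data_size)])

batch = [[[0.0] * data_size]]  # one sample carrying one dense slot
# Assuming the converter is callable as in the demos, this yields the swig
# Arguments object that GradientMachine.forwardTest() consumes.
arguments = converter(batch)
```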
@@ -14,8 +14,6 @@
# limitations under the License.
set -e
export PYTHONPATH=$PYTHONPATH:../../
data_dir=./data/cifar-out
python preprocess.py -i $data_dir -s 32 -c 1

@@ -22,9 +22,8 @@ from optparse import OptionParser
import paddle.utils.image_util as image_util
from py_paddle import swig_paddle, util
from py_paddle import DataProviderWrapperConverter
from paddle.trainer.PyDataProviderWrapper import DenseSlot
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config
logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
@@ -85,9 +84,8 @@ class ImageClassifier():
self.network.loadParameters(self.model_dir)
data_size = 3 * self.crop_dims[0] * self.crop_dims[1]
slots = [DenseSlot(data_size)]
is_sequence = False
self.converter = util.DataProviderWrapperConverter(is_sequence, slots)
slots = [dense_vector(data_size)]
self.converter = DataProviderConverter(slots)
def get_data(self, img_path):
"""

@@ -1,4 +1,4 @@
#!/bin/sh
#!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,7 +33,7 @@ test_num=$((min_len/10))
if [ $test_num -gt 12500 ];then
test_num=12500
fi
train_num=((min_len-test_num))
train_num=$((min_len-test_num))
head -n$train_num pos.shuffed >train.pos
head -n$train_num neg.shuffed >train.neg

@@ -12,15 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
try:
import cPickle as pickle
except ImportError:
import pickle
from paddle.trainer.PyDataProvider2 import *
import common_utils # parse
def hook(settings, meta, **kwargs):
"""
Init hook is invoked before process data. It will set obj.slots and store
@@ -47,7 +41,6 @@ def hook(settings, meta, **kwargs):
settings.input_types = headers
settings.meta = meta
@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename):
with open(filename, 'r') as f:

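For context, a complete minimal provider in the PyDataProvider2 style used above might look like the following sketch; the file format and input dimensions are hypothetical:

```python
# Hypothetical minimal PyDataProvider2 provider: the init hook fixes
# input_types (a dense feature vector plus a binary label), then process
# yields one sample per line of a whitespace-separated text file.
from paddle.trainer.PyDataProvider2 import (provider, dense_vector,
                                            integer_value, CacheType)

def hook(settings, input_dim=10, **kwargs):
    settings.input_types = [dense_vector(input_dim), integer_value(2)]

@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename):
    with open(filename, 'r') as f:
        for line in f:
            fields = line.strip().split()
            label = int(fields[0])
            features = [float(x) for x in fields[1:]]
            yield features, label
```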
@@ -15,12 +15,12 @@
import os
import numpy as np
from optparse import OptionParser
from py_paddle import swig_paddle, util, DataProviderWrapperConverter
from paddle.trainer.PyDataProviderWrapper import IndexSlot
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import integer_value_sequence
from paddle.trainer.config_parser import parse_config
"""
Usage: run following command to show help message.
python predict.py -h
"""
UNK_IDX = 0
@@ -43,16 +43,22 @@ class Prediction():
conf = parse_config(
train_conf,
'dict_len=' + str(len_dict) +
',label_len=' + str(len_label) +
',is_predict=True')
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
self.network.loadParameters(model_dir)
slots = [IndexSlot(len_dict), IndexSlot(len_dict), IndexSlot(len_dict),
IndexSlot(len_dict), IndexSlot(len_dict), IndexSlot(2)]
self.converter = util.DataProviderWrapperConverter(True, slots)
slots = [
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(2)
]
self.converter = DataProviderConverter(slots)
def load_dict_label(self, dict_file, label_file):
"""
@@ -109,7 +115,7 @@ class Prediction():
def option_parser():
usage = ("python predict.py -c config -w model_dir "
"-d word dictionary -l label_file -i input_file")
parser = OptionParser(usage="usage: %s [options]" % usage)
parser.add_option(

@@ -15,13 +15,13 @@
import os
import numpy as np
from optparse import OptionParser
from py_paddle import swig_paddle, util, DataProviderWrapperConverter
from paddle.trainer.PyDataProviderWrapper import IndexSlot
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import integer_value_sequence
from paddle.trainer.config_parser import parse_config
"""
Usage: run following command to show help message.
python predict.py -h
"""
class SentimentPrediction():
@@ -46,8 +46,8 @@ class SentimentPrediction():
conf = parse_config(train_conf, "is_predict=1")
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
self.network.loadParameters(self.model_dir)
slots = [IndexSlot(self.dict_dim)]
self.converter = util.DataProviderWrapperConverter(True, slots)
slots = [integer_value_sequence(self.dict_dim)]
self.converter = DataProviderConverter(slots)
def load_dict(self):
"""

@@ -65,7 +65,7 @@ def bidirectional_lstm_net(input_dim,
bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim)
dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
output = fc_layer(input=dropout, size=class_dim,
act_type=SoftmaxActivation())
act=SoftmaxActivation())
if not is_predict:
lbl = data_layer("label", 1)

@@ -128,12 +128,16 @@ def gru_encoder_decoder(data_conf,
return out
decoder_group_name = "decoder_group"
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
if not is_generating:
trg_embedding = embedding_layer(
input=data_layer(name='target_language_word',
size=target_dict_dim),
size=word_vector_dim,
param_attr=ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For a decoder equipped with attention mechanism, in training,
# target embedding (the ground truth) is the data input,
@@ -142,22 +146,13 @@ def gru_encoder_decoder(data_conf,
# for the recurrent_group.
decoder = recurrent_group(name=decoder_group_name,
step=gru_decoder_with_attention,
input=[
StaticInput(input=encoded_vector,
is_seq=True),
StaticInput(input=encoded_proj,
is_seq=True), trg_embedding
])
input=group_inputs)
lbl = data_layer(name='target_language_next_word',
size=target_dict_dim)
cost = classification_cost(input=decoder, label=lbl, )
cost = classification_cost(input=decoder, label=lbl)
outputs(cost)
else:
gen_inputs = [StaticInput(input=encoded_vector,
is_seq=True),
StaticInput(input=encoded_proj,
is_seq=True), ]
# In generation, the decoder predicts the next target word based on
# the encoded source sequence and the last generated target word.
@@ -171,16 +166,18 @@ def gru_encoder_decoder(data_conf,
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
gen_inputs.append(trg_embedding)
group_inputs.append(trg_embedding)
beam_gen = beam_search(name=decoder_group_name,
step=gru_decoder_with_attention,
input=gen_inputs,
id_input=data_layer(name="sent_id",
size=1),
dict_file=trg_dict_path,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length,
result_file=gen_trans_file)
max_length=max_length)
seqtext_printer_evaluator(input=beam_gen,
id_input=data_layer(name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
outputs(beam_gen)

@@ -1,6 +1,3 @@
if(NOT DEFINED SPHINX_THEME)
set(SPHINX_THEME default)
endif()
@@ -46,4 +43,4 @@ sphinx_add_target(paddle_docs
add_dependencies(paddle_docs
gen_proto_py
paddle_doxygen_docs)

@@ -30,7 +30,7 @@ Then at the :code:`process` function, each :code:`yield` function will return th
yield src_ids, trg_ids, trg_ids_next
For a more detailed description of how to write a data provider, please refer to :doc:`Python Data Provider <../py_data_provider_wrapper>`. The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
For a more detailed description of how to write a data provider, please refer to `PyDataProvider2 <../../ui/data_provider/index.html>`_. The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
===============================================
Configure Recurrent Neural Network Architecture
@@ -106,7 +106,7 @@ We will use the sequence to sequence model with attention as an example to demonstrate
In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural network. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` are called the *encoder vector*. The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`.
The encoder part of the model is listed below. It calls :code:`grumemory` to represent a gated recurrent neural network. It is the recommended way of using a recurrent neural network if the network architecture is simple, because it is faster than :code:`recurrent_group`. We have implemented most of the commonly used recurrent neural network architectures; you can refer to :doc:`Layers <../trainer_config_helpers/layers>` for more details.
The encoder part of the model is listed below. It calls :code:`grumemory` to represent a gated recurrent neural network. It is the recommended way of using a recurrent neural network if the network architecture is simple, because it is faster than :code:`recurrent_group`. We have implemented most of the commonly used recurrent neural network architectures; you can refer to `Layers <../../ui/api/trainer_config_helpers/layers_index.html>`_ for more details.
We also project the encoder vector to :code:`decoder_size` dimensional space, get the first instance of the backward recurrent network, and project it to :code:`decoder_size` dimensional space:
@@ -143,11 +143,15 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network.
.. code-block:: python
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
trg_embedding = embedding_layer(
input=data_layer(name='target_language_word',
size=target_dict_dim),
size=word_vector_dim,
param_attr=ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For a decoder equipped with attention mechanism, in training,
# target embedding (the ground truth) is the data input,
# while the encoded source sequence is accessed as an unbounded memory.
@@ -156,13 +160,7 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network.
# All sequence inputs should have the same length.
decoder = recurrent_group(name=decoder_group_name,
step=gru_decoder_with_attention,
input=[
StaticInput(input=encoded_vector,
is_seq=True),
StaticInput(input=encoded_proj,
is_seq=True),
trg_embedding
])
input=group_inputs)
The implementation of the step function is listed as below. First, it defines the **memory** of the decoder network. Then it defines attention, gated recurrent unit step function, and the output function:
@@ -205,22 +203,23 @@ After training the model, we can use it to generate sequences. A common practice
* use :code:`GeneratedInput` for trg_embedding. :code:`GeneratedInput` computes the embedding of the generated token at the last time step for the input at the current time step.
* use :code:`beam_search` function. This function needs to set:
- :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files.
- :code:`dict_file`: the dictionary file for converting word id to word.
- :code:`bos_id`: the start token. Every sentence starts with the start token.
- :code:`eos_id`: the end token. Every sentence ends with the end token.
- :code:`beam_size`: the beam size used in beam search.
- :code:`max_length`: the maximum length of the generated sentences.
- :code:`result_file`: the path of the generation result file.
* use :code:`seqtext_printer_evaluator` to print text according to index matrix and dictionary. This function needs to set:
- :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files.
- :code:`dict_file`: the dictionary file for converting word id to word.
- :code:`result_file`: the path of the generation result file.
The code is listed below:
.. code-block:: python
gen_inputs = [StaticInput(input=encoded_vector,
is_seq=True),
StaticInput(input=encoded_proj,
is_seq=True), ]
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
# In generation, the decoder predicts the next target word based on
# the encoded source sequence and the last generated target word.
# The encoded source sequence (encoder's output) must be specified by
@@ -231,21 +230,22 @@ The code is listed below:
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
gen_inputs.append(trg_embedding)
group_inputs.append(trg_embedding)
beam_gen = beam_search(name=decoder_group_name,
step=gru_decoder_with_attention,
input=gen_inputs,
id_input=data_layer(name="sent_id",
size=1),
dict_file=trg_dict_path,
input=group_inputs,
bos_id=0, # Beginning token.
eos_id=1, # End of sentence token.
beam_size=beam_size,
max_length=max_length,
result_file=gen_trans_file)
max_length=max_length)
seqtext_printer_evaluator(input=beam_gen,
id_input=data_layer(name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
outputs(beam_gen)
Notice that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to :doc:`Semantic Role Labeling Demo <../../../demo/semantic_role_labeling>` for more details.
Notice that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to `Semantic Role Labeling Demo <../../demo/semantic_role_labeling/index.html>`_ for more details.
The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`.

File diff suppressed because it is too large.

@@ -25,9 +25,12 @@ repo or just head straight to the command line:
```shell
# Clone your fork to your local machine
git clone git@github.com:USERNAME/paddle.git
git clone https://github.com/USERNAME/Paddle.git
```
Then you can start to develop by making a local development branch
```shell
git checkout -b MY_COOL_STUFF_BRANCH origin/master
```
Then you can start to develop.
## Commit
@@ -45,14 +48,14 @@ are the details if any.
## Keeping Fork Up to Date
Before pull your request, you shold sync you code from the latest PaddlePaddle.
Before pull your request, you should sync your code from the latest PaddlePaddle.
To do this, you'll need to add a remote first:
```shell
# see the current configured remote repository
git remote -v
# add upstream repository
git remote add upstream https://github.com/paddle/paddle.git
git remote add upstream https://github.com/baidu/Paddle.git
# verify the new upstream
git remote -v
```
@@ -60,8 +63,7 @@ git remote -v
Update your fork with the latest upstream changes:
```shell
git fetch upstream
git pull upstream master
git pull --rebase upstream HEAD
```
If there are no unique commits locally, git will simply perform a fast-forward.
@@ -74,10 +76,26 @@ Now, your local master branch is up-to-date with everything modified upstream.
```shell
# push to your repository in Github
git push origin master
git push origin HEAD
```
## Pull Request
Go to the page for your fork on GitHub, select your development branch,
and click the **pull request button**.
## Update your pull request with the latest version
During the code review, your pull request may become stale because of new commits in
baidu/Paddle. GitHub allows an automatic update if there is no conflict; you can do this
by clicking the "Update Branch" button on your pull request page. In the case of a
conflict, however, you need to do the update manually on your local repository:
```shell
git checkout MY_COOL_STUFF_BRANCH
git pull --rebase upstream HEAD
# You may need to resolve the conflict according to the git prompt.
# Make and test your code.
git push -f origin HEAD
```
Now your Pull Request is updated with the latest version.

@@ -0,0 +1,91 @@
Docker installation guide
=========================
PaddlePaddle provides pre-compiled binaries, including Docker images and Ubuntu deb packages. Contributions of installation packages for other Linux distributions (such as Ubuntu, CentOS, Debian, Gentoo and so on) are welcome. We recommend using Docker images to deploy PaddlePaddle.
## Docker installation
Docker is a tool designed to make it easier to create, deploy, and run applications by using containers.
### PaddlePaddle Docker images
There are six Docker images:
- paddledev/paddle:cpu-latest: PaddlePaddle CPU binary image.
- paddledev/paddle:gpu-latest: PaddlePaddle GPU binary image.
- paddledev/paddle:cpu-devel-latest: PaddlePaddle CPU binary image plus source code.
- paddledev/paddle:gpu-devel-latest: PaddlePaddle GPU binary image plus source code.
- paddledev/paddle:cpu-demo-latest: PaddlePaddle CPU binary image plus source code and demo
- paddledev/paddle:gpu-demo-latest: PaddlePaddle GPU binary image plus source code and demo
Tags with `latest` will be replaced by a released version.
### Download and Run Docker images
You first have to install Docker on a machine whose Linux kernel version is 3.10+. You can refer to the official guide https://docs.docker.com/engine/installation/ for further information.
You can use ```docker pull``` to download images first, or just launch a container with ```docker run```:
```bash
docker run -it paddledev/paddle:cpu-latest
```
If you want to launch a container with GPU support, you need to set some environment variables at the same time:
```bash
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
```
### Notice
#### Performance
Since Docker is based on lightweight virtual containers, CPU computing performance holds up well. The GPU driver and devices are all mapped into the container, so GPU computing performance is not seriously affected either.
If you use a high-performance NIC, such as RDMA (RoCE 40GbE or IB 56GbE) or Ethernet (10GbE), it is recommended to use the `--net=host` option.
#### Remote access
If you want to enable ssh access in the background, you need to build an image by yourself. Please refer to the official guide https://docs.docker.com/engine/reference/builder/ for further information.
The following is a simple Dockerfile with ssh:
```dockerfile
FROM paddledev/paddle
MAINTAINER PaddlePaddle dev team <paddle-dev@baidu.com>
RUN apt-get update
RUN apt-get install -y openssh-server
RUN mkdir /var/run/sshd
RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22
CMD ["/usr/sbin/sshd", "-D"]
```
Then you can build an image with the Dockerfile and launch a container:
```bash
# cd into Dockerfile directory
docker build . -t paddle_ssh
# run container, and map host machine port 8022 to container port 22
docker run -d -p 8022:22 --name paddle_ssh_machine paddle_ssh
```
Now you can ssh on port 8022 to access the container; the username is root and the password is also root:
```bash
ssh -p 8022 root@YOUR_HOST_MACHINE
```
You can stop and delete the container as follows:
```bash
# stop
docker stop paddle_ssh_machine
# delete
docker rm paddle_ssh_machine
```

@@ -5,9 +5,11 @@ Install PaddlePaddle
----------------------
.. toctree::
:maxdepth: 1
:glob:
install_*
internal/install_from_jumbo.md
Build from Source
-----------------
@@ -15,20 +17,24 @@ Build from Source
If you want to hack and contribute to the PaddlePaddle source code, the following guides can help you\:
.. toctree::
:maxdepth: 1
:glob:
build_from_source.md
contribute_to_paddle.md
Build Docker Images
-------------------
Docker and Debian Package installation
--------------------------------------
Note: The intallation packages are still in pre-release
Note: The installation packages are still in pre-release
state and your experience of installation may not be smooth.
If you want to pack a docker image, the following guide can help you\:
.. toctree::
:maxdepth: 1
:glob:
docker/*
docker_install.md
ubuntu_install.md

@@ -0,0 +1,21 @@
Debian Package installation guide
=================================
## Debian Package installation
Currently, PaddlePaddle only provides Ubuntu 14.04 debian packages.
There are two package versions, CPU and GPU. The download address is:
https://github.com/baidu/Paddle/releases/tag/V0.8.0b0
After downloading PaddlePaddle deb packages, you can run:
```bash
dpkg -i paddle-0.8.0b-cpu.deb
apt-get install -f
```
If you use the GPU version of the deb package, you need to install the CUDA toolkit and cuDNN, and set the related environment variables (such as LD_LIBRARY_PATH) first. It is normal for `dpkg -i` to report errors; `apt-get install -f` will continue installing paddle and pull in its dependencies.
**Note**
The PaddlePaddle package only supports x86 CPUs with AVX instructions. If your CPU does not support AVX, you have to download the source code and build from it.

Some files were not shown because too many files have changed in this diff.
