Merge branch 'develop' of github.com:PaddlePaddle/Paddle into rnn_varilen_design

8 years ago · ddafe5ceeb
parent 1328060abc da4ce23241
commit ddafe5ceeb
88 changed files with 2635 additions and 516 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -36,6 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@ -74,6 +76,10 @@ if(ANDROID)
        "Disable PYTHON when cross-compiling for Android" FORCE)
    set(WITH_RDMA OFF CACHE STRING
        "Disable RDMA when cross-compiling for Android" FORCE)
+    set(WITH_MKLDNN OFF CACHE STRING
+        "Disable MKLDNN when cross-compiling for Android" FORCE)
+    set(WITH_MKLML OFF CACHE STRING
+        "Disable MKLML package when cross-compiling for Android" FORCE)
 endif(ANDROID)

 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@ -87,6 +93,7 @@ endif()

 ########################################################################################

+include(external/mklml)     # download mklml package
 include(external/zlib)      # download, build, install zlib
 include(external/gflags)    # download, build, install gflags
 include(external/glog)      # download, build, install glog
@ -94,6 +101,7 @@ include(external/gtest)     # download, build, install gtest
 include(external/protobuf)  # download, build, install protobuf
 include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
+include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
@ -135,6 +143,10 @@ if(WITH_GPU)
    endif(NOT WITH_DSO)
 endif(WITH_GPU)

+if(WITH_MKLDNN)
+    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKLDNN_IOMP_LIB})
+endif()
+
 if(USE_NNPACK)
    include(external/nnpack)
    list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@ -15,23 +15,44 @@

 set(CBLAS_FOUND OFF)

-## Find MKL First.
-set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
-set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
+## Find MKLML First.
+if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
+  set(CBLAS_FOUND ON)
+  set(CBLAS_PROVIDER MKLML)
+  set(CBLAS_INC_DIR ${MKLML_INC_DIR})
+  set(CBLAS_LIBRARIES ${MKLML_LIB})
+
+  add_definitions(-DPADDLE_USE_MKLML)
+  add_definitions(-DLAPACK_FOUND)
+
+  message(STATUS "Found cblas and lapack in MKLML "
+    "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  return()
+endif()
+
+## Then find MKL.
+set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs")
+set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL")
+
+set(MKL_INCLUDE_SEARCH_PATHS
+  ${MKL_ROOT}/include
+  ${INTEL_MKL_ROOT}/include)
+set(MKL_LIB_SEARCH_PATHS
+  ${MKL_ROOT}/lib
+  ${MKL_ROOT}/lib/intel64
+  ${INTEL_MKL_ROOT}/lib
+  ${INTEL_MKL_ROOT}/lib/intel64)

 find_path(MKL_INC_DIR mkl.h PATHS
-  ${MKL_ROOT}/include)
+  ${MKL_INCLUDE_SEARCH_PATHS})
 find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
-  ${MKL_ROOT}/include)
+  ${MKL_INCLUDE_SEARCH_PATHS})
 find_library(MKL_CORE_LIB NAMES mkl_core PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
+  ${MKL_LIB_SEARCH_PATHS})
 find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
+  ${MKL_LIB_SEARCH_PATHS})
 find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
+  ${MKL_LIB_SEARCH_PATHS})

 if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
  set(CBLAS_FOUND ON)
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -67,6 +67,30 @@ else()
    include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)

+if(WITH_MKLDNN)
+    add_definitions(-DPADDLE_USE_MKLDNN)
+    if (WITH_MKLML AND MKLDNN_IOMP_DIR)
+        message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
+        set(OPENMP_FLAGS "-fopenmp")
+        set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+        set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
+    else()
+        find_package(OpenMP)
+        if(OPENMP_FOUND)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+        else()
+            message(WARNING "Can not find OpenMP."
+                 "Some performance features in MKLDNN may not be available")
+        endif()
+    endif()
+
+endif(WITH_MKLDNN)
+
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")

--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@ -34,9 +34,15 @@ IF(WITH_TESTING)
            "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
    ENDIF(WIN32)

+    IF(WITH_MKLML)
+        # wait for mklml downloading completed
+        SET(GTEST_DEPENDS   ${MKLML_PROJECT})
+    ENDIF()
+
    ExternalProject_Add(
        extern_gtest
        ${EXTERNAL_PROJECT_LOG_ARGS}
+        DEPENDS         ${GTEST_DEPENDS}
        GIT_REPOSITORY  "https://github.com/google/googletest.git"
        GIT_TAG         "release-1.8.0"
        PREFIX          ${GTEST_SOURCES_DIR}
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -0,0 +1,72 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(NOT ${WITH_MKLDNN})
+  return()
+ENDIF(NOT ${WITH_MKLDNN})
+
+INCLUDE(ExternalProject)
+
+SET(MKLDNN_PROJECT        "extern_mkldnn")
+SET(MKLDNN_SOURCES_DIR    ${THIRD_PARTY_PATH}/mkldnn)
+SET(MKLDNN_INSTALL_ROOT   ${CMAKE_INSTALL_PREFIX})
+IF(NOT "$ENV{HOME}" STREQUAL "/root")
+    SET(MKLDNN_INSTALL_ROOT  "$ENV{HOME}")
+ENDIF()
+
+SET(MKLDNN_INSTALL_DIR    "${MKLDNN_INSTALL_ROOT}/opt/paddle/third_party/mkldnn")
+SET(MKLDNN_INCLUDE_DIR    "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
+
+IF(WIN32)
+    MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet."
+      "Force WITH_MKLDNN=OFF")
+    SET(WITH_MKLDNN OFF)
+    return()
+ELSE(WIN32)
+    SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
+    MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
+    SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+    #SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS
+    SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})
+
+IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
+    SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
+    SET(MKLDNN_MKLROOT   ${MKLML_ROOT})
+    SET(MKLDNN_IOMP_LIB  ${MKLML_IOMP_LIB})
+    SET(MKLDNN_IOMP_DIR  ${MKLML_LIB_DIR})
+ENDIF()
+
+ExternalProject_Add(
+    ${MKLDNN_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    DEPENDS             ${MKLDNN_DEPENDS}
+    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
+    GIT_TAG             "v0.9"
+    PREFIX              ${MKLDNN_SOURCES_DIR}
+    CONFIGURE_COMMAND   mkdir -p <SOURCE_DIR>/build
+    BUILD_COMMAND       cd <SOURCE_DIR>/build
+                        && cmake .. -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT}
+                        && $(MAKE)
+    INSTALL_COMMAND     cd <SOURCE_DIR>/build && $(MAKE) install
+    UPDATE_COMMAND      ""
+)
+
+ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY})
+ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
+MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}")
+LIST(APPEND external_project_dependencies mkldnn)
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@ -0,0 +1,64 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(NOT ${WITH_MKLML})
+  return()
+ENDIF(NOT ${WITH_MKLML})
+
+INCLUDE(ExternalProject)
+
+SET(MKLML_PROJECT       "extern_mklml")
+SET(MKLML_VER           "mklml_lnx_2018.0.20170425")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
+SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
+SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
+SET(MKLML_DST_DIR       "opt/paddle/third_party/mklml")
+SET(MKLML_INSTALL_ROOT  "${CMAKE_INSTALL_PREFIX}")
+IF(NOT "$ENV{HOME}" STREQUAL "/root")
+    SET(MKLML_INSTALL_ROOT  "$ENV{HOME}")
+ENDIF()
+
+SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
+SET(MKLML_ROOT          ${MKLML_INSTALL_DIR}/${MKLML_VER})
+SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
+SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
+SET(MKLML_LIB           ${MKLML_LIB_DIR}/libmklml_intel.so)
+SET(MKLML_IOMP_LIB      ${MKLML_LIB_DIR}/libiomp5.so)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
+
+INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
+
+SET(mklml_cmakefile ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt)
+FILE(WRITE ${mklml_cmakefile} "PROJECT(MKLML)\n"
+                              "cmake_minimum_required(VERSION 3.0)\n"
+                              "install(DIRECTORY ${MKLML_VER}\n"
+                              "        DESTINATION ${MKLML_DST_DIR})\n")
+
+ExternalProject_Add(
+    ${MKLML_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX                ${MKLML_SOURCE_DIR}
+    DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate -O ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz ${MKLML_URL}
+                          && tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND        ""
+    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} 
+    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
+)
+
+ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
+ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
+LIST(APPEND external_project_dependencies mklml)
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@ -124,6 +124,7 @@ set(GPU_COMMON_FLAGS
    -Wno-error=literal-suffix
    -Wno-error=unused-local-typedefs
    -Wno-error=unused-function  # Warnings in Numpy Header.
+    -Wno-error=array-bounds # Warnings in Eigen::array
 )

 if (APPLE)
--- a/doc/design/cluster_train/save_model.md
+++ b/doc/design/cluster_train/save_model.md
@ -75,10 +75,11 @@ snapshot to a model will be a TODO for future.
 ### Trainer Election

 One trainer will be elected as the one to save the model. When using
-etcd, trainer ID is a randomly generated UUID, we will utilize etcd to
-elect one trainer. When not using etcd, unique trainer IDs will be
-given by the administrator, the trainer whose ID is "0" is elected to
-save the model.
+etcd, trainer ID is a randomly generated UUID, the trainer will
+contact the master server requesting to save the model, and find out
+if itself is elected. When the master server is not used, unique
+trainer IDs will be given by the administrator, the trainer whose ID
+is "0" is elected to save the model.

 ### Model Save Path

--- a/doc/design/simple_op_design.md
+++ b/doc/design/simple_op_design.md
@ -49,6 +49,7 @@ message AttrProto {
 message VarProto {
 	required string name = 1;
 	required string comment = 2;
+	required bool is_tensor = 3;
 };

 message OpProto {
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@ -311,3 +311,13 @@ Paddle二进制在运行时捕获了浮点数异常，只要出现浮点数异
 * 训练数据有问题，导致参数收敛到了一些奇异的情况。或者输入数据尺度过大，有些特征的取值达到数百万，这时进行矩阵乘法运算就可能导致浮点数溢出。

 主要的解决办法是减小学习律或者对数据进行归一化处理。
+
+15. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2
+------------------------------------------------------------------------
+先查看一下是否曾经安装过paddle v1版本，有的话需要先卸载：
+
+pip uninstall py_paddle paddle
+
+然后安装paddle的python环境, 在build目录下执行
+
+pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@ -59,7 +59,11 @@ func main() {

 		cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e)
 		if err != nil {
-			log.Errorf("Fetch checkpoint failed, %s", err)
+			if err == pserver.ErrCheckpointNotFound {
+				log.Infof("Could not find the pserver checkpoint.")
+			} else {
+				log.Errorf("Fetch checkpoint failed, %s", err)
+			}
 		}
 	}

--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@ -22,6 +22,9 @@ package main
 #define PADDLE_MASTER_OK    0
 #define PADDLE_MASTER_ERROR -1

+#define PADDLE_SAVE_MODEL_OK   1
+#define PADDLE_SAVE_MODEL_SKIP 0
+
 typedef int paddle_master_client;
 */
 import "C"
@ -33,7 +36,6 @@ import (
 	"unsafe"

 	"github.com/PaddlePaddle/Paddle/go/master"
-	"github.com/coreos/etcd/clientv3"
 	log "github.com/sirupsen/logrus"
 )

@ -65,32 +67,32 @@ func remove(client C.paddle_master_client) *master.Client {
 }

 //export paddle_new_etcd_master_client
+//
+// bufSize is the record buffer size.
 func paddle_new_etcd_master_client(etcdEndpoints *C.char, timeout int, bufSize int) C.paddle_master_client {
 	p := C.GoString(etcdEndpoints)
-	cli, err := clientv3.New(clientv3.Config{
-		Endpoints:   strings.Split(p, ","),
-		DialTimeout: time.Second * time.Duration(timeout),
-	})
+	endpoints := strings.Split(p, ",")
+	c, err := master.NewClient(
+		master.WithEtcd(endpoints, time.Duration(timeout)*time.Second),
+		master.WithBuffer(bufSize),
+	)
 	if err != nil {
 		panic(err)
 	}
-	ch := make(chan string, 1)
-	a, err := master.GetKey(cli, master.DefaultAddrPath, timeout)
-	if err != nil {
-		panic(err)
-	}
-	ch <- a
-	go master.WatchKey(cli, master.DefaultAddrPath, ch)
-	c := master.NewClient(ch, bufSize)
+
 	return add(c)
 }

 //export paddle_new_master_client
+//
+// bufSize is the record buffer size.
 func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client {
 	a := C.GoString(addr)
-	ch := make(chan string, 1)
-	ch <- a
-	c := master.NewClient(ch, bufSize)
+	c, err := master.NewClient(master.WithAddr(a), master.WithBuffer(bufSize))
+	if err != nil {
+		panic(err)
+	}
+
 	return add(c)
 }

@ -117,9 +119,10 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int
 	return C.PADDLE_MASTER_OK
 }

-// return value:
-//     0:ok
-//    -1:error
+// paddle_next_record gets the nexts training record.
+//
+// returns number of bytes of the records if success, -1 if failed.
+//
 //export paddle_next_record
 func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 	c := get(client)
@ -143,6 +146,29 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 	return C.int(size)
 }

+// paddle_request_save_model requests the master server to approve the
+// caller to save the model.
+//
+// returns 1 if the save the model request is approved, 0 if the
+// request is rejected because other trainer is saving the model, -1
+// if error happened.
+//
+//export paddle_request_save_model
+func paddle_request_save_model(client C.paddle_master_client, trainerID string, blockMS int) C.int {
+	c := get(client)
+	need, err := c.RequestSaveModel(trainerID, time.Duration(blockMS)*time.Millisecond)
+	if err != nil {
+		log.Errorln(err)
+		return C.PADDLE_MASTER_ERROR
+	}
+
+	if need {
+		return C.PADDLE_SAVE_MODEL_OK
+	}
+
+	return C.PADDLE_SAVE_MODEL_SKIP
+}
+
 //export mem_free
 func mem_free(p unsafe.Pointer) {
 	// "free" may be a better name for this function, but doing so
--- a/go/master/client.go
+++ b/go/master/client.go
@ -16,17 +16,20 @@ package master

 import (
 	"os"
+	"sync"
 	"time"

 	"github.com/PaddlePaddle/Paddle/go/connection"
 	"github.com/PaddlePaddle/recordio"
+	"github.com/coreos/etcd/clientv3"
 	log "github.com/sirupsen/logrus"
 )

 // Client is the client of the master server.
 type Client struct {
-	conn *connection.Conn
-	ch   chan record
+	conn       *connection.Conn
+	ch         chan record
+	initChOnce sync.Once
 }

 type record struct {
@ -34,24 +37,83 @@ type record struct {
 	err error
 }

-// NewClient creates a new Client.
+// WithBuffer sets the client to buffer the training record.
 //
 // bufSize is the record buffer size. NextRecord will read from this
 // buffer.
-func NewClient(addrCh <-chan string, bufSize int) *Client {
+func WithBuffer(bufSize int) func(*Client) error {
+	return func(c *Client) error {
+		if bufSize <= 0 {
+			return nil
+		}
+
+		c.initChOnce.Do(func() {
+			c.ch = make(chan record, bufSize)
+			go c.getRecords()
+		})
+		return nil
+	}
+}
+
+// WithAddr sets the client to use fixed master address.
+func WithAddr(addr string) func(c *Client) error {
+	return func(c *Client) error {
+		ch := make(chan string, 1)
+		ch <- addr
+		go c.monitorMaster(ch)
+		return nil
+	}
+}
+
+// WithEtcd sets the client to use etcd for master discovery.
+func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
+	return func(c *Client) error {
+		cli, err := clientv3.New(clientv3.Config{
+			Endpoints:   endpoints,
+			DialTimeout: timeout,
+		})
+		if err != nil {
+			return err
+		}
+
+		ch := make(chan string, 1)
+		a, err := GetKey(cli, DefaultAddrPath, timeout)
+		if err != nil {
+			return err
+		}
+
+		if a != "" {
+			// Master is registered, send to the master address
+			// channel.
+			ch <- a
+		}
+
+		go watchKey(cli, DefaultAddrPath, ch)
+		go c.monitorMaster(ch)
+		return nil
+	}
+}
+
+// NewClient creates a new Client.
+func NewClient(opts ...func(*Client) error) (*Client, error) {
 	c := &Client{}
 	c.conn = connection.New()
-	c.ch = make(chan record, bufSize)
-	go c.monitorMaster(addrCh)
-	go c.getRecords()
-	return c
+
+	for _, opt := range opts {
+		err := opt(c)
+		if err != nil {
+			return nil, err
+		}
+
+	}
+
+	return c, nil
 }

 func (c *Client) getRecords() {
 	for {
 		t, err := c.getTask()
 		if err != nil {
-			// getTask call.
 			log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err)
 			time.Sleep(3 * time.Second)
 			continue
@ -146,6 +208,20 @@ func (c *Client) taskFailed(meta TaskMeta) error {
 // NextRecord will block until the next record is available. It is
 // thread-safe.
 func (c *Client) NextRecord() ([]byte, error) {
+	c.initChOnce.Do(func() {
+		// initialize with in case WithBuffer is not used.
+		c.ch = make(chan record, 0)
+		go c.getRecords()
+	})
+
 	r := <-c.ch
 	return r.r, r.err
 }
+
+// RequestSaveModel requests the master server to approve the caller
+// to save the model.
+func (c *Client) RequestSaveModel(trainerID string, blockDur time.Duration) (bool, error) {
+	var need bool
+	err := c.conn.Call("Service.RequestSaveModel", SaveModelRequest{TrainerID: trainerID, BlockDur: blockDur}, &need)
+	return need, err
+}
--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@ -87,9 +87,11 @@ func TestNextRecord(t *testing.T) {
 		panic(err)
 	}

-	curAddr := make(chan string, 1)
-	curAddr <- fmt.Sprintf(":%d", p)
-	c := master.NewClient(curAddr, 10)
+	c, err := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(10))
+	if err != nil {
+		panic(err)
+	}
+
 	err = c.SetDataset([]string{path})
 	if err != nil {
 		panic(err)
--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
@ -158,8 +158,8 @@ func (e *EtcdClient) Load() ([]byte, error) {
 }

 // GetKey gets the value by the specify key.
-func GetKey(c *clientv3.Client, key string, timeout int) (string, error) {
-	ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout))
+func GetKey(c *clientv3.Client, key string, timeout time.Duration) (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	resp, err := c.Get(ctx, key)
 	cancel()
 	if err != nil {
@ -173,8 +173,8 @@ func GetKey(c *clientv3.Client, key string, timeout int) (string, error) {
 	return string(v), nil
 }

-// WatchKey watches the specify key and send to valChan if there is some event.
-func WatchKey(c *clientv3.Client, key string, valChan chan<- string) {
+// watchKey watches the specify key and send to valChan if there is some event.
+func watchKey(c *clientv3.Client, key string, valChan chan<- string) {
 	rch := c.Watch(context.Background(), key)
 	for wresp := range rch {
 		for _, ev := range wresp.Events {
--- a/go/master/service.go
+++ b/go/master/service.go
@ -78,9 +78,10 @@ type Service struct {
 	ready         chan struct{}
 	store         Store

-	mu         sync.Mutex
-	initDone   bool
-	taskQueues taskQueues
+	mu            sync.Mutex
+	initDone      bool
+	taskQueues    taskQueues
+	savingTrainer string
 }

 func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
@ -246,7 +247,7 @@ func readChunks(globPaths []string) ([]Chunk, error) {
 //
 // SetDataset can be call multiple times. But only the first call will
 // be honored.
-func (s *Service) SetDataset(globPaths []string, dummy *int) error {
+func (s *Service) SetDataset(globPaths []string, _ *int) error {
 	if len(globPaths) == 0 {
 		return errors.New("no dataset specified")
 	}
@ -330,7 +331,7 @@ func (s *Service) logFields() log.Fields {
 }

 // GetTask gets a new task from the service.
-func (s *Service) GetTask(dummy int, task *Task) error {
+func (s *Service) GetTask(_ int, task *Task) error {
 	select {
 	case <-s.ready:
 	}
@ -380,7 +381,7 @@ func (s *Service) GetTask(dummy int, task *Task) error {
 }

 // TaskFinished tell the service that a task is finished.
-func (s *Service) TaskFinished(taskID int, dummy *int) error {
+func (s *Service) TaskFinished(taskID int, _ *int) error {
 	select {
 	case <-s.ready:
 	}
@ -415,7 +416,7 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 }

 // TaskFailed tells the service that a task is failed.
-func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
+func (s *Service) TaskFailed(meta TaskMeta, _ *int) error {
 	select {
 	case <-s.ready:
 	}
@ -432,3 +433,42 @@ func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
 	s.processFailedTask(t, meta.Epoch)
 	return nil
 }
+
+// SaveModelRequest is the request for saving model
+type SaveModelRequest struct {
+	TrainerID string
+	BlockDur  time.Duration
+}
+
+// RequestSaveModel requests the master server to approve the caller
+// to save the model.
+func (s *Service) RequestSaveModel(req SaveModelRequest, need *bool) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if req.TrainerID == "" {
+		return errors.New("trainer id is empty")
+	}
+
+	if s.savingTrainer == "" {
+		*need = true
+	} else {
+		if req.TrainerID == s.savingTrainer {
+			// save trainer asked to save model again
+			*need = true
+		} else {
+			*need = false
+		}
+	}
+
+	if *need {
+		s.savingTrainer = req.TrainerID
+		time.AfterFunc(req.BlockDur, func() {
+			s.mu.Lock()
+			s.savingTrainer = ""
+			s.mu.Unlock()
+		})
+	}
+
+	return nil
+}
--- a/go/pserver/client/c/cclient.go
+++ b/go/pserver/client/c/cclient.go
@ -127,13 +127,19 @@ func paddle_pserver_client_release(client C.paddle_pserver_client) {
 	remove(client)
 }

+// paddle_begin_init_params tells trainer if it needs to init the
+// parameters.
+//
+// returns 1 if the trainer needs to init the parameters. 0 if the
+// trainer does not need to init the parameters.
+//
 //export paddle_begin_init_params
 func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 	c := get(client)
 	if selected := c.BeginInitParams(); selected {
 		return 1
 	}
-	return C.PSERVER_OK
+	return 0
 }

 //export paddle_init_param
@ -256,17 +262,4 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 	return C.PSERVER_OK
 }

-//export paddle_save_model
-func paddle_save_model(client C.paddle_pserver_client, path *C.char) C.int {
-	p := C.GoString(path)
-	c := get(client)
-	err := c.Save(p)
-	if err != nil {
-		log.Errorln(err)
-		return C.PSERVER_ERROR
-	}
-
-	return C.PSERVER_OK
-}
-
 func main() {} // Required but ignored
--- a/go/pserver/client/c/test/test_cclient.c
+++ b/go/pserver/client/c/test/test_cclient.c
@ -111,9 +111,5 @@ retry:
    getParams(c);
  }

-  if (paddle_save_model(c, "/tmp/")) {
-    fail();
-  }
-
  return 0;
 }
--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
@ -219,32 +219,6 @@ func (c *Client) GetParams(names []string) ([]pserver.Parameter, error) {
 	return ps, nil
 }

-// Save indicates parameters to save the parameter to the given path.
-func (c *Client) Save(path string) error {
-	errCh := make(chan error, len(c.pservers))
-
-	for _, p := range c.pservers {
-		err := p.Call("Service.Save", path, nil)
-		errCh <- err
-	}
-
-	recv := 0
-	for err := range errCh {
-		if err != nil {
-			return err
-		}
-
-		recv++
-		if recv == len(c.pservers) {
-			break
-		}
-	}
-
-	// TODO(helin): there will be many files under path, need to
-	// merge them into a single file.
-	return nil
-}
-
 func strHash(s string) uint32 {
 	h := fnv.New32a()
 	_, _ = h.Write([]byte(s))
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@ -36,6 +36,10 @@ import (
 // ElementType is the type of elements of a Parameter.
 type ElementType int

+// ErrCheckpointNotFound indicates that the pserver checkpoint could
+// not be found.
+var ErrCheckpointNotFound = errors.New("checkpoint not found")
+
 // RPC error message.
 const (
 	AlreadyInitialized  = "pserver already initialized"
@ -103,6 +107,10 @@ func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (Checkpoint, e
 		return nil, err
 	}

+	if len(v) == 0 {
+		return nil, ErrCheckpointNotFound
+	}
+
 	var cpMeta checkpointMeta
 	if err = json.Unmarshal(v, &cpMeta); err != nil {
 		return nil, err
@ -156,7 +164,7 @@ func NewService(idx int, interval time.Duration, path string, client *EtcdClient
 }

 // InitParam initializes a parameter.
-func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error {
+func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error {
 	select {
 	case <-s.initialized:
 		return errors.New(AlreadyInitialized)
@ -177,7 +185,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er

 // FinishInitParams tells the parameter server that the parameter
 // initialization has finished.
-func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
+func (s *Service) FinishInitParams(_ int, _ *int) error {
 	select {
 	case <-s.initialized:
 		return errors.New(AlreadyInitialized)
@ -190,7 +198,7 @@ func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {

 // SendGrad sends gradient to parameter servers for parameter
 // optimization.
-func (s *Service) SendGrad(g Gradient, dummy *int) error {
+func (s *Service) SendGrad(g Gradient, _ *int) error {
 	select {
 	case <-s.initialized:
 	default:
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@ -330,7 +330,7 @@ __global__ void KeSequenceAvgForward(real* dst,
    }
    sum = mode == 1 ? sum :
        (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
-    dst[gid] = sum;
+    dst[gid] += sum;
  }
 }

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -19,8 +19,10 @@ cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
 cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)

-cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc)
-cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator)
+cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator)
+cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder)
+cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)

 py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
@ -28,5 +30,6 @@ add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch
 add_dependencies(framework_py_proto framework_py_proto_init)

 proto_library(net_proto SRCS net_proto.proto DEPS op_proto)
+# cc_library(net SRCS net.cc DEPS operator net_proto op_registry fc_op)
 cc_library(net SRCS net.cc DEPS operator net_proto op_registry)
-cc_test(net_op_test SRCS net_op_test.cc DEPS net)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op softmax_op fc_op)
--- a/paddle/framework/eigen.h
+++ b/paddle/framework/eigen.h
@ -61,25 +61,24 @@ struct EigenTensor {
  }
 };

+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {};
+
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
-  // Flatten is to reshape a Tensor into a one dimension EigenVector
-  using Parent = EigenTensor<T, 1, MajorType, IndexType>;
-  static typename Parent::Type Flatten(Tensor& tensor) {
-    return Parent::From(tensor,
-                        make_ddim({static_cast<int>(product(tensor.dims_))}));
+  // Flatten reshapes a Tensor into an EigenVector.
+  static typename EigenVector::Type Flatten(Tensor& tensor) {
+    return EigenVector::From(
+        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
  }

-  static typename Parent::ConstType Flatten(const Tensor& tensor) {
-    return Parent::From(tensor,
-                        make_ddim({static_cast<int>(product(tensor.dims_))}));
+  static typename EigenVector::ConstType Flatten(const Tensor& tensor) {
+    return EigenVector::From(
+        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
  }
 };

-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = EigenTensor<T, 2, MajorType, IndexType>;
-
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/grad_op_builder.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+OperatorBase* GradOpBuilder::Build() {
+  BuildOpInOutArgList();
+  std::string grad_op_type = OpRegistry::grad_ops().at(op_->type_);
+  OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)();
+  grad_op->type_ = grad_op_type;
+  CompleteGradOp(grad_op);
+  return grad_op;
+}
+
+OpInOutArg* GradOpBuilder::BuildArg(const VarProto& var,
+                                    const VarIndexMap& var_map,
+                                    const std::vector<int>& format,
+                                    InOutType type) {
+  int idx = var_map.at(var.name());
+  int begin_idx = format.empty() ? idx : format.at(idx);
+  int end_idx = format.empty() ? idx + 1 : format.at(idx + 1);
+  return new OpInOutArg(var.name(), type, !var.ignore_gradient(), begin_idx,
+                        end_idx);
+}
+
+void GradOpBuilder::BuildOpInOutArgList() {
+  const OpProto& op_proto = OpRegistry::protos().at(op_->type_);
+  const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_->type_));
+  const std::vector<int>& in_format =
+      op_->attrs_.count("input_format")
+          ? op_->GetAttr<std::vector<int>>("input_format")
+          : std::vector<int>();
+  const std::vector<int>& out_format =
+      op_->attrs_.count("output_format")
+          ? op_->GetAttr<std::vector<int>>("output_format")
+          : std::vector<int>();
+  for (const auto& var : op_proto.inputs()) {
+    arg_list_.emplace_back(
+        std::shared_ptr<OpInOutArg>(BuildArg(var, var_map, in_format, IN)));
+  }
+  for (const auto& var : op_proto.outputs()) {
+    arg_list_.emplace_back(
+        std::shared_ptr<OpInOutArg>(BuildArg(var, var_map, out_format, OUT)));
+  }
+}
+
+void GradOpBuilder::AddArgIntoGradOp(const OpInOutArg* arg,
+                                     std::vector<std::string>& in_out,
+                                     std::vector<int>& format,
+                                     VarIndexMap* varmap, int& idx,
+                                     bool is_grad) const {
+  std::string var_name = arg->proto_name_;
+  if (is_grad) {
+    var_name += OperatorBase::GRAD_VAR_SUFFIX();
+  }
+  (*varmap)[var_name] = idx++;
+  size_t pre_sz = in_out.size();
+  auto base_it =
+      arg->type_ == IN ? op_->inputs_.begin() : op_->outputs_.begin();
+  std::copy(base_it + arg->begin_idx_, base_it + arg->end_idx_,
+            std::back_inserter(in_out));
+  if (is_grad) {
+    for (size_t i = pre_sz; i < in_out.size(); ++i) {
+      in_out[i] += OperatorBase::GRAD_VAR_SUFFIX();
+    }
+  }
+  format.push_back(in_out.size());
+}
+
+void GradOpBuilder::CompleteGradOp(OperatorBase* grad_op) const {
+  grad_op->attrs_ = op_->attrs_;
+  grad_op->attrs_.erase("input_format");
+  grad_op->attrs_.erase("output_format");
+  VarIndexMap* grad_varmap = new VarIndexMap();
+  int in_idx = 0;
+  int out_idx = 0;
+  std::vector<int> in_format({0});
+  std::vector<int> out_format({0});
+  for (const auto& arg : arg_list_) {
+    // op_'s inputs_ and outputs_
+    if (arg->needed_in_grad_) {
+      AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap,
+                       in_idx, false);
+    }
+    if (arg->type_ == IN) {
+      // gradients of op_'s inputs_
+      AddArgIntoGradOp(arg.get(), grad_op->outputs_, out_format, grad_varmap,
+                       out_idx, true);
+    } else {
+      // gradients of op_'s outputs_
+      AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap,
+                       in_idx, true);
+    }
+  }
+  grad_op->attrs_["input_format"] = in_format;
+  grad_op->attrs_["output_format"] = out_format;
+  grad_op->in_out_idxs_.reset(grad_varmap);
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/grad_op_builder.h
+++ b/paddle/framework/grad_op_builder.h
@ -0,0 +1,48 @@
+#pragma once
+
+#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+class OpRegistry;
+
+enum InOutType { IN, OUT };
+
+struct OpInOutArg {
+  OpInOutArg(const std::string& proto_name, const InOutType& type,
+             bool needed_in_grad, size_t begin_idx, size_t end_idx)
+      : proto_name_(proto_name),
+        type_(type),
+        needed_in_grad_(needed_in_grad),
+        begin_idx_(begin_idx),
+        end_idx_(end_idx) {}
+
+  std::string proto_name_;
+  InOutType type_;
+  bool needed_in_grad_;
+  size_t begin_idx_;
+  size_t end_idx_;
+};
+
+class GradOpBuilder {
+  using VarIndexMap = std::unordered_map<std::string, int>;
+
+ public:
+  GradOpBuilder(const OperatorBase* op) : op_(op) {}
+  OperatorBase* Build();
+
+ private:
+  OpInOutArg* BuildArg(const VarProto& var, const VarIndexMap& var_map,
+                       const std::vector<int>& format, InOutType type);
+  void BuildOpInOutArgList();
+  void AddArgIntoGradOp(const OpInOutArg* arg, std::vector<std::string>& in_out,
+                        std::vector<int>& format, VarIndexMap* varmap, int& idx,
+                        bool is_grad) const;
+  void CompleteGradOp(OperatorBase* grad_op) const;
+  const OperatorBase* op_;
+  std::vector<std::shared_ptr<OpInOutArg>> arg_list_;
+};
+
+}  // namespace framework
+}  // namespace paddle
--- a/Show More
+++ b/Show More