Merge branch 'develop' into doc

fixstartbug
Luo Tao
commit 5b81f5da74

@ -38,17 +38,16 @@ RUN apt-get update && \
RUN pip --no-cache-dir install 'numpy>=1.12.0'
# Install Go and glide
RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go.tgz && \
RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
tar -xz -C /usr/local && \
mkdir /root/gopath && \
mkdir /root/gopath/bin && \
mkdir /root/gopath/src && \
rm go.tgz
mkdir /root/gopath/src
ENV GOROOT=/usr/local/go GOPATH=/root/gopath
# must not be on the same line as the GOROOT definition; otherwise docker build cannot find GOROOT.
ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
# install glide
RUN curl -q https://glide.sh/get | sh
RUN curl -s -q https://glide.sh/get | sh
# git credential to skip password typing
RUN git config --global credential.helper store

@ -8,7 +8,7 @@ ExternalProject_Add(
extern_lib_any
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git"
GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020"
GIT_TAG "15595d8324be9e8a9a80d9ae442fdd12bd66df5d"
PREFIX ${ANY_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""

@ -17,7 +17,7 @@ IF(NOT ${WITH_MKLML})
ENDIF(NOT ${WITH_MKLML})
IF(WIN32 OR APPLE)
MESSAGE(WARNING
"Windows or Mac is not supported with MKLML in Paddle yet."
"Force WITH_MKLML=OFF")
SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE)
@ -43,22 +43,21 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
SET(mklml_cmakefile ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt)
FILE(WRITE ${mklml_cmakefile} "PROJECT(MKLML)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${MKLML_VER}\n"
" DESTINATION ${MKLML_DST_DIR})\n")
FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(MKLML)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${MKLML_VER}\n"
" DESTINATION ${MKLML_DST_DIR})\n")
ExternalProject_Add(
${MKLML_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${MKLML_SOURCE_DIR}
DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate -O ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz ${MKLML_URL}
&& tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz
DOWNLOAD_COMMAND wget --no-check-certificate -qO- ${MKLML_URL} | tar xz -C ${MKLML_DOWNLOAD_DIR}
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
)

@ -11,6 +11,15 @@ Each PaddlePaddle release follows the process below:
* Build the Ubuntu deb package for this version. If the build fails, fix the deb packaging problem, bump the patch number, and return to step 2.
* Use the Regression Test List as a checklist to verify that the Docker image and the Ubuntu package work correctly.
* If any test fails, record all failing cases, fix every bug on the `release/<version>` branch, bump the patch number, and return to step 2.
* Build the Python wheel package for this version and publish it to PyPI.
* Because pypi.python.org now enforces the [strict naming convention of PEP 513](https://www.python.org/dev/peps/pep-0513), the platform-related suffix in the wheel filename must be renamed before uploading with twine, e.g. from `linux_x86_64` to `manylinux1_x86_64`.
* The package names on PyPI are paddlepaddle and paddlepaddle_gpu. To upload the GPU build, change name to "paddlepaddle_gpu" in build/python/setup.py and rebuild the wheel with `python setup.py bdist_wheel`.
* To upload:
```
cd build/python
pip install twine
twine upload dist/[package to upload]
```
4. After step 3 is complete, merge the `release/<version>` branch into master, tag the merge commit on master with the version number, then merge `master` back into `develop`, and finally delete the `release/<version>` branch.
5. Build the Docker release image from the master branch and publish it to DockerHub. Build the Ubuntu deb package and publish it on the GitHub Releases page.
6. Write the release notes collaboratively.

@ -3,6 +3,43 @@ Running PaddlePaddle in Docker containers
Docker containers are currently the only officially supported way to run PaddlePaddle, because Docker runs on all major operating systems, including Linux, Mac OS X, and Windows. Note that you need to change the `Docker settings <https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use of the hardware resources on Mac OS X and Windows.
Getting started with Docker
------------------------------
A few basic concepts help in understanding and using Docker:
- *Image*: a Docker image is a packaged piece of software. It contains the software itself and the runtime environment it depends on. The PaddlePaddle Docker image contains PaddlePaddle's Python library and the Python packages it depends on, so we can run the programs we need directly inside Docker without installing them first. Run:
.. code-block:: bash
docker images
to list all images on the current system. Similarly, run:
.. code-block:: bash
docker pull paddlepaddle/paddle:0.10.0
to download a Docker image. paddlepaddle/paddle is pulled from the official registry, DockerHub.com; users in China are advised to pull from docker.paddlepaddle.org/paddle instead.
- *Container*: if a Docker image is a program, a container is the "process" spawned when that program runs.
A container is in fact an operating-system process, but one that runs in its own process space, file system, and network.
Run:
.. code-block:: bash
docker run paddlepaddle/paddle:0.10.0
to start a container from an image.
- By default a Docker container runs on its own file system, and files on the host cannot be
accessed from inside the container. By *mounting a volume*, you can map a host file or
directory into the container. The following command mounts the current directory to /data
inside a container that uses the debian image and runs :code:`ls /data` on startup:
.. code-block:: bash
docker run --rm -v $(pwd):/data debian ls /data
Using the Docker images released by PaddlePaddle
------------------------------------------------
@ -12,11 +49,11 @@ all the build tools PaddlePaddle needs. The built PaddlePaddle is also packaged
into what is called the production image, which contains everything PaddlePaddle needs at runtime. Every time
PaddlePaddle releases a new version, the corresponding production and development images are published. The
runtime images include a CPU-only version and a GPU version, plus their no-AVX counterparts. We provide
the latest Docker images on `dockerhub.com
<https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_; the newest Paddle image
versions can be found under the "tags" tab. To make it easier for developers in China to
download the Docker images, we provide a mirror server for domestic use. If you are in
China, please replace paddlepaddle/paddle in the commands in this document with
docker.paddlepaddle.org/paddle.
the latest Docker images on `dockerhub.com
<https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ and on the mirror registry
`docker.paddlepaddle.org`; the newest Paddle image versions can be found under the "tags" tab.
**Note: to make it easier for developers in China to download the Docker images, we provide a mirror server for domestic use. If you are in China, please replace paddlepaddle/paddle in the commands in this document with docker.paddlepaddle.org/paddle.**
1. Development image: :code:`paddlepaddle/paddle:0.10.0-dev`
@ -68,6 +105,8 @@ docker.paddlepaddle.org/paddle.
If the output is No, you need to use the no-AVX images.
**Note: versions after 0.10.0 detect AVX support automatically, so this manual check is not needed.**
The method above also works with the GPU images; just do not forget to install the latest GPU driver on the physical machine first.
To make sure the GPU driver works properly inside the image, we recommend running the image with [nvidia-docker](https://github.com/NVIDIA/nvidia-docker).

@ -63,12 +63,35 @@ CPU-only version and a CUDA GPU version and their no-AVX versions.
We put the docker images on `dockerhub.com
<https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_. You can find the
latest versions under the "tags" tab at dockerhub.com. If you are in
China, you can use our Docker image registry mirror to speed up the
download process. To use it, please replace all occurrences of paddlepaddle/paddle
in the commands with docker.paddlepaddle.org/paddle.
latest versions under the "tags" tab at dockerhub.com.
1. Production images; these images might have multiple variants:
**NOTE: If you are in China, you can use our Docker image registry mirror to speed up the download process. To use it, please replace all occurrences of paddlepaddle/paddle in the commands with docker.paddlepaddle.org/paddle.**
1. development image :code:`paddlepaddle/paddle:<version>-dev`
This image packs the relevant development tools and runtime
environment. Users and developers can use this image instead of
their own local computer for development, building, releasing,
writing documentation, and so on. Since different versions of Paddle
may depend on different versions of libraries and tools, you must
pay attention to these versions if you want to set up a local
environment. The development image contains:
- gcc/clang
- nvcc
- Python
- sphinx
- woboq
- sshd
Many developers use servers with GPUs; they can log in to the
server over SSH and run :code:`docker exec` to enter the Docker
container and start their work. Alternatively, they can start a
development image with the SSHD service enabled and log in to
the container directly.
2. Production images; these images might have multiple variants:
- GPU/AVX:code:`paddlepaddle/paddle:<version>-gpu`
- GPU/no-AVX:code:`paddlepaddle/paddle:<version>-gpu-noavx`
@ -84,7 +107,7 @@ the commands to docker.paddlepaddle.org/paddle.
if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
**NOTE: versions after 0.10.0 automatically detect system AVX support, so manual detection is not needed in this case.**
To run the CPU-only image as an interactive container:
.. code-block:: bash
@ -103,29 +126,6 @@ the commands to docker.paddlepaddle.org/paddle.
nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash
2. development image :code:`paddlepaddle/paddle:<version>-dev`
This image packs the relevant development tools and runtime
environment. Users and developers can use this image instead of
their own local computer for development, building, releasing,
writing documentation, and so on. Since different versions of Paddle
may depend on different versions of libraries and tools, you must
pay attention to these versions if you want to set up a local
environment. The development image contains:
- gcc/clang
- nvcc
- Python
- sphinx
- woboq
- sshd
Many developers use servers with GPUs; they can log in to the
server over SSH and run :code:`docker exec` to enter the Docker
container and start their work. Alternatively, they can start a
development image with the SSHD service enabled and log in to
the container directly.
Train Model Using Python API
----------------------------

@ -32,7 +32,7 @@ import (
func main() {
port := flag.Int("port", 0, "port of the pserver")
index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0")
index := flag.Int("index", -1, "index of the pserver, set to -1 if use etcd for auto pserver index registry")
etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
"comma separated endpoint string for pserver to connect to etcd")
dialTimeout := flag.Duration("dial-timeout", 5*time.Second, "dial timeout")
@ -60,12 +60,12 @@ func main() {
idx, err = e.Register(*port)
candy.Must(err)
cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e)
cp, err = pserver.LoadCheckpoint(e, idx)
if err != nil {
if err == pserver.ErrCheckpointNotFound {
log.Infof("Could not find the pserver checkpoint.")
} else {
log.Errorf("Fetch checkpoint failed, %s", err)
panic(err)
}
}
}

go/glide.lock (generated)

@ -1,5 +1,5 @@
hash: 2a1c0eca5c07a130e3d224f9821f96cfa37a39bf6bce141c855bbc57ef569f1c
updated: 2017-07-29T07:34:48.722757905+08:00
hash: 1b9b07408ca7fac27a374dc2ccd2433e4bff090484008a037df967284949a582
updated: 2017-08-03T21:46:51.744995189Z
imports:
- name: github.com/beorn7/perks
version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
@ -145,6 +145,8 @@ imports:
version: a1dba9ce8baed984a2495b658c82687f8157b98f
subpackages:
- xfs
- name: github.com/satori/go.uuid
version: 879c5887cd475cd7864858769793b2ceb0d44feb
- name: github.com/sirupsen/logrus
version: a3f95b5c423586578a4e099b11a46c2479628cac
- name: github.com/topicai/candy

@ -14,11 +14,13 @@ import:
version: ^1.0.0
- package: github.com/topicai/candy
- package: golang.org/x/crypto
vcs: git
repo: https://github.com/golang/crypto.git
- package: golang.org/x/sys
vcs: git
- package: golang.org/x/sys
repo: https://github.com/golang/sys.git
- package: golang.org/x/text
vcs: git
- package: golang.org/x/text
repo: https://github.com/golang/text.git
vcs: git
- package: github.com/satori/go.uuid
version: v1.1.0

@ -77,11 +77,12 @@ type taskEntry struct {
NumFailure int
}
type taskQueues struct {
type masterState struct {
Todo []taskEntry
Pending map[int]taskEntry // map from task ID to task entry
Done []taskEntry
Failed []taskEntry
CurPass int
}
// Service is the master server service.
@ -94,11 +95,11 @@ type Service struct {
ready chan struct{}
initDone bool
mu sync.Mutex
taskQueues taskQueues
currPass int
jobTasks []taskEntry
mu sync.Mutex
// State to be persisted to snapshot.
state masterState
// The trainer that is currently saving model. This state is
// transient, does not need to be persisted to snapshot.
savingTrainer string
}
@ -141,8 +142,8 @@ func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failur
s.chunksPerTask = chunksPerTask
s.timeoutDur = timeoutDur
s.failureMax = failureMax
s.taskQueues = taskQueues{}
s.taskQueues.Pending = make(map[int]taskEntry)
s.state = masterState{}
s.state.Pending = make(map[int]taskEntry)
s.ready = make(chan struct{})
s.store = store
recovered, err := s.recover()
@ -180,7 +181,7 @@ func (s *Service) recover() (bool, error) {
}
dec := gob.NewDecoder(gr)
var tqs taskQueues
var tqs masterState
err = dec.Decode(&tqs)
if err != nil {
return false, err
@ -193,7 +194,12 @@ func (s *Service) recover() (bool, error) {
log.Errorln(err)
}
s.taskQueues = tqs
s.state = tqs
log.WithFields(s.logFields()).Infof("Master recovered from snapshot, scheduling pending task timeout check.")
for _, t := range s.state.Pending {
time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
}
return true, nil
}
@ -208,7 +214,7 @@ func (s *Service) snapshot() error {
var buf bytes.Buffer
gw := gzip.NewWriter(&buf)
enc := gob.NewEncoder(gw)
err := enc.Encode(s.taskQueues)
err := enc.Encode(s.state)
if err != nil {
return err
}
@ -290,8 +296,7 @@ func (s *Service) SetDataset(globPaths []string, _ *int) error {
return err
}
s.jobTasks = partition(chunks, s.chunksPerTask)
s.taskQueues.Todo = s.jobTasks
s.state.Todo = partition(chunks, s.chunksPerTask)
err = s.snapshot()
if err != nil {
@ -319,17 +324,17 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) {
}
}()
delete(s.taskQueues.Pending, t.Task.Meta.ID)
delete(s.state.Pending, t.Task.Meta.ID)
t.NumFailure++
if t.NumFailure > s.failureMax {
log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
s.taskQueues.Failed = append(s.taskQueues.Failed, t)
s.state.Failed = append(s.state.Failed, t)
return
}
log.Warningf("Task %v failed %d times, re-dispatch.", t.Task, t.NumFailure)
s.taskQueues.Todo = append(s.taskQueues.Todo, t)
s.state.Todo = append(s.state.Todo, t)
return
}
@ -338,7 +343,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
s.mu.Lock()
defer s.mu.Unlock()
t, ok := s.taskQueues.Pending[taskID]
t, ok := s.state.Pending[taskID]
if !ok {
return
}
@ -350,10 +355,11 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
// must be called with lock held.
func (s *Service) logFields() log.Fields {
return log.Fields{
"todoLen": len(s.taskQueues.Todo),
"pendingLen": len(s.taskQueues.Pending),
"doneLen": len(s.taskQueues.Done),
"failedLen": len(s.taskQueues.Failed),
"todoLen": len(s.state.Todo),
"pendingLen": len(s.state.Pending),
"doneLen": len(s.state.Done),
"failedLen": len(s.state.Failed),
"curPass": s.state.CurPass,
}
}
@ -366,17 +372,17 @@ func (s *Service) GetTask(passID int, task *Task) error {
s.mu.Lock()
defer s.mu.Unlock()
if passID < s.currPass {
if passID < s.state.CurPass {
return ErrPassBefore
}
if passID > s.currPass {
if passID > s.state.CurPass {
// A client may get ahead of the master's pass when it runs
// faster than the other clients.
return ErrPassAfter
}
if len(s.taskQueues.Todo) == 0 {
if len(s.taskQueues.Done) == 0 && len(s.taskQueues.Pending) == 0 {
if len(s.state.Todo) == 0 {
if len(s.state.Done) == 0 && len(s.state.Pending) == 0 {
log.WithFields(s.logFields()).Warningln("All tasks failed, may start next pass")
return ErrAllTaskFailed
}
@ -384,10 +390,10 @@ func (s *Service) GetTask(passID int, task *Task) error {
return ErrNoMoreAvailable
}
t := s.taskQueues.Todo[0]
t := s.state.Todo[0]
t.Task.Meta.Epoch++
s.taskQueues.Todo = s.taskQueues.Todo[1:]
s.taskQueues.Pending[t.Task.Meta.ID] = t
s.state.Todo = s.state.Todo[1:]
s.state.Pending[t.Task.Meta.ID] = t
err := s.snapshot()
if err != nil {
return err
@ -409,7 +415,7 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
s.mu.Lock()
defer s.mu.Unlock()
t, ok := s.taskQueues.Pending[taskID]
t, ok := s.state.Pending[taskID]
if !ok {
log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID)
return nil
@ -417,18 +423,18 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
// task finished, reset timeout
t.NumFailure = 0
s.taskQueues.Done = append(s.taskQueues.Done, t)
delete(s.taskQueues.Pending, taskID)
s.state.Done = append(s.state.Done, t)
delete(s.state.Pending, taskID)
log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID)
if len(s.taskQueues.Todo) == 0 && len(s.taskQueues.Pending) == 0 {
if len(s.state.Todo) == 0 && len(s.state.Pending) == 0 {
// increase master side pass count if all tasks finished
s.currPass++
s.taskQueues.Todo = s.jobTasks
s.taskQueues.Done = []taskEntry{}
s.state.CurPass++
s.state.Todo = append(s.state.Done, s.state.Failed...)
s.state.Done = []taskEntry{}
// TODO(typhoonzero): deal with failed tasks
s.taskQueues.Failed = []taskEntry{}
log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.currPass)
s.state.Failed = []taskEntry{}
log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.state.CurPass)
}
err := s.snapshot()
@ -447,7 +453,7 @@ func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
s.mu.Lock()
defer s.mu.Unlock()
t, ok := s.taskQueues.Pending[meta.ID]
t, ok := s.state.Pending[meta.ID]
if !ok {
log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", t.Task.Meta)
return nil

@ -59,7 +59,7 @@ func initClient() [numPserver]int {
go func(l net.Listener) {
var cp pserver.Checkpoint
s, err := pserver.NewService(0, 1, "", nil, cp)
s, err := pserver.NewService(0, time.Hour, "", nil, cp)
if err != nil {
panic(err)
}

@ -103,7 +103,7 @@ func (p *EtcdClient) List() []Server {
time.Sleep(p.timeout)
continue
}
log.Infof("got value (%s) for key: %s", psAddr, psKey)
log.Debugf("got value (%s) for key: %s", psAddr, psKey)
servers[i].Index = i
servers[i].Addr = psAddr
}

@ -206,6 +206,7 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
if err != nil {
return []byte{}, err
}
kvs := resp.Kvs
if len(kvs) == 0 {
return []byte{}, nil
@ -215,9 +216,14 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
}
// PutKey put into etcd with value by key specified
func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error {
func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration, withLease bool) error {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
_, err := e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease()))
var err error
if withLease {
_, err = e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease()))
} else {
_, err = e.client.Put(ctx, key, string(value))
}
cancel()
return err
}
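The new `withLease` parameter separates two kinds of keys: presence keys tied to the pserver's etcd session, which should vanish when the session lapses, and checkpoint metadata, written later in this diff with `withLease` set to false so it survives a pserver restart. A hedged sketch of the same pattern against the etcd v3 client of that era (the import paths and the helper name are assumptions):

```go
package etcdutil

import (
	"context"
	"time"

	"github.com/coreos/etcd/clientv3"
	"github.com/coreos/etcd/clientv3/concurrency"
)

// putKey mirrors the PutKey change above: lease-bound writes for
// liveness keys, plain writes for data that must outlive the session.
func putKey(cli *clientv3.Client, sess *concurrency.Session,
	key, value string, timeout time.Duration, withLease bool) error {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	if withLease {
		// Bound to the session lease: the key is removed automatically
		// when the session (and thus the pserver) goes away.
		_, err := cli.Put(ctx, key, value, clientv3.WithLease(sess.Lease()))
		return err
	}
	// Plain put: suitable for checkpoint metadata.
	_, err := cli.Put(ctx, key, value)
	return err
}
```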

@ -32,6 +32,7 @@ type optimizer struct {
opt *C.struct_paddle_optimizer
elementType ElementType
contentLen int
config []byte
}
func cArrayToSlice(p unsafe.Pointer, len int) []byte {
@ -70,6 +71,7 @@ func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer
cstate = unsafe.Pointer(&s[0])
}
o.config = c
o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)),
C.paddle_element_type(p.ElementType), cbuffer, C.int(paramBufferSize), (*C.char)(cstate), C.int(len(s)))
return o

@ -25,11 +25,13 @@ import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"path"
"strconv"
"sync"
"time"
uuid "github.com/satori/go.uuid"
log "github.com/sirupsen/logrus"
)
@ -42,9 +44,9 @@ var ErrCheckpointNotFound = errors.New("checkpoint not found")
// RPC error message.
const (
AlreadyInitialized = "pserver already initialized"
Uninitialized = "pserver not fully initialized"
CheckpointMD5Failed = "checkpoint file MD5 validation failed"
AlreadyInitialized = "pserver already initialized"
Uninitialized = "pserver not fully initialized"
WrongChecksum = "checkpoint file checksum validation failed"
)
// Supported element types.
@ -73,11 +75,12 @@ type ParameterWithConfig struct {
// checkpointMeta saves checkpoint metadata
type checkpointMeta struct {
UUID string `json:"uuid"`
Path string `json:"path"`
MD5 string `json:"md5"`
Timestamp int64 `json:"timestamp"`
}
// Checkpoint is the pserver shard persist in file
// Checkpoint is the pserver shard persisted to a file.
type Checkpoint []parameterCheckpoint
// Gradient is the gradient of the parameter.
@ -90,50 +93,58 @@ type Service struct {
checkpointInterval time.Duration
checkpointPath string
client *EtcdClient
mu sync.Mutex
optMap map[string]*optimizer
mu sync.Mutex
optMap map[string]*optimizer
}
// parameterCheckpoint saves parameter checkpoint
// parameterCheckpoint saves parameter checkpoint.
type parameterCheckpoint struct {
ParameterWithConfig
State []byte
}
// NewCheckpointFromFile loads parameters and state from checkpoint file
func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (Checkpoint, error) {
v, err := e.GetKey(PsPath+string(idx), 3*time.Second)
func loadMeta(e *EtcdClient, idx int) (meta checkpointMeta, err error) {
v, err := e.GetKey(PsCheckpoint+strconv.Itoa(idx), 3*time.Second)
if err != nil {
return nil, err
return
}
if len(v) == 0 {
return nil, ErrCheckpointNotFound
err = ErrCheckpointNotFound
return
}
var cpMeta checkpointMeta
if err = json.Unmarshal(v, &cpMeta); err != nil {
return nil, err
if err = json.Unmarshal(v, &meta); err != nil {
return
}
fn := filepath.Join(cpPath, cpMeta.UUID)
if _, err = os.Stat(fn); os.IsNotExist(err) {
return
}
// LoadCheckpoint loads checkpoint from file.
func LoadCheckpoint(e *EtcdClient, idx int) (Checkpoint, error) {
cpMeta, err := loadMeta(e, idx)
if err != nil {
return nil, err
}
content, err := ioutil.ReadFile(fn)
content, err := ioutil.ReadFile(cpMeta.Path)
if err != nil {
return nil, err
}
// TODO(helin): change MD5 to CRC since CRC is better for file
// checksum in our use case (emphasize speed over security).
h := md5.New()
md5 := hex.EncodeToString(h.Sum(content))
if md5 != cpMeta.MD5 {
return nil, errors.New(CheckpointMD5Failed)
return nil, errors.New(WrongChecksum)
}
dec := gob.NewDecoder(bytes.NewReader(content))
cp := Checkpoint{}
if err = dec.Decode(cp); err != nil {
var cp Checkpoint
if err = dec.Decode(&cp); err != nil {
return nil, err
}
return cp, nil
@ -193,6 +204,15 @@ func (s *Service) FinishInitParams(_ int, _ *int) error {
}
close(s.initialized)
go func() {
t := time.Tick(s.checkpointInterval)
for range t {
err := s.checkpoint()
if err != nil {
log.Errorln(err)
}
}
}()
return nil
}
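The goroutine above drives periodic checkpointing with `time.Tick`, whose underlying ticker can never be stopped; that is acceptable for a loop that lives as long as the process. If a shutdown path were ever needed, a sketch along these lines would work (the `done` channel and the function are hypothetical, not part of this diff):

```go
package main

import (
	"log"
	"time"
)

// periodicCheckpoint runs checkpoint on every tick until done is closed.
func periodicCheckpoint(interval time.Duration, checkpoint func() error, done <-chan struct{}) {
	t := time.NewTicker(interval)
	defer t.Stop() // unlike time.Tick, this ticker is released on exit
	for {
		select {
		case <-t.C:
			if err := checkpoint(); err != nil {
				log.Println(err)
			}
		case <-done:
			return
		}
	}
}

func main() {
	done := make(chan struct{})
	go periodicCheckpoint(100*time.Millisecond, func() error {
		log.Println("checkpoint")
		return nil
	}, done)
	time.Sleep(350 * time.Millisecond)
	close(done) // stop the loop and release the ticker
}
```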
@ -240,23 +260,36 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
return nil
}
// pserver save checkpoint
func (s *Service) doCheckpoint() (err error) {
<-s.initialized
s.mu.Lock()
defer s.mu.Unlock()
func traceTime(start time.Time, name string) {
elapsed := time.Since(start)
log.Infof("%s took %v", name, elapsed)
}
// checkpoint saves checkpoint to disk.
//
// checkpoint should be only called after the parameters are
// initialized.
func (s *Service) checkpoint() (err error) {
log.Infoln("Begin save checkpoint.")
defer traceTime(time.Now(), "save checkpoint")
s.mu.Lock()
cp := make([]parameterCheckpoint, len(s.optMap))
index := 0
// TODO(helin): write checkpoint incrementally to reduce memory
// footprint during checkpoint.
for name, opt := range s.optMap {
var pc parameterCheckpoint
pc.Param.Name = name
pc.Param.ElementType = opt.elementType
pc.Param.Content = opt.GetWeights()
pc.Config = opt.config
pc.State = opt.GetStates()
cp[index] = pc
index++
}
s.mu.Unlock()
var buf bytes.Buffer
encoder := gob.NewEncoder(&buf)
err = encoder.Encode(cp)
@ -264,32 +297,9 @@ func (s *Service) doCheckpoint() (err error) {
return
}
cpMeta := checkpointMeta{}
cpMeta.UUID = s.checkpointPath + strconv.Itoa(s.idx)
cpMeta.Timestamp = time.Now().UnixNano()
h := md5.New()
cpMeta.MD5 = hex.EncodeToString(h.Sum(buf.Bytes()))
cpMetajson, err := json.Marshal(cpMeta)
if err != nil {
return
}
err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second)
if err != nil {
return
}
if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) {
log.Info("checkpoint does not exists.")
} else {
err = os.Remove(cpMeta.UUID)
if err != nil {
log.Infof("Removing checkpoint %s failed", cpMeta.UUID)
} else {
log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID)
}
}
f, err := os.Create(cpMeta.UUID)
id := uuid.NewV4().String()
p := path.Join(s.checkpointPath, id)
f, err := os.Create(p)
if err != nil {
return
}
@ -317,5 +327,43 @@ func (s *Service) doCheckpoint() (err error) {
return
}
oldMeta, err := loadMeta(s.client, s.idx)
if err == ErrCheckpointNotFound {
log.Infoln("Do not have existing checkpoint.")
err = nil
}
if err != nil {
return
}
h := md5.New()
md5 := hex.EncodeToString(h.Sum(buf.Bytes()))
cpMeta := checkpointMeta{
UUID: id,
Timestamp: time.Now().UnixNano(),
MD5: md5,
Path: p,
}
json, err := json.Marshal(cpMeta)
if err != nil {
return
}
err = s.client.PutKey(PsCheckpoint+strconv.Itoa(s.idx), json, 3*time.Second, false)
if err != nil {
return
}
if oldMeta.Path != "" {
rmErr := os.Remove(oldMeta.Path)
if rmErr != nil {
// log error, but still treat checkpoint as
// successful.
log.Errorln(rmErr)
}
}
return
}
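The rewritten checkpoint path above follows a write-new, publish, delete-old order: the snapshot is written under a fresh UUID-named file, its metadata is published to etcd, and only then is the previous file removed, so a crash at any step leaves at least one loadable checkpoint. A condensed sketch of that ordering (the `replaceCheckpoint` helper and its signature are hypothetical):

```go
package main

import (
	"io/ioutil"
	"log"
	"os"
	"path"
)

// replaceCheckpoint writes data under a fresh name, publishes the new
// path (e.g. to etcd), and deletes the old file last.
func replaceCheckpoint(dir, id string, data []byte,
	publish func(newPath string) error, oldPath string) error {
	p := path.Join(dir, id) // fresh name: never overwrite in place
	if err := ioutil.WriteFile(p, data, 0644); err != nil {
		return err
	}
	if err := publish(p); err != nil { // point readers at the new file
		return err
	}
	if oldPath != "" && oldPath != p {
		if err := os.Remove(oldPath); err != nil {
			// A stale file is left behind, but the checkpoint itself
			// succeeded, matching the diff's treatment of rmErr.
			log.Println(err)
		}
	}
	return nil
}

func main() {
	dir, err := ioutil.TempDir("", "cp")
	if err != nil {
		log.Fatal(err)
	}
	defer os.RemoveAll(dir)
	err = replaceCheckpoint(dir, "0001", []byte("state"),
		func(p string) error { log.Println("published", p); return nil }, "")
	if err != nil {
		log.Fatal(err)
	}
}
```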

@ -30,7 +30,7 @@ const (
func TestServiceFull(t *testing.T) {
var cp pserver.Checkpoint
s, err := pserver.NewService(0, 1, "", nil, cp)
s, err := pserver.NewService(0, time.Hour, "", nil, cp)
if err != nil {
t.Error(err)
}
@ -102,7 +102,7 @@ func TestServiceFull(t *testing.T) {
func TestMultipleInit(t *testing.T) {
var cp pserver.Checkpoint
s, err := pserver.NewService(0, 1, "", nil, cp)
s, err := pserver.NewService(0, time.Hour, "", nil, cp)
if err != nil {
t.Fatal(err)
}
@ -119,7 +119,7 @@ func TestMultipleInit(t *testing.T) {
func TestUninitialized(t *testing.T) {
var cp pserver.Checkpoint
s, err := pserver.NewService(0, 1, "", nil, cp)
s, err := pserver.NewService(0, time.Hour, "", nil, cp)
err = s.SendGrad(pserver.Gradient{}, nil)
if err.Error() != pserver.Uninitialized {
t.Fatal(err)
@ -128,7 +128,7 @@ func TestUninitialized(t *testing.T) {
func TestBlockUntilInitialized(t *testing.T) {
var cp pserver.Checkpoint
s, err := pserver.NewService(0, 1, "", nil, cp)
s, err := pserver.NewService(0, time.Hour, "", nil, cp)
if err != nil {
t.Error(err)
}

@ -22,7 +22,5 @@ if(WITH_C_API)
endif()
if(WITH_SWIG_PY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
add_subdirectory(api)
endif()

@ -82,9 +82,7 @@ SWIG_LINK_LIBRARIES(swig_paddle
add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp
COMMAND rm -rf py_paddle.egg-info build
COMMAND ${CMAKE_COMMAND} -E touch .timestamp
WORKING_DIRECTORY ${PROJ_ROOT}/paddle
DEPENDS _swig_paddle
)
@ -92,10 +90,6 @@ add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so
# TODO(yuyang18) : make wheel name calculated by cmake
add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so)
install(DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/dist/
DESTINATION opt/paddle/share/wheels
)
if(WITH_TESTING)
IF(NOT PY_PIP_FOUND)
SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip)
@ -108,7 +102,7 @@ if(WITH_TESTING)
BUILD_COMMAND ""
INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
BUILD_IN_SOURCE 1
DEPENDS python setuptools python_api_wheel
#DEPENDS python setuptools python_api_wheel
)
ENDIF()
add_subdirectory(test)

@ -39,6 +39,7 @@ set(CUDA_CU_SOURCES
src/hl_cuda_lstm.cu
src/hl_top_k.cu
src/hl_batch_transpose.cu
src/hl_batch_norm.cu
src/hl_cuda_sequence.cu
src/hl_table_apply.cu)

@ -0,0 +1,48 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_BATCH_NORM_H_
#define HL_BATCH_NORM_H_
#include "hl_base.h"
/**
* @brief batch norm inference.
*
* @param[in] input input data.
* @param[out] output output data.
* @param[in] scale batch normalization scale parameter (in the original
* paper, scale is referred to as gamma).
* @param[in] bias batch normalization bias parameter (in the original
* paper, bias is referred to as beta).
* @param[in] estimatedMean
* @param[in] estimatedVar The moving mean and variance
* accumulated during the training phase are passed
* as inputs here.
* @param[in] epsilon Epsilon value used in the batch
* normalization formula.
* @param[in] batchSize number of samples in the batch.
* @param[in] channel number of channels.
* @param[in] height feature map height.
* @param[in] width feature map width.
*/
extern void hl_batch_norm_cuda_inference(const real* input,
real* output,
const real* scale,
const real* bias,
const real* estimatedMean,
const real* estimatedVar,
const double epsilon,
size_t batchSize,
size_t channel,
size_t height,
size_t width);
#endif // HL_BATCH_NORM_H_

@ -0,0 +1,66 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_batch_norm.h"
__global__ void batchNormInference(real* output,
const real* input,
const real* scale,
const real* bias,
const real* estimatedMean,
const real* estimatedVar,
const double epsilon,
size_t batchSize,
size_t channel,
size_t height,
size_t width) {
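  // One thread block per sample; each thread strides over the
  // channel * height * width elements of that sample.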
const int tid = threadIdx.x;
const int num = channel * height * width;
const int batch = blockIdx.x;
for (int i = tid; i < num; i += blockDim.x) {
const int c = i / (height * width);
const int id = batch * num + i;
real val = input[id] - estimatedMean[c];
val /= sqrt(estimatedVar[c] + epsilon);
val *= scale[c];
val += bias[c];
output[id] = val;
}
}
void hl_batch_norm_cuda_inference(const real* input,
real* output,
const real* scale,
const real* bias,
const real* estimatedMean,
const real* estimatedVar,
const double epsilon,
size_t batchSize,
size_t channel,
size_t height,
size_t width) {
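  // One block per sample, 256 threads per block, on the default stream.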
batchNormInference<<<batchSize, 256, 0, STREAM_DEFAULT>>>(output,
input,
scale,
bias,
estimatedMean,
estimatedVar,
epsilon,
batchSize,
channel,
height,
width);
CHECK_SYNC("hl_batch_norm_cuda_inference failed!");
}

@ -1023,14 +1023,6 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
int batch_size = ((cudnn_tensor_descriptor)inputDesc)->batch_size;
if (batch_size > 1024 && g_cudnn_lib_version < 6000) {
LOG(INFO) << " To process current batch data with size " << batch_size
<< " (>1024), cudnnBatchNorm requires cuDNN version >= 6000."
<< " If there is an error complaining CUDNN_STATUS_NOT_SUPPORTED,"
<< " just recompile PaddlePaddle with cuDNN >= 6000, replacing"
<< " current version " << g_cudnn_lib_version;
}
CHECK_CUDNN(
dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
mode,

@ -35,6 +35,8 @@ add_dependencies(framework_py_proto framework_py_proto_init)
cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward)
if(WITH_PYTHON)
cc_library(paddle_pybind SHARED
SRCS pybind.cc
DEPS pybind python backward
@ -43,4 +45,6 @@ cc_library(paddle_pybind SHARED
add_op
mean_op
cross_entropy_op
fill_zeros_like_op
recurrent_op)
endif(WITH_PYTHON)

@ -260,6 +260,12 @@ class OpRegistry {
return CreateOp(op_desc.type(), inputs, outputs, attrs);
}
static bool SupportGPU(const std::string& op_type) {
OperatorWithKernel::OpKernelKey key;
key.place_ = platform::GPUPlace();
return OperatorWithKernel::AllOpKernels().at(op_type).count(key) != 0;
}
static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) {
PADDLE_ENFORCE(!op.IsNetOp(),
"Use framework::Backward to get backward ops");

@ -34,8 +34,8 @@ ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
#endif
const std::string& OperatorBase::Input(const std::string& name) const {
PADDLE_ENFORCE(in_out_idxs_ != nullptr,
"Input Output Indices could not be nullptr");
PADDLE_ENFORCE_NOT_NULL(in_out_idxs_,
"Input Output Indices could not be nullptr");
auto it = in_out_idxs_->find(name);
PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
name);
@ -49,7 +49,7 @@ const std::string& OperatorBase::Input(const std::string& name) const {
}
std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
PADDLE_ENFORCE(in_out_idxs_ != nullptr, "IO Idx could not be nullptr");
PADDLE_ENFORCE_NOT_NULL(in_out_idxs_, "IO Idx could not be nullptr");
auto input_format = GetAttr<std::vector<int>>("input_format");
auto offset = in_out_idxs_->at(name);
PADDLE_ENFORCE(input_format.at(static_cast<size_t>(offset) + 1) <=
@ -62,7 +62,7 @@ std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
}
const std::string& OperatorBase::Output(const std::string& name) const {
PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr");
PADDLE_ENFORCE_NOT_NULL(in_out_idxs_, "InOut Indice could not be nullptr");
auto it = in_out_idxs_->find(name);
PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
name);
@ -76,7 +76,7 @@ const std::string& OperatorBase::Output(const std::string& name) const {
}
std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr");
PADDLE_ENFORCE_NOT_NULL(in_out_idxs_, "InOut Indice could not be nullptr");
auto output_format = GetAttr<std::vector<int>>("output_format");
auto offset = in_out_idxs_->at(name);
PADDLE_ENFORCE(output_format.at(static_cast<size_t>(offset) + 1) <=

Some files were not shown because too many files changed in this diff.
