Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_glog

8 years ago · ce0d67040d
parent c78f41a331 8e8f3601a5
commit ce0d67040d
54 changed files with 474 additions and 1319 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -21,3 +21,10 @@
    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
    hooks:
    -   id: clang-formater
+-   repo: https://github.com/dnephin/pre-commit-golang
+    sha: e4693a4c282b4fc878eda172a929f7a6508e7d16
+    hooks:
+      -   id: go-fmt
+          files: (.*\.go)
+      -   id: go-lint
+          files: (.*\.go)
--- a/.travis.yml
+++ b/.travis.yml
@ -33,16 +33,17 @@ addons:
      - ccache
 before_install:
  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python 
+  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
  # protobuf version.
  - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
  - pip install rarfile
+  - curl https://glide.sh/get | bash
  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
  - |
-    export WITH_GOLANG=ON && timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
+    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
 notifications:
  email:
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@ -17,6 +17,65 @@ INCLUDE(ExternalProject)
 FIND_PACKAGE(Protobuf QUIET)
 SET(PROTOBUF_FOUND "OFF")

+if(NOT COMMAND protobuf_generate_python)  # before cmake 3.4, protobuf_genrerate_python is not defined.
+    function(protobuf_generate_python SRCS)
+        # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
+        if(NOT ARGN)
+            message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
+            return()
+        endif()
+
+        if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
+            # Create an include path for each file specified
+            foreach(FIL ${ARGN})
+                get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+                get_filename_component(ABS_PATH ${ABS_FIL} PATH)
+                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+                if(${_contains_already} EQUAL -1)
+                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
+                endif()
+            endforeach()
+        else()
+            set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
+        endif()
+
+        if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
+            set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
+        endif()
+
+        if(DEFINED Protobuf_IMPORT_DIRS)
+            foreach(DIR ${Protobuf_IMPORT_DIRS})
+                get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
+                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+                if(${_contains_already} EQUAL -1)
+                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
+                endif()
+            endforeach()
+        endif()
+
+        set(${SRCS})
+        foreach(FIL ${ARGN})
+            get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+            get_filename_component(FIL_WE ${FIL} NAME_WE)
+            if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
+                get_filename_component(FIL_DIR ${FIL} DIRECTORY)
+                if(FIL_DIR)
+                    set(FIL_WE "${FIL_DIR}/${FIL_WE}")
+                endif()
+            endif()
+
+            list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
+            add_custom_command(
+                    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
+                    COMMAND  ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
+                    DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE}
+                    COMMENT "Running Python protocol buffer compiler on ${FIL}"
+                    VERBATIM )
+        endforeach()
+
+        set(${SRCS} ${${SRCS}} PARENT_SCOPE)
+    endfunction()
+endif()

 # Print and set the protobuf library information,
 # finish this cmake process and exit from this file.
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -99,25 +99,44 @@ function(merge_static_libs TARGET_NAME)
  set(libs ${ARGN})
  list(REMOVE_DUPLICATES libs)

-  # First get the file names of the libraries to be merged
+  # Get all propagation dependencies from the merged libraries
  foreach(lib ${libs})
-    set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
  endforeach()

  if(APPLE) # Use OSX's libtool to merge archives
+    # To produce a library we need at least one source file.
+    # It is created by add_custom_command below and will helps 
+    # also help to track dependencies.
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+
+    # Make the generated dummy source file depended on all static input
+    # libs. If input lib changes,the source file is touched
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${dummyfile}
+      COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+      DEPENDS ${libs})
+
+    # Generate dummy staic lib
    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
    add_library(${TARGET_NAME} STATIC ${dummyfile})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+    foreach(lib ${libs})
+      # Get the file names of the libraries to be merged
+      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+    endforeach()
 		add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
-	else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
    foreach(lib ${libs})
      set(objlistfile ${lib}.objlist) # list of objects in the input library
      set(objdir ${lib}.objdir)

      add_custom_command(OUTPUT ${objdir}
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir})
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
+        DEPENDS ${lib})

      add_custom_command(OUTPUT ${objlistfile}
        COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
@ -125,27 +144,27 @@ function(merge_static_libs TARGET_NAME)
        DEPENDS ${lib} ${objdir}
        WORKING_DIRECTORY ${objdir})

-      # Empty dummy source file that goes into merged library
-      set(mergebase ${lib}.mergebase.c)
-      add_custom_command(OUTPUT ${mergebase}
-        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
-        DEPENDS ${objlistfile})
+      # Empty dummy source file that goes into merged library		
+      set(mergebase ${lib}.mergebase.c)		
+      add_custom_command(OUTPUT ${mergebase}		
+        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}		
+        DEPENDS ${objlistfile})		

      list(APPEND mergebases "${mergebase}")
    endforeach()

-    # We need a target for the output merged library
    add_library(${TARGET_NAME} STATIC ${mergebases})
+    target_link_libraries(${TARGET_NAME} ${libs_deps}) 
+
+    # Get the file name of the generated library
    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")

    foreach(lib ${libs})
      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${lib}.objlist"
-      WORKING_DIRECTORY ${lib}.objdir)
+        COMMAND ${CMAKE_AR} cr ${outlibfile} *.o  
+        COMMAND ${CMAKE_RANLIB} ${outlibfile}
+        WORKING_DIRECTORY ${lib}.objdir)
    endforeach()
-
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_RANLIB} ${outlibfile})
  endif()
 endfunction(merge_static_libs)

@ -194,7 +213,7 @@ function(cc_test TARGET_NAME)
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
-    add_test(${TARGET_NAME} ${TARGET_NAME})
+    add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()
 endfunction(cc_test)

@ -281,10 +300,11 @@ function(go_library TARGET_NAME)

  file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  # FIXME: link path
  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
    COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
    # Golang build source code
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
+    COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
    -o "${${TARGET_NAME}_LIB_PATH}"
    "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
    # must run under GOPATH
@ -299,11 +319,13 @@ function(go_binary TARGET_NAME)
  cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})

+  # FIXME: link path
  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
+      COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH}
+      GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
    -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
    "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}"
-  WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
+    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
  # TODO: don't know what ${TARGET_NAME}_link does
  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS})
  install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
@ -332,3 +354,12 @@ function(proto_library TARGET_NAME)
  protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
 endfunction()
+
+function(py_proto_compile TARGET_NAME)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS)
+  cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(py_srcs)
+  protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
+  add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
+endfunction()
--- a/go/cmd/master/CMakeLists.txt
+++ b/go/cmd/master/CMakeLists.txt
@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-go_binary(master SRC master.go)
+go_binary(master SRC master.go DEPS paddle_go_optimizer)
--- a/go/cmd/pserver/CMakeLists.txt
+++ b/go/cmd/pserver/CMakeLists.txt
@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-go_binary(pserver SRCS pserver.go)
+go_binary(pserver SRCS pserver.go DEPS paddle_go_optimizer)
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@ -104,11 +104,22 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int
 	return C.PADDLE_MASTER_OK
 }

+// return value:
+//     0:ok
+//    -1:error
 //export paddle_next_record
 func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 	c := get(client)
-	r := c.NextRecord()
+	r, err := c.NextRecord()
+	if err != nil {
+		// Error
+		// TODO: return the type of error?
+		*record = (*C.uchar)(nullPtr)
+		return -1
+	}
+
 	if len(r) == 0 {
+		// Empty record
 		*record = (*C.uchar)(nullPtr)
 		return 0
 	}
--- a/go/master/client.go
+++ b/go/master/client.go
@ -11,7 +11,12 @@ import (
 // Client is the client of the master server.
 type Client struct {
 	conn *connection.Conn
-	ch   chan []byte
+	ch   chan record
+}
+
+type record struct {
+	r   []byte
+	err error
 }

 // NewClient creates a new Client.
@ -21,7 +26,7 @@ type Client struct {
 func NewClient(addrCh <-chan string, bufSize int) *Client {
 	c := &Client{}
 	c.conn = connection.New()
-	c.ch = make(chan []byte, bufSize)
+	c.ch = make(chan record, bufSize)
 	go c.monitorMaster(addrCh)
 	go c.getRecords()
 	return c
@ -46,10 +51,11 @@ func (c *Client) getRecords() {

 			s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
 			for s.Scan() {
-				c.ch <- s.Record()
+				c.ch <- record{s.Record(), nil}
 			}

 			if s.Err() != nil {
+				c.ch <- record{nil, s.Err()}
 				log.Errorln(err, chunk.Path)
 			}

@ -116,6 +122,7 @@ func (c *Client) taskFinished(taskID int) error {
 //
 // NextRecord will block until the next record is available. It is
 // thread-safe.
-func (c *Client) NextRecord() []byte {
-	return <-c.ch
+func (c *Client) NextRecord() ([]byte, error) {
+	r := <-c.ch
+	return r.r, r.err
 }
--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@ -68,12 +68,17 @@ func TestNextRecord(t *testing.T) {
 	for pass := 0; pass < 50; pass++ {
 		received := make(map[byte]bool)
 		for i := 0; i < total; i++ {
-			r := c.NextRecord()
+			r, err := c.NextRecord()
+			if err != nil {
+				t.Fatal(pass, i, "Read error:", err)
+			}
+
 			if len(r) != 1 {
-				t.Fatal("Length should be 1.", r)
+				t.Fatal(pass, i, "Length should be 1.", r)
 			}
+
 			if received[r[0]] {
-				t.Fatal("Received duplicate.", received, r)
+				t.Fatal(pass, i, "Received duplicate.", received, r)
 			}
 			received[r[0]] = true
 		}
--- a/go/pserver/client/c/CMakeLists.txt
+++ b/go/pserver/client/c/CMakeLists.txt
@ -1,5 +1,8 @@
 cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
+target_link_libraries(paddle_go_optimizer stdc++ m)
 go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
 if(WITH_TESTING)
-  add_subdirectory(test)
+  # FIXME: this test requires pserver which is not managed by the test
+  # we need some kind of e2e testing machanism.
+  # add_subdirectory(test)
 endif()
--- a/go/pserver/client/c/test/CMakeLists.txt
+++ b/go/pserver/client/c/test/CMakeLists.txt
@ -1,2 +1,2 @@
-cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient)
+cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
 add_style_check_target(test_cclient test_cclient.c)
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@ -2,7 +2,7 @@ package pserver

 // #cgo CFLAGS: -I ../../
 // //FIXME: ldflags contain "build" path
-// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
+// #cgo LDFLAGS: ${SRCDIR}/../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
 // #include "paddle/optimizer/optimizer.h"
 // #include <stdlib.h>
 // #include <string.h>
@ -56,8 +56,8 @@ func newOptimizer(paramWithConfigs ParameterWithConfig) *optimizer {

 func (o *optimizer) GetWeights() []byte {
 	var buffer unsafe.Pointer
-	buffer_len := C.paddle_optimizer_get_weights(o.opt, &buffer)
-	return cArrayToSlice(buffer, int(buffer_len)*C.sizeof_float)
+	bufferLen := C.paddle_optimizer_get_weights(o.opt, &buffer)
+	return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
 }

 func (o *optimizer) UpdateParameter(g Gradient) error {
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@ -10,8 +10,10 @@ import (
 type ElementType int

 const (
+	// AlreadyInitialized is true if pserver is initialized
 	AlreadyInitialized = "pserver already initialized"
-	Uninitialized      = "pserver not fully initialized"
+	// Uninitialized is true if pserver not fully initialized
+	Uninitialized = "pserver not fully initialized"
 )

 // Supported element types
@ -55,7 +57,7 @@ func NewService(idx int) (*Service, error) {
 	s := &Service{
 		idx: idx,
 	}
-  s.optMap = make(map[string]*optimizer)
+	s.optMap = make(map[string]*optimizer)
 	s.initialized = make(chan struct{})
 	return s, nil
 }
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@ -66,6 +66,7 @@ SWIG_LINK_LIBRARIES(swig_paddle
    paddle_trainer_lib
    paddle_network
    paddle_parameter
+    paddle_optimizer
    paddle_math
    paddle_utils
    paddle_proto
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -9,6 +9,9 @@ cc_test(enforce_test SRCS enforce_test.cc)
 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
-
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
+py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
+# Generate an empty __init__.py to make framework_py_proto as a valid python module.
+add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
+add_dependencies(framework_py_proto framework_py_proto_init)
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@ -25,6 +25,10 @@ namespace paddle {
 * If SequenceLevel = kNonSeq:
 *    Output: output size is the number of input sequences (NOT input instances)
 *    output[i] = average_{for each instance in this sequence}{input[i]}
+ *    If stride_ > 0:
+ *      Output: a shorten sequence. Stride is the step size by which we slide a
+ *              window upon the input sequence, and the average pooling
+ *              operation is then applied to each interval independently.
 * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences
--- a/paddle/gserver/layers/CrossChannelNormLayer.cpp
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
@ -36,6 +36,16 @@ MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
      data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_);
 }

+bool CrossChannelNormLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK(parameters_[0]);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  channels_ = conf.channels();
+  scale_.reset(new Weight(channels_, 1, parameters_[0]));
+  return true;
+}
+
 void CrossChannelNormLayer::forward(PassType passType) {
  Layer::forward(passType);
  MatrixPtr inV = getInputValue(0);
@ -51,9 +61,7 @@ void CrossChannelNormLayer::forward(PassType passType) {
  Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
  Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
  Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
-  normBuffer_->zeroMem();
-  // add eps to avoid overflow
-  normBuffer_->addScalar(*normBuffer_, 1e-6);
+
  inV->square2(*dataBuffer_);
  for (size_t i = 0; i < batchSize; i++) {
    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
@ -63,6 +71,8 @@ void CrossChannelNormLayer::forward(PassType passType) {

    // compute norm.
    spatialBuffer_->sumCols(*dataTmp, 1, 0);
+    // add eps to avoid overflow
+    spatialBuffer_->add(1e-6);
    spatialBuffer_->sqrt2(*spatialBuffer_);
    normTmp->copyFrom(*spatialBuffer_);
    outVTmp->copyFrom(*inVTmp);
@ -82,6 +92,9 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
  size_t dataDim = inG->getWidth();
  size_t spatialDim = dataDim / channels_;

+  MatrixPtr inGBuffer;
+  Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_);
+
  dataBuffer_->dotMul(*outG, *outV);
  Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
@ -100,22 +113,24 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
    scaleDiff_->add(*channelBuffer_, 1.);

    sampleBuffer_->dotMul(*inVTmp, *outGTmp);
-    spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.);
+    spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.);
    // scale the grad
-    inGTmp->copyFrom(*inVTmp);
-    inGTmp->mulRowVector(*spatialBuffer_);
+    inGBuffer->copyFrom(*inVTmp);
+    inGBuffer->mulRowVector(*spatialBuffer_);
    // divide by square of norm
    spatialBuffer_->dotMul(*normTmp, *normTmp);
-    inGTmp->divRowVector(*spatialBuffer_);
+    inGBuffer->divRowVector(*spatialBuffer_);
    // subtract
-    inGTmp->add(*outGTmp, -1, 1);
+    inGBuffer->add(*outGTmp, -1, 1);
    // divide by norm
-    inGTmp->divRowVector(*normTmp);
+    inGBuffer->divRowVector(*normTmp);
    // scale the diff
-    inGTmp->mulColVector(*scale_->getW());
+    inGBuffer->mulColVector(*scale_->getW());
+
+    inGTmp->add(*inGBuffer);
  }
  // updata scale
-  if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
+  if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_);
  scale_->getParameterPtr()->incUpdate(callback);
 }

--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@ -26,6 +26,10 @@ namespace paddle {
 * If SequenceLevel = kNonSeq:
 *    Output: output size is the number of input sequences (NOT input instances)
 *    output[i] = max_{for each instance in this sequence}{input[i]}
+ *    If stride_ > 0:
+ *      Output: a shorten sequence. Stride is the step size by which we slide a
+ *              window upon the input sequence, and the max pooling operation is
+ *              then applied to each interval independently.
 * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences
--- a/paddle/gserver/layers/NormLayer.cpp
+++ b/paddle/gserver/layers/NormLayer.cpp
@ -56,14 +56,4 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
  return true;
 }

-bool CrossChannelNormLayer::init(const LayerMap& layerMap,
-                                 const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK(parameters_[0]);
-  const NormConfig& conf = config_.inputs(0).norm_conf();
-  channels_ = conf.channels();
-  scale_.reset(new Weight(channels_, 1, parameters_[0]));
-  return true;
-}
-
 }  // namespace paddle
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@ -26,10 +26,9 @@ namespace paddle {
 * If SequenceLevel = kNonseq:
 *   Output: a sequence containing only the last instance of the input sequence
 *   If stride_ > 0:
- *      Output: a shorten sequence. The operation of getting last instance of a
- *              sequence is independently performed on every slice of the input
- *              sequence, which is obtained by sliding a window with the window
- *              size set to stride_.
+ *      Output: a shorten sequence. Stride is the step size by which we slide a
+ *              window upon the input sequence, and getting last instance
+ *              operation is then applied to each interval independently.
 * If SequenceLevel = kSeq:
 *   Check input sequence must has sub-sequence
 *   Output: a sequence containing only the last instance of each sub-sequence
@ -73,8 +72,7 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
 void SequenceLastInstanceLayer::forward(PassType passType) {
  SequencePoolLayer::forward(passType);

-  auto starts = (stride_ > 0) ? stridePositions_->getData()
-                              : startPositions_->getData(false);
+  auto starts = startPositions_->getData(false);
  MatrixPtr inputValue = getInputValue(0);
  MatrixPtr outputValue = getOutputValue();

--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@ -72,9 +72,8 @@ void SequencePoolLayer::forward(PassType passType) {
  if (stride_ > 0) {
    CHECK_EQ(input.hasSubseq(), 0UL)
        << "sequence stride pooling is invalid for hasSubseq now";
-    output_.poolSequenceWithStride(
-        input, stride_, &stridePositions_, reversed_);
-    newBatchSize_ = stridePositions_->getSize() - 1;
+    output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_);
+    newBatchSize_ = startPositions_->getSize() - 1;
  }

  resetOutput(newBatchSize_, dim);
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@ -28,8 +28,9 @@ namespace paddle {
 * sequence}{input[i]}
 *    If stride_ > 0:
 *        Check input sequence must not have sub-sequence
- *        Output: a shorten sequence, pooling is performed upon a small local
- *                area
+ *        Output: a shorten sequence. Stride is the step size by which we slide
+ *                a window upon the input sequence, and the pooling operation
+ *                is then applied to each interval independently.
 * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences
@ -47,8 +48,6 @@ protected:
  size_t newBatchSize_;
  ICpuGpuVectorPtr startPositions_;
  int stride_;
-  // Store the start position of each window.
-  IVectorPtr stridePositions_;
  // Whether the input sequence is reversed or not.
  bool reversed_ = false;

--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer,

    std::vector<Argument> args;
    args.push_back(out);
-    EXPECT_EQ(0, Argument::sum(args)) << "testBatchState failed";
+    ASSERT_NEAR(0, Argument::sum(args), 1e-5) << "testBatchState failed";
    for (size_t seqId = 0; seqId < numSequences; ++seqId) {
      start[seqId] += seqLens[seqId];
    }
@ -465,7 +465,6 @@ void initTestLayer(TestConfig testConf,
                           ParameterConfig paraConfig) {
    paraConfig.set_name(paraName);
    paraConfig.set_size(paraSize);
-    paraConfig.set_initial_std(1);
    paraConfig.set_is_static(isStatic);
    auto para =
        std::make_shared<Parameter>(paraConfig, FLAGS_use_gpu, initialize);
@ -499,6 +498,9 @@ void initTestLayer(TestConfig testConf,
        paraConfig.add_dims((*layerMap)[input.input_layer_name()]->getSize());
        paraConfig.add_dims(testConf.layerConfig.size());
      }
+      CHECK_GE(testConf.paramInitialStd, 0);
+      paraConfig.set_initial_mean(testConf.paramInitialMean);
+      paraConfig.set_initial_std(testConf.paramInitialStd);
      initParameter(paraName, paraSize, inputDef.isStatic, false, paraConfig);
    }
  }
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
@ -125,12 +125,16 @@ struct TestConfig {
  LayerConfig layerConfig;
  std::vector<InputDef> inputDefs;
  size_t biasSize;
+  real paramInitialMean;
+  real paramInitialStd;
  bool testAccumulate;
  bool testState;
  bool staticBias;
  bool testBatchState;
  TestConfig()
      : biasSize(0),
+        paramInitialMean(0.0),
+        paramInitialStd(1.0),
        testAccumulate(true),
        testState(false),
        staticBias(false),
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@ -845,8 +845,12 @@ void testDegradeLayer(bool hasSubseq,

 TEST(Layer, MaxLayer) {
  testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
-  testDegradeLayer(true, "max", "non-seq", -1);   // hasSubseq max to non-seq
-  testDegradeLayer(true, "max", "seq", -1);       // hasSubseq max to seq
+  testDegradeLayer(false,
+                   "max",
+                   "non-seq",
+                   5);  // seq max to a shorten seq, stride window = 5
+  testDegradeLayer(true, "max", "non-seq", -1);  // hasSubseq max to non-seq
+  testDegradeLayer(true, "max", "seq", -1);      // hasSubseq max to seq
 }

 TEST(Layer, SequenceLastInstanceLayer) {
@ -868,6 +872,10 @@ TEST(Layer, SequenceLastInstanceLayer) {

 TEST(Layer, AverageLayer) {
  testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
+  testDegradeLayer(false,
+                   "average",
+                   "non-seq",
+                   5);  // seq average to a shorten seq, stride window = 5
  testDegradeLayer(
      true, "average", "non-seq", -1);           // hasSubseq average to non-seq
  testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
@ -1661,6 +1669,8 @@ TEST(Layer, PadLayer) {

 TEST(Layer, CrossChannelNormLayer) {
  TestConfig config;
+  config.paramInitialMean = 1.;
+  config.paramInitialStd = 0.;
  config.layerConfig.set_type("norm");
  config.layerConfig.set_size(100);
  LayerInputConfig* input = config.layerConfig.add_inputs();
@ -1674,7 +1684,7 @@ TEST(Layer, CrossChannelNormLayer) {
  config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});

  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5);
+    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false);
  }
 }

--- a/Show More
+++ b/Show More