Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into improve_pruning

8 years ago · 15bf6e05b5
parent 1a82e7da9e 7bce40d7be
commit 15bf6e05b5
90 changed files with 2573 additions and 1057 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -47,6 +47,7 @@ option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
+option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)

 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@ -107,6 +108,7 @@ include(configure)          # add paddle env configuration
 include_directories("${PROJ_ROOT}")
 include_directories("${PROJ_ROOT}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
+include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/cclient")

 set(EXTERNAL_LIBS
    ${GFLAGS_LIBRARIES}
@ -126,9 +128,12 @@ endif(WITH_GPU)

 add_subdirectory(proto)
 add_subdirectory(paddle)
-add_subdirectory(go/master/c)
 add_subdirectory(python)
-add_subdirectory(go/pserver/cclient)
+
+if(WITH_GOLANG)
+    # TODO: add go/master/c back once it is fixed.
+    add_subdirectory(go/pserver/cclient)
+endif(WITH_GOLANG)

 if(WITH_DOC)
    add_subdirectory(doc)
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -40,6 +40,10 @@ if(NOT CMAKE_CROSSCOMPILING)
    endif()
 endif()

+if(NOT WITH_GOLANG)
+    add_definitions(-DPADDLE_WITHOUT_GOLANG)
+endif(NOT WITH_GOLANG)
+
 if(NOT WITH_GPU)
    add_definitions(-DPADDLE_ONLY_CPU)
    add_definitions(-DHPPL_STUB_FUNC)
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@ -84,6 +84,7 @@ function(link_paddle_exe TARGET_NAME)
        paddle_parameter
        paddle_proto
        paddle_cuda
+        paddle_optimizer
        ${EXTERNAL_LIBS}
        ${CMAKE_THREAD_LIBS_INIT}
        ${CMAKE_DL_LIBS}
--- a/go/pserver/cclient/CMakeLists.txt
+++ b/go/pserver/cclient/CMakeLists.txt
@ -11,13 +11,4 @@ include(flags)

 go_library(paddle_pserver_cclient STATIC)

-if(PROJ_ROOT)
-  add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/trainer/libpaddle_pserver_cclient.a
-          COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_pserver_cclient.h ${PROJ_ROOT}/paddle/trainer/
-          COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_pserver_cclient.a ${PROJ_ROOT}/paddle/trainer/
-          WORKING_DIRECTORY ${PROJ_ROOT}/paddle
-          DEPENDS paddle_pserver_cclient)
-  add_custom_target(paddle_pserver_cclient_lib ALL DEPENDS ${PROJ_ROOT}/paddle/trainer/libpaddle_pserver_cclient.a)
-endif(PROJ_ROOT)
-
 add_subdirectory(test)
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@ -8,6 +8,7 @@ add_subdirectory(gserver)
 add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
+add_subdirectory(optimizer)
 add_subdirectory(strings)

 # Do not build go directory until go cmake is working smoothly.
@ -19,8 +20,8 @@ find_package(Boost QUIET)

 if(Boost_FOUND)
  include_directories(${Boost_INCLUDE_DIRS})
-  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-  add_subdirectory(majel)
+  add_subdirectory(platform)
+  add_subdirectory(framework)
 endif()

 if(WITH_C_API)
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@ -16,7 +16,7 @@ set(API_HEADER
    Internal.h)

 add_library(paddle_api STATIC ${API_SOURCES})
-add_dependencies(paddle_api gen_proto_cpp paddle_pserver_cclient_lib)
+add_dependencies(paddle_api gen_proto_cpp paddle_trainer_lib)

 INCLUDE(${SWIG_USE_FILE})
 INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@ -842,7 +842,8 @@ public:
                                               int passCount,
                                               bool useSparseUpdater);
  static ParameterUpdater* createNewRemoteUpdater(
-      OptimizationConfig* config, const std::string pserverSpec);
+      OptimizationConfig* config,
+      const std::string pserverSpec) throw(UnsupportError);
  ~ParameterUpdater();

  /**
--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@ -15,7 +15,9 @@ limitations under the License. */
 #include "PaddleAPI.h"

 #include "PaddleAPIPrivate.h"
+#ifndef PADDLE_WITHOUT_GOLANG
 #include "paddle/trainer/NewRemoteParameterUpdater.h"
+#endif
 #include "paddle/trainer/RemoteParameterUpdater.h"
 #include "paddle/trainer/ThreadParameterUpdater.h"

@ -30,11 +32,16 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
 }

 ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
-    OptimizationConfig *config, const std::string pserverSpec) {
+    OptimizationConfig *config,
+    const std::string pserverSpec) throw(UnsupportError) {
+#ifndef PADDLE_WITHOUT_GOLANG
  auto updater = new ParameterUpdater();
  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
      config->m->getConfig(), pserverSpec));
  return updater;
+#else
+  throw UnsupportError();
+#endif
 }

 ParameterUpdater *ParameterUpdater::createRemoteUpdater(
--- a/paddle/framework/.clang-format
+++ b/paddle/framework/.clang-format
@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+Standard:  Cpp11 
+...
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -0,0 +1,4 @@
+cc_library(ddim SRCS ddim.cc)
+cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
+
+nv_test(dim_test SRCS dim_test.cu DEPS ddim)
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@ -1,6 +1,7 @@
-#include "paddle/majel/ddim.h"
+#include "paddle/framework/ddim.h"

-namespace majel {
+namespace paddle {
+namespace framework {

 ///@cond HIDDEN

@ -66,7 +67,7 @@ DDim make_ddim(const std::vector<int>& dims) {
 ///@cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
 class DynamicMutableIndexer : public boost::static_visitor<int&> {
-public:
+ public:
  DynamicMutableIndexer(int idx) : idx_(idx) {}

  template <int D>
@ -74,12 +75,12 @@ public:
    return dim[idx_];
  }

-private:
+ private:
  int idx_;
 };

 class DynamicConstIndexer : public boost::static_visitor<int> {
-public:
+ public:
  DynamicConstIndexer(int idx) : idx_(idx) {}

  template <int D>
@ -87,7 +88,7 @@ public:
    return dim[idx_];
  }

-private:
+ private:
  int idx_;
 };

@ -213,10 +214,11 @@ struct DDimPrinter : boost::static_visitor<void> {

 ///\endcond

-std::ostream& operator<<(std::ostream& os, const majel::DDim& ddim) {
+std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
  DDimPrinter printer(os);
  boost::apply_visitor(printer, ddim);
  return os;
 }

-}  // namespace majel
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@ -5,20 +5,14 @@
 #include <stdexcept>
 #include <vector>

-#include "paddle/majel/dim.h"
+#include "paddle/framework/dim.h"

-namespace majel {
+namespace paddle {
+namespace framework {

 namespace {
-typedef boost::variant<Dim<1>,
-                       Dim<2>,
-                       Dim<3>,
-                       Dim<4>,
-                       Dim<5>,
-                       Dim<6>,
-                       Dim<7>,
-                       Dim<8>,
-                       Dim<9>>
+typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
+                       Dim<8>, Dim<9>>
    DDimVar;
 }

@ -95,14 +89,15 @@ ssize_t product(const DDim& ddim);

 int arity(const DDim& ddim);

-std::ostream& operator<<(std::ostream&, const majel::DDim&);
+std::ostream& operator<<(std::ostream&, const DDim&);

-}  // namespace majel
+}  // namespace framework
+}  // namespace paddle

 namespace boost {

 template <typename T>
-T get(const majel::DDim& in) {
+T get(const paddle::framework::DDim& in) {
  return boost::get<T>(in.var);
 }

--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
@ -4,18 +4,18 @@
 #include <vector>

 #include "gtest/gtest.h"
-#include "paddle/majel/ddim.h"
+#include "paddle/framework/ddim.h"

 TEST(DDim, Equality) {
  // construct a DDim from an initialization list
-  majel::DDim ddim = majel::make_ddim({9, 1, 5});
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({9, 1, 5});
  EXPECT_EQ(ddim[0], 9);
  EXPECT_EQ(ddim[1], 1);
  EXPECT_EQ(ddim[2], 5);

  // construct a DDim from a vector
  std::vector<int> vec({9, 1, 5});
-  majel::DDim vddim = majel::make_ddim(vec);
+  paddle::framework::DDim vddim = paddle::framework::make_ddim(vec);
  EXPECT_EQ(ddim[0], 9);
  EXPECT_EQ(ddim[1], 1);
  EXPECT_EQ(ddim[2], 5);
@ -23,43 +23,43 @@ TEST(DDim, Equality) {
  // mutate a DDim
  ddim[1] = 2;
  EXPECT_EQ(ddim[1], 2);
-  majel::set(ddim, 0, 6);
-  EXPECT_EQ(majel::get(ddim, 0), 6);
+  paddle::framework::set(ddim, 0, 6);
+  EXPECT_EQ(paddle::framework::get(ddim, 0), 6);

  // vectorize a DDim
-  std::vector<int> res_vec = majel::vectorize(vddim);
+  std::vector<int> res_vec = paddle::framework::vectorize(vddim);
  EXPECT_EQ(res_vec[0], 9);
  EXPECT_EQ(res_vec[1], 1);
  EXPECT_EQ(res_vec[2], 5);
-  majel::Dim<3> d(3, 2, 1);
-  res_vec = majel::vectorize(majel::DDim(d));
+  paddle::framework::Dim<3> d(3, 2, 1);
+  res_vec = paddle::framework::vectorize(paddle::framework::DDim(d));
  EXPECT_EQ(res_vec[0], 3);
  EXPECT_EQ(res_vec[1], 2);
  EXPECT_EQ(res_vec[2], 1);

  // add two DDims
-  majel::DDim ddim_sum = ddim + vddim;
+  paddle::framework::DDim ddim_sum = ddim + vddim;
  EXPECT_EQ(ddim_sum[0], 15);
  EXPECT_EQ(ddim_sum[1], 3);
  EXPECT_EQ(ddim_sum[2], 10);

  // multiply two DDims
-  majel::DDim ddim_mul = ddim * vddim;
+  paddle::framework::DDim ddim_mul = ddim * vddim;
  EXPECT_EQ(ddim_mul[0], 54);
  EXPECT_EQ(ddim_mul[1], 2);
  EXPECT_EQ(ddim_mul[2], 25);

  // arity of a DDim
-  EXPECT_EQ(majel::arity(ddim), 3);
+  EXPECT_EQ(paddle::framework::arity(ddim), 3);

  // product of a DDim
-  EXPECT_EQ(majel::product(vddim), 45);
+  EXPECT_EQ(paddle::framework::product(vddim), 45);
 }

 TEST(DDim, Print) {
  // print a DDim
  std::stringstream ss;
-  majel::DDim ddim = majel::make_ddim({2, 3, 4});
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 3, 4});
  ss << ddim;
  EXPECT_EQ("2, 3, 4", ss.str());
 }
--- a/paddle/framework/dim.h
+++ b/paddle/framework/dim.h
@ -5,10 +5,11 @@
 #include <stdexcept>
 #include <type_traits>

-#include "paddle/majel/detail/cuda_assert.h"
-#include "paddle/majel/detail/hostdevice.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/hostdevice.h"

-namespace majel {
+namespace paddle {
+namespace framework {

 // Statically sized, statically indexed dimension
 template <int i>
@ -74,7 +75,7 @@ struct Dim<1> {
      throw std::invalid_argument("Index out of range.");
    }
 #else
-    MAJEL_ASSERT(idx < size.head);
+    PADDLE_ASSERT(idx < size.head);
 #endif
  }

@ -131,7 +132,7 @@ HOSTDEVICE int& indexer(Dim<D>& dim, int idx) {
    throw std::invalid_argument("Tried to access a negative dimension");
  }
 #else
-  MAJEL_ASSERT(idx >= 0);
+  PADDLE_ASSERT(idx >= 0);
 #endif
  if (idx == 0) {
    return dim.head;
@ -146,7 +147,7 @@ HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) {
    throw std::invalid_argument("Invalid index");
  }
 #else
-  MAJEL_ASSERT(idx == 0);
+  PADDLE_ASSERT(idx == 0);
 #endif
  return dim.head;
 }
@ -158,7 +159,7 @@ HOSTDEVICE int indexer(const Dim<D>& dim, int idx) {
    throw std::invalid_argument("Tried to access a negative dimension");
  }
 #else
-  MAJEL_ASSERT(idx >= 0);
+  PADDLE_ASSERT(idx >= 0);
 #endif
  if (idx == 0) {
    return dim.head;
@ -173,7 +174,7 @@ HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) {
    throw std::invalid_argument("Invalid index");
  }
 #else
-  MAJEL_ASSERT(idx == 0);
+  PADDLE_ASSERT(idx == 0);
 #endif
  return dim.head;
 }
@ -411,7 +412,7 @@ HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) {
 // XXX For some reason, overloading fails to resolve this correctly
 template <int i>
 typename std::enable_if<(i > 1), std::ostream&>::type operator<<(
-    std::ostream& os, const majel::Dim<i>& d) {
+    std::ostream& os, const Dim<i>& d) {
  os << d.head << ", " << d.tail;
  return os;
 }
@ -420,7 +421,7 @@ typename std::enable_if<(i > 1), std::ostream&>::type operator<<(
 // XXX I wish this could be an overload instead of a template
 template <int i>
 typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
-    std::ostream& os, const majel::Dim<i>& d) {
+    std::ostream& os, const Dim<i>& d) {
  os << d.head;
  return os;
 }
@ -448,4 +449,5 @@ HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
  return result;
 }

-}  // namespace majel
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/dim_test.cu
+++ b/paddle/framework/dim_test.cu
@ -0,0 +1,128 @@
+#include <thrust/device_vector.h>
+#include <sstream>
+
+#include "paddle/framework/dim.h"
+#include "gtest/gtest.h"
+
+__global__ void test(paddle::framework::Dim<2>* o) {
+    o[0] = paddle::framework::make_dim(5, 6);
+}
+
+__global__ void dyn_idx_gpu(int* o) {
+    auto d = paddle::framework::make_dim(5, 6);
+    o[0] = d[1];
+}
+
+TEST(Dim, Equality) {
+    // construct a Dim on the CPU
+    auto a = paddle::framework::make_dim(3, 4);
+    EXPECT_EQ(paddle::framework::get<0>(a), 3);
+    EXPECT_EQ(paddle::framework::get<1>(a), 4);
+
+    // construct a Dim on the GPU
+    thrust::device_vector<paddle::framework::Dim<2>> t(2);
+    test<<<1,1>>>(thrust::raw_pointer_cast(t.data()));
+    a = t[0];
+    EXPECT_EQ(paddle::framework::get<0>(a), 5);
+    EXPECT_EQ(paddle::framework::get<1>(a), 6);
+
+    // linearization
+    auto b = paddle::framework::make_dim(7, 8);
+    EXPECT_EQ(paddle::framework::linearize(a, b), 83);
+
+    // product
+    EXPECT_EQ(paddle::framework::product(a), 30);
+
+    // mutate a Dim
+    paddle::framework::get<1>(b) = 10;
+    EXPECT_EQ(paddle::framework::get<0>(b), 7);
+    EXPECT_EQ(paddle::framework::get<1>(b), 10);
+
+    // dynamic access
+    paddle::framework::get(b, 0) = 8;
+    b[1] = 11;
+    EXPECT_EQ(paddle::framework::get<0>(b), 8);
+    EXPECT_EQ(paddle::framework::get<1>(b), 11);
+    EXPECT_EQ(paddle::framework::get(b, 0), 8);
+    EXPECT_EQ(b[1], 11);
+
+    // dynamic access on GPU
+    thrust::device_vector<int> r(1);
+    dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data()));
+    int res = r[0];
+    EXPECT_EQ(res, 6);
+
+    // ex_prefix_mul
+    paddle::framework::Dim<3> c = paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 3);
+    EXPECT_EQ(paddle::framework::get<2>(c), 12);
+
+    // contiguous_strides
+    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 1, 10));
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 0);
+    EXPECT_EQ(paddle::framework::get<2>(c), 10);
+    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 10, 1));
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 10);
+    EXPECT_EQ(paddle::framework::get<2>(c), 0);
+    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(1, 10, 10));
+    EXPECT_EQ(paddle::framework::get<0>(c), 0);
+    EXPECT_EQ(paddle::framework::get<1>(c), 1);
+    EXPECT_EQ(paddle::framework::get<2>(c), 10);
+    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(2, 3, 4));
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 2);
+    EXPECT_EQ(paddle::framework::get<2>(c), 6);
+
+    // generate from an index
+    auto size = paddle::framework::make_dim(4, 5, 2);
+    c = paddle::framework::Dim<3>(14, size);
+    EXPECT_EQ(paddle::framework::get<0>(c), 2);
+    EXPECT_EQ(paddle::framework::get<1>(c), 3);
+    EXPECT_EQ(paddle::framework::get<2>(c), 0);
+    c = paddle::framework::Dim<3>(25, size);
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 1);
+    EXPECT_EQ(paddle::framework::get<2>(c), 1);
+}
+
+TEST(Dim, Bool) {
+    auto a = paddle::framework::make_dim(3, 4);
+    auto b = paddle::framework::make_dim(5, 6);
+    auto c = paddle::framework::make_dim(3, 4);
+
+    // in_bounds check
+    EXPECT_TRUE(paddle::framework::contained(a, b));
+    EXPECT_FALSE(paddle::framework::contained(b, a));
+
+    // comparison
+    EXPECT_TRUE(a == a);
+    EXPECT_FALSE(a == b);
+    EXPECT_TRUE(a == c);
+
+    // contiguous check
+    int x = 4, y = 5, z = 2;
+    paddle::framework::Dim<3> sizef(x, y, z);
+    paddle::framework::Dim<3> stridea(1, x, x*y);
+    paddle::framework::Dim<3> strideb(2, 2*x, 2*x*y);
+    paddle::framework::Dim<3> stridec(1, x, 2*x*y);
+    EXPECT_TRUE(paddle::framework::contiguous(sizef, stridea));
+    EXPECT_FALSE(paddle::framework::contiguous(sizef, strideb));
+    EXPECT_FALSE(paddle::framework::contiguous(sizef, stridec));
+}
+
+TEST(Dim, Print) {
+    {
+        std::stringstream ss;
+        auto a = paddle::framework::make_dim(2, 3);
+        ss << a;
+        EXPECT_EQ(ss.str(), "2, 3");
+    }
+    {
+        std::stringstream ss;
+        ss << paddle::framework::make_dim(8);
+        EXPECT_EQ(ss.str(), "8");
+    }
+}
--- a/paddle/framework/tensor.md
+++ b/paddle/framework/tensor.md
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@ -68,14 +68,12 @@ public:
    numOutputs_ = 1;
  }

-  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
-
  // input can be INPUT and INPUT_GRAD
  // filter can be FILTER and FILTER_GRAD
  // output can be OUTPUT and OUTPUT_GRAD
-  void check(const TensorShape& input,
-             const TensorShape& filter,
-             const TensorShape& output) {
+  void checkShape(const TensorShape& input,
+                  const TensorShape& filter,
+                  const TensorShape& output) {
    // inputs and outputs arguments should be 4-dimensional.
    CHECK_EQ(input.ndims(), (size_t)4);
    CHECK_EQ(output.ndims(), (size_t)4);
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@ -117,15 +117,23 @@ public:
    ConvFunctionBase::init(config);
  }

+  virtual void check(const BufferArgs& inputs,
+                     const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(numInputs_, inputs.size());
    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
    // TODO(hedaoyuan): Need to define some index macros,
    // to avoid using 0 and 1.
    const TensorShape& input = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& output = outputs[0].shape();
-    check(input, filter, output);

    real beta;
    if (outputs[0].getArgType() == ADD_TO) {
@ -209,16 +217,24 @@ public:
    ConvFunctionBase::init(config);
  }

+  virtual void check(const BufferArgs& inputs,
+                     const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(numInputs_, inputs.size());
    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
    // Since the implementation of Col2ImFunctor is ADD_TO,
    // this function only supports ADD_TO mode.
    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
    const TensorShape& output = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& input = outputs[0].shape();
-    check(input, filter, output);

    size_t batchSize = input[0];
    size_t inputChannels = input[1];
@ -295,13 +311,21 @@ public:
    ConvFunctionBase::init(config);
  }

+  virtual void check(const BufferArgs& inputs,
+                     const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(numInputs_, inputs.size());
    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
    const TensorShape& output = inputs[0].shape();
    const TensorShape& input = inputs[1].shape();
    const TensorShape& filter = outputs[0].shape();
-    check(input, filter, output);

    real beta;
    if (outputs[0].getArgType() == ADD_TO) {
--- a/paddle/function/NaiveConvOp.cpp
+++ b/paddle/function/NaiveConvOp.cpp
@ -54,8 +54,8 @@ public:
                  T inValue;
                  const int inH = inStartH + fH;
                  const int inW = inStartW + fW;
-                  if ((inH >= 0 && inH < inputHeight) &&
-                      (inW >= 0 && inW < inputWidth)) {
+                  if ((inH >= 0 && inH < (int)inputHeight) &&
+                      (inW >= 0 && inW < (int)inputWidth)) {
                    size_t offsetInput =
                        batch * inputChannels * inputHeight * inputWidth +
                        inC * inputHeight * inputWidth + inH * inputWidth + inW;
@ -90,14 +90,19 @@ public:
    ConvFunctionBase::init(config);
  }

-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
+  virtual void check(const BufferArgs& inputs,
+                     const BufferArgs& outputs) override {
    const TensorShape& input = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& output = outputs[0].shape();
-    check(input, filter, output);
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    check(inputs, outputs);

    size_t batchSize = inputs[0].shape()[0];
    size_t inputChannels = inputs[0].shape()[1];
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@ -284,6 +284,16 @@ public:
  }

 protected:
+  std::vector<Argument::SeqInfo> commonSeqInfo_;
+  ICpuGpuVectorPtr sequenceStartPositions_;
+  void calcSequenceStartPositions();
+  void checkInputConsistency(int inlinkId,
+                             const std::vector<Argument::SeqInfo>& seqInfo);
+  void reorganizeInput(PassType passType);
+  void reorganizeOutput(PassType passType);
+  void connectFrames(PassType passType);
+  void calcNumSequencesAtEachStep();
+
  void resizeOrCreateFrames(int numFrames);
  void resizeBootFrame(int numSequences);

@ -295,8 +305,7 @@ protected:
    std::string linkName;
    LayerPtr inLayer;
    std::vector<LayerPtr> agents;  // Scatter Agents to reform batch input
-    bool hasSubseq;
-    Argument outArg;  // scatter output argument
+    Argument outArg;               // scatter output argument
  };
  std::vector<InFrameLine> inFrameLines_;

@ -318,7 +327,6 @@ protected:
    std::vector<LayerPtr> agents;
    std::vector<LayerPtr> scatterAgents;  // scatter agent used by beam search
    Argument outArg;                      // scatter output argument
-    bool is_sequence;
    // Different memoryFrameLine have different element as follows
    IVectorPtr allIds;  // scattered id of realLayer
    ICpuGpuVectorPtr
@ -330,22 +338,27 @@ protected:
  // and all outFrameLines(outlinks) share the info with one inFrameLine,
  // which is assigned by targetInfoInlinkId_.
  struct Info {
-    IVectorPtr allIds;         // scattered id of realLayer
-    std::vector<int> idIndex;  // index of allIds
+    // The original positions in the original batch
+    IVectorPtr allIds;  // scattered id of realLayer [batchSize]
+
+    // index of allIds for each step [maxSequenceLength_]
+    // idIndex[i] is the total length of the first i sequences
+    std::vector<int> idIndex;
+
    ICpuGpuVectorPtr
        sequenceStartPositions;         // scattered sequenceStartPositions
    std::vector<int> seqStartPosIndex;  // index of sequenceStartPositions
  };
-  std::vector<Info> info_;
+  std::vector<Info> info_;  // for input

  // numSeqs_[i] is the number of sequences which are longer than i (for
  // sequence data) or have more than i subsequences (for subsequence data)
+  // Equivalently, numSeqs_[i] is the number of sequences at step i;
  std::vector<int> numSeqs_;

  std::vector<std::vector<Argument::SeqInfo>> seqInfos_;

-  // the id of inlink which share info with outlinks
-  int targetInfoInlinkId_;
+  void checkOutputConsistency(OutFrameLine& outFrameLine);

  /* create scattered id information for all realLayer of inFrameLines one time.
   *  If hasSubseq, will also create scattered sequenceStartPositions information
@ -354,6 +367,28 @@ protected:
  void createInFrameInfo(int inlinks_id,
                         const Argument& input,
                         PassType passType);
+  void createInFrameInfo_nonseq(int inlinks_id,
+                                const Argument& input,
+                                PassType passType);
+  void createInFrameInfo_seq(int inlinks_id,
+                             const Argument& input,
+                             PassType passType);
+  void createInFrameInfo_subseq(int inlinks_id,
+                                const Argument& input,
+                                PassType passType);
+
+  void createOutFrameInfo(OutFrameLine& outFrameLine,
+                          Info& info,
+                          ICpuGpuVectorPtr& sequenceStartPositions,
+                          ICpuGpuVectorPtr& subSequenceStartPositions);
+  void createOutFrameInfo_seq(OutFrameLine& outFrameLine,
+                              Info& info,
+                              ICpuGpuVectorPtr& sequenceStartPositions,
+                              ICpuGpuVectorPtr& subSequenceStartPositions);
+  void createOutFrameInfo_subseq(OutFrameLine& outFrameLine,
+                                 Info& info,
+                                 ICpuGpuVectorPtr& sequenceStartPositions,
+                                 ICpuGpuVectorPtr& subSequenceStartPositions);

  void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine,
                             PassType passType);
@ -386,9 +421,7 @@ protected:
  NeuralNetwork* rootNetwork_;
  bool reversed_;

-  // if hasSubseq: max number of sentences(subseq)in batchsize samples
-  // else: max number of tokens in batchsize samples(sentences)
-  int maxSequenceLength_;
+  int maxSequenceLength_;  // Max top-level length
  bool useGpu_;
  bool stopBeamSearch_;

--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@ -36,14 +36,23 @@ void AgentLayer::forward(PassType passType) {
  Layer::forward(passType);

  Argument& realOutput = realLayer_->getOutput();
-  int realHeight = realOutput.getBatchSize();
-  CHECK_LE(numSamples_, realHeight);
+  int realNumSequences = realOutput.getNumSequences();
+  CHECK_LE(numSamples_, realNumSequences);

  // get Arguments from real layers
-  if (numSamples_ > 0 && numSamples_ < realHeight) {
-    if (realOutput.ids) {
-      output_.ids =
-          IVector::create(realOutput.ids->getData(), numSamples_, useGpu_);
+  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
+    if (realOutput.hasSeq()) {
+      int numRows =
+          realOutput.sequenceStartPositions->getData(false)[numSamples_];
+      output_.subArgFrom(realOutput,
+                         /* offset */ 0,
+                         numRows,
+                         getSize(),
+                         useGpu_,
+                         /* trans */ false,
+                         /* seqFlag */ true,
+                         /* seqStart */ 0,
+                         /* seqSize */ numSamples_ + 1);
    } else {
      output_.subArgFrom(
          realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_);
@ -53,34 +62,6 @@ void AgentLayer::forward(PassType passType) {
  }
 }

-void SequenceAgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  Argument& realOutput = realLayer_->getOutput();
-  int realNumSequences = realOutput.getNumSequences();
-  CHECK_LE(numSamples_, realNumSequences);
-
-  // get Arguments from real layers
-  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
-    int numRows =
-        realOutput.sequenceStartPositions->getData(false)[numSamples_];
-    CHECK(!realOutput.ids) << "Not supported";
-    output_.subArgFrom(realOutput,
-                       /* offset */ 0,
-                       numRows,
-                       getSize(),
-                       useGpu_,
-                       /* trans */ false,
-                       /* seqFlag */ true,
-                       /* seqStart */ 0,
-                       /* seqSize */ numSamples_ + 1);
-  } else {
-    output_ = realOutput;
-  }
-}
-
-REGISTER_LAYER(sequence_agent, SequenceAgentLayer);
-
 bool GatherAgentLayer::init(const LayerMap& layerMap,
                            const ParameterMap& parameterMap) {
  CHECK_EQ(config_.inputs_size(), 0);
@ -91,18 +72,26 @@ bool GatherAgentLayer::init(const LayerMap& layerMap,
  return true;
 }

-void GatherAgentLayer::copyIdAndSequenceInfo(const Argument& input,
-                                             const IVectorPtr& ids,
-                                             const std::vector<int>& idIndex) {
-  output_.sequenceStartPositions = input.sequenceStartPositions;
-  output_.subSequenceStartPositions = input.subSequenceStartPositions;
-  realLayers_.clear();
+void GatherAgentLayer::copyIdAndSequenceInfo(
+    ICpuGpuVectorPtr sequenceStartPositions,
+    ICpuGpuVectorPtr subSequenceStartPositions,
+    const IVectorPtr& ids,
+    const std::vector<int>& idIndex) {
+  output_.sequenceStartPositions = sequenceStartPositions;
+  output_.subSequenceStartPositions = subSequenceStartPositions;
  allIds_ = ids;
  idIndex_ = idIndex;
 }

 void GatherAgentLayer::forward(PassType passType) {
  Layer::forward(passType);
+  forwardIds(passType);
+  forwardValue(passType);
+}
+
+void GatherAgentLayer::forwardValue(PassType passType) {
+  MatrixPtr valueReal = realLayers_[0]->getOutputValue();
+  if (!valueReal) return;

  int height = allIds_->getSize();
  int width = this->getSize();
@ -147,7 +136,9 @@ void ScatterAgentLayer::forward(PassType passType) {
  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());

  int width = this->getSize();
-  if (realOutArg_.value || realOutArg_.ids) {
+  if (realOutArg_.hasSeq()) {
+    forwardSequence(passType);
+  } else if (realOutArg_.value || realOutArg_.ids) {
    output_.subArgFrom(
        realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_);
  } else {  // used in generation
@ -174,7 +165,7 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) {
  if (realGrad) {
    // for agent in inFrameLines and memoryFrameLines,
    // only first scatterAgentLayer should do addToRows in backward
-    if (idIndex_ == 0) {
+    if (handleBackward_) {
      outputGrad->addToRows(*realGrad, *ids_);
    }
  }
@ -183,12 +174,14 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) {
 REGISTER_LAYER(gather_agent, GatherAgentLayer);
 REGISTER_LAYER(scatter_agent, ScatterAgentLayer);

-void SequenceGatherAgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
+void GatherAgentLayer::forwardIds(PassType passType) {
  int height = 0;
-  int* starts = output_.subSequenceStartPositions->getMutableData(false);
  IVectorPtr idReal = realLayers_[0]->getOutputLabel();
-  if (idReal) {
+
+  if (!idReal) return;
+
+  if (output_.subSequenceStartPositions) {
+    int* starts = output_.subSequenceStartPositions->getMutableData(false);
    // Gather generator.idsVec
    // if is beam search generation result. Get first result.
    if (idReal->getData()[idReal->getSize() - 1] == -1) {
@ -212,13 +205,11 @@ void SequenceGatherAgentLayer::forward(PassType passType) {
          ->copyFrom(*realLayers_[i]->getOutputLabel());
    }
  } else {
-    // Gather output.value, same as GatherAgentLayer
-    CHECK(output_.subSequenceStartPositions);
-    GatherAgentLayer::forward(passType);
+    LOG(FATAL) << "Not implemented";
  }
 }

-void SequenceScatterAgentLayer::forward(PassType passType) {
+void ScatterAgentLayer::forwardSequence(PassType passType) {
  Layer::forward(passType);
  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());

@ -241,6 +232,7 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
                       /* seqStart */ seqStartPosIndex_,
                       /* seqSize */ numSequences_);
  } else {
+    // Putting the generation logic here is really an ugly hack!
    // used in generation
    int height = 0;
    size_t numSequences = ids_->getSize();
@ -284,7 +276,4 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
  }
 }

-REGISTER_LAYER(sequence_gather_agent, SequenceGatherAgentLayer);
-REGISTER_LAYER(sequence_scatter_agent, SequenceScatterAgentLayer);
-
 }  // namespace paddle
--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@ -49,18 +49,6 @@ public:
  void backward(const UpdateCallback& callback = nullptr) override {}
 };

-/**
- * like AgentLayer, but use first *numSamples* sequences
- */
-class SequenceAgentLayer : public AgentLayer {
-public:
-  explicit SequenceAgentLayer(const LayerConfig& config) : AgentLayer(config) {}
-  ~SequenceAgentLayer() {}
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override {}
-};
-
 /**
 * Like AgentLayer, but it can gather many real layers. Each real
 * layer give a few rows of a sequence, after gather all real layers,
@ -83,7 +71,10 @@ public:
            const ParameterMap& parameterMap) override;

  // call before addRealLayer
-  void copyIdAndSequenceInfo(const Argument& input,
+  void clearRealLayers() { realLayers_.clear(); }
+
+  void copyIdAndSequenceInfo(ICpuGpuVectorPtr sequenceStartPositions,
+                             ICpuGpuVectorPtr subSequenceStartPositions,
                             const IVectorPtr& allIds,
                             const std::vector<int>& idIndex);

@ -92,24 +83,8 @@ public:

  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback) override;
-};
-
-/**
- * Like GatherAgentLayer, but select a few sequence in real layer.
- * *ids* in addRealLayer() are the ids of selected sequence.
- * It's used to reorder sequence output.
- */
-class SequenceGatherAgentLayer : public GatherAgentLayer {
-public:
-  explicit SequenceGatherAgentLayer(const LayerConfig& config)
-      : GatherAgentLayer(config) {}
-  virtual ~SequenceGatherAgentLayer() {}
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {
-    // same as GatherAgentLayer
-    GatherAgentLayer::backward(callback);
-  }
+  void forwardValue(PassType passType);
+  void forwardIds(PassType passType);
 };

 /**
@ -129,6 +104,11 @@ protected:
  int idSize_;
  int seqStartPosIndex_;
  int numSequences_;  // number of sequences in this scatterAgentLayer
+  bool handleBackward_;
+
+  // Used to store the expanded cpuStartPositions or subSequenceStartPositions
+  // of the real layer.
+  ICpuGpuVectorPtr inputStartPos_;

 public:
  explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
@ -147,19 +127,15 @@ public:
   *                        false(default) in ScatterAgentLayer, and
   *                        true in SequenceScatterAgentLayer.
   */
-  void setRealLayer(LayerPtr layer,
-                    const std::vector<int>& ids,
-                    bool copyId = false) {
+  void setRealLayer(LayerPtr layer, const std::vector<int>& ids) {
    realLayer_ = layer;
    IVector::resizeOrCreate(ids_, ids.size(), useGpu_);
    ids_->copyFrom(ids.data(), ids.size());
-    if (copyId) {
-      if (useGpu_) {
-        IVector::resizeOrCreate(cpuIds_, ids.size(), false);
-        cpuIds_->copyFrom(ids.data(), ids.size());
-      } else {
-        cpuIds_ = ids_;
-      }
+    if (useGpu_) {
+      IVector::resizeOrCreate(cpuIds_, ids.size(), false);
+      cpuIds_->copyFrom(ids.data(), ids.size());
+    } else {
+      cpuIds_ = ids_;
    }
  }

@ -169,12 +145,14 @@ public:
                             const Argument& outArg,
                             const IVectorPtr& ids,
                             int idIndex,
-                             int idSize) {
+                             int idSize,
+                             bool handleBackward) {
    realLayer_ = layer;
    realOutArg_ = outArg;
    ids_ = ids;
    idIndex_ = idIndex;
    idSize_ = idSize;
+    handleBackward_ = handleBackward;
  }

  void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions,
@ -187,28 +165,8 @@ public:

  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback) override;
-};

-/**
- * Like ScatterAgentLayer, but select a few sequence in real layer.
- * *ids* in setRealLayer() or setRealLayerAndOutput() are the ids of
- * selected sequence. It's used to reorder sequence input.
- */
-class SequenceScatterAgentLayer : public ScatterAgentLayer {
-protected:
-  // use to store expanded cpuStartPositions or subSequenceStartPositions
-  // of real layer.
-  ICpuGpuVectorPtr inputStartPos_;
-
-public:
-  explicit SequenceScatterAgentLayer(const LayerConfig& config)
-      : ScatterAgentLayer(config) {}
-  virtual ~SequenceScatterAgentLayer() {}
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {
-    ScatterAgentLayer::backward(callback);
-  }
+  void forwardSequence(PassType passType);
 };

 }  // namespace paddle
--- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp
+++ b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
@ -40,6 +40,7 @@ namespace paddle {
 class FeatureMapExpandLayer : public Layer {
 private:
  int numFilters_;
+  bool asRowVector_;

 public:
  explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {}
@ -62,6 +63,7 @@ bool FeatureMapExpandLayer::init(const LayerMap& layerMap,

  CHECK_EQ(inputLayers_.size(), 1UL);
  numFilters_ = config_.num_filters();
+  asRowVector_ = config_.user_arg() != "as_col_vec";
  return true;
 }

@ -76,16 +78,30 @@ void FeatureMapExpandLayer::forward(PassType passType) {

  {
    AsyncGpuBlock asyncGpuBlock;
-    for (size_t i = 0; i < batchSize; i++) {
-      MatrixPtr outVTmp =
-          Matrix::create(outputV->getData() + i * imgSize * numFilters_,
-                         numFilters_,
-                         imgSize,
-                         false,
-                         useGpu_);
-      MatrixPtr inVTmp = Matrix::create(
-          inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_);
-      outVTmp->addRowVector(*inVTmp);
+    if (asRowVector_) {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outVTmp =
+            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
+                           numFilters_,
+                           imgSize,
+                           false,
+                           useGpu_);
+        MatrixPtr inVTmp = Matrix::create(
+            inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_);
+        outVTmp->addRowVector(*inVTmp);
+      }
+    } else {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outVTmp =
+            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
+                           imgSize,
+                           numFilters_,
+                           false,
+                           useGpu_);
+        MatrixPtr inVTmp = Matrix::create(
+            inputV->getData() + i * imgSize, imgSize, 1, false, useGpu_);
+        outVTmp->addColVector(*inVTmp);
+      }
    }
  }
  /* activation */ {
@ -102,24 +118,38 @@ void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
  MatrixPtr outGrad = getOutputGrad();
  size_t batchSize = getInput(0).getBatchSize();
  int imgSize = inGrad->getWidth();
+  /* Backward through activation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
  {
    AsyncGpuBlock asyncGpuBlock;
-    for (size_t i = 0; i < batchSize; i++) {
-      MatrixPtr outGradTmp =
-          Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
-                         numFilters_,
-                         imgSize,
-                         false,
-                         useGpu_);
-      MatrixPtr inGradTmp = Matrix::create(
-          inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_);
-      inGradTmp->collectBias(*outGradTmp, 1);
+    if (asRowVector_) {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outGradTmp =
+            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
+                           numFilters_,
+                           imgSize,
+                           false,
+                           useGpu_);
+        MatrixPtr inGradTmp = Matrix::create(
+            inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_);
+        inGradTmp->collectBias(*outGradTmp, 1);
+      }
+    } else {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outGradTmp =
+            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
+                           imgSize,
+                           numFilters_,
+                           false,
+                           useGpu_);
+        MatrixPtr inGradTmp = Matrix::create(
+            inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_);
+        inGradTmp->sumRows(*outGradTmp, 1, 1);
+      }
    }
  }
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
 }

 }  // namespace paddle.
--- a/Show More
+++ b/Show More