Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-dist-sparse-decay
test=developrevert-15207-remove_op_handle_lock_and_fix_var
commit
d0e3b24002
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,130 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
|
||||
#define PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
|
||||
|
||||
#include <memory>
#include <string>
#include <vector>

#include <boost/algorithm/string/predicate.hpp>
|
||||
|
||||
#include "paddle/fluid/framework/ir/graph.h"
|
||||
#include "paddle/fluid/framework/ir/pass.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace framework {
|
||||
namespace ir {
|
||||
|
||||
class Node;
|
||||
|
||||
/*
 * Remove the sum op of all gradients of the backward op.
 * And remove the dependencies of the optimizer related to the
 * same backward op.
 *
 * Before this pass:
 *
 * forward_op1   forward_op2
 *     |             |
 *  grad_op1     grad_op2
 *        \       /
 *         \     /
 *         sum_op
 *           |
 *         sgd_op
 *
 * After this pass:
 * forward_op1   forward_op2
 *     |             |
 *  grad_op1     grad_op2
 *     |             |
 *  sgd_op1       sgd_op2
 *
 * sgd_op1 and sgd_op2 will update the same weight which holds the same
 * memory, so we could benefit from the acceleration
 */
|
||||
class LockFreeOptimizePass : public Pass {
|
||||
public:
|
||||
virtual ~LockFreeOptimizePass() {}
|
||||
|
||||
protected:
|
||||
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
|
||||
|
||||
private:
|
||||
// Create a new sgd node via current optimizer node
|
||||
ir::Node* CreateNewSGDNode(ir::Graph* graph, ir::Node* forward_node,
|
||||
ir::Node* backward_node, ir::Node* grad_sum_node,
|
||||
ir::Node* optimize_node) const;
|
||||
|
||||
// Replace the input weight's optimizers
|
||||
void ReplaceUpstreamNode(ir::Node* upstream_node,
|
||||
ir::Node* old_optimizer_node,
|
||||
ir::Node* new_optimizer_node) const;
|
||||
|
||||
// Replace the output weight's optimizers
|
||||
void ReplaceAllDownstreamNode(ir::Node* old_optimizer_node,
|
||||
ir::Node* new_optimizer_node) const;
|
||||
|
||||
// Find all weight variables in graph
|
||||
bool FindAllWeightVars(ir::Graph* graph) const;
|
||||
|
||||
// Find the forward_op node via the backward_op node
|
||||
ir::Node* FindForwardOpViaBackwardOp(ir::Graph* graph,
|
||||
ir::Node* backward_node) const;
|
||||
|
||||
std::vector<ir::Node*> FindConnectedNode(ir::Node* upstream_node,
|
||||
ir::Node* downstream_node) const;
|
||||
|
||||
inline bool IsOpNamed(ir::Node* node, const std::string& name) const {
|
||||
PADDLE_ENFORCE(node);
|
||||
|
||||
return node->NodeType() == Node::Type::kOperation && node->Name() == name;
|
||||
}
|
||||
|
||||
inline bool IsVarNamed(ir::Node* node, const std::string& name) const {
|
||||
PADDLE_ENFORCE(node);
|
||||
|
||||
return node->NodeType() == Node::Type::kVariable && node->Name() == name;
|
||||
}
|
||||
|
||||
inline bool IsVarNameEndsWith(ir::Node* node, const std::string& name) const {
|
||||
PADDLE_ENFORCE(node);
|
||||
|
||||
return node->NodeType() == Node::Type::kVariable &&
|
||||
boost::algorithm::ends_with(node->Name(), name);
|
||||
}
|
||||
|
||||
inline bool IsVarNameContains(ir::Node* node, const std::string& name) const {
|
||||
PADDLE_ENFORCE(node);
|
||||
|
||||
return node->NodeType() == Node::Type::kVariable &&
|
||||
node->Name().find(name) != std::string::npos;
|
||||
}
|
||||
|
||||
inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const {
|
||||
PADDLE_ENFORCE(ctrl_dep_node);
|
||||
PADDLE_ENFORCE(node);
|
||||
|
||||
return IsControlDepVar(*ctrl_dep_node) &&
|
||||
ctrl_dep_node->inputs.size() >= 1u &&
|
||||
ctrl_dep_node->inputs[0] == node;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ir
|
||||
} // namespace framework
|
||||
} // namespace paddle
|
||||
|
||||
#endif // PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
|
@ -0,0 +1,85 @@
|
||||
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License. */
|
||||
|
||||
#include "paddle/fluid/operators/jit/gen/seqpool.h"
|
||||
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
|
||||
#include "paddle/fluid/operators/jit/registry.h"
|
||||
#include "paddle/fluid/platform/cpu_info.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
namespace jit {
|
||||
namespace gen {
|
||||
|
||||
// Emit the sequence-pool kernel: accumulates reg32_int_h rows of w_ floats
// (row stride w_ * sizeof(float)) into one output row, optionally scaling
// by 1/h (avg) or 1/sqrt(h) (sqrt).
void SeqPoolJitCode::genCode() {
  constexpr int block = YMM_FLOAT_BLOCK;
  constexpr int max_num_regs = 8;
  // Split the row into full YMM-wide blocks, processed in groups of at
  // most max_num_regs accumulator registers.
  const int num_block = w_ / block;
  const int num_groups = num_block / max_num_regs;
  int rest_num_regs = num_block % max_num_regs;
  // Height attribute is the first dword at param_attr.
  mov(reg32_int_h, dword[param_attr]);
  if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
    // Precompute the scale factor into fp_h_: 1/h for avg, 1/sqrt(h) for
    // sqrt pooling; pool_height broadcasts it after summation.
    mov(reg_tmp, reinterpret_cast<size_t>(exp_float_consts));
    vmovups(xmm_t(1), ptr[reg_tmp + OFFSET_EXP_ONE]);
    mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
    // Convert the integer height to float through the x87 stack.
    fild(dword[param_attr]);
    fstp(dword[reg_tmp]);
    vmovss(xmm_t(0), ptr[reg_tmp]);
    if (type_ == SeqPoolType::kSqrt) {
      vsqrtps(xmm_t(0), xmm_t(0));
    }
    vdivps(xmm_t(1), xmm_t(1), xmm_t(0));
    vmovss(ptr[reg_tmp], xmm_t(1));
  }
  // Pool the full-width part of each row, group by group.
  const int group_len = max_num_regs * block * sizeof(float);
  for (int g = 0; g < num_groups; ++g) {
    pool_height<ymm_t>(g * group_len, block, max_num_regs);
  }
  if (rest_num_regs > 0) {
    pool_height<ymm_t>(num_groups * group_len, block, rest_num_regs);
  }
  // part of rest_w * height
  const int rest = w_ % block;
  pool_height_of_rest_width(rest, (w_ - rest) * sizeof(float), max_num_regs);
  ret();
}
|
||||
|
||||
// Factory that builds the JIT-generated sequence-pool kernel.
class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
 public:
  // The generated code is usable whenever the CPU supports AVX.
  bool UseMe(const seq_pool_attr_t& attr) const override {
    return platform::MayIUse(platform::avx);
  }
  // Upper-bound estimate of the emitted code size in bytes.
  size_t CodeSize(const seq_pool_attr_t& attr) const override {
    const int num_blocks = attr.w / YMM_FLOAT_BLOCK + 4;  // +4 for the rest
    const int insts_per_block = 4;                        // load, mul and save
    return 96 + (num_blocks * insts_per_block + 256) * 8;
  }
  // Validate the attribute and construct the jit code object.
  std::unique_ptr<GenBase> CreateJitCode(
      const seq_pool_attr_t& attr) const override {
    PADDLE_ENFORCE_GT(attr.w, 0);
    PADDLE_ENFORCE_GT(attr.h, 0);
    return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
  }
};
|
||||
|
||||
} // namespace gen
|
||||
} // namespace jit
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
||||
|
||||
// Short alias for the generated-kernel namespace used by the macro below.
namespace gen = paddle::operators::jit::gen;

// Register the JIT-generated sequence-pool kernel creator under kSeqPool.
REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator);
|
@ -0,0 +1,214 @@
|
||||
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include "glog/logging.h"
|
||||
#include "paddle/fluid/operators/jit/gen/jitcode.h"
|
||||
#include "paddle/fluid/platform/enforce.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
namespace jit {
|
||||
namespace gen {
|
||||
|
||||
class SeqPoolJitCode : public JitCode {
|
||||
public:
|
||||
explicit SeqPoolJitCode(const seq_pool_attr_t& attr,
|
||||
size_t code_size = 256 * 1024,
|
||||
void* code_ptr = nullptr)
|
||||
: JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) {
|
||||
if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg ||
|
||||
type_ == SeqPoolType::kSqrt)) {
|
||||
LOG(FATAL) << "Only support sum pool yet ";
|
||||
}
|
||||
fp_h_[0] = 1.f;
|
||||
this->genCode();
|
||||
}
|
||||
|
||||
virtual const char* name() const {
|
||||
std::string base = "SeqPoolJitCode";
|
||||
if (type_ == SeqPoolType::kSum) {
|
||||
base += "_Sum";
|
||||
} else if (type_ == SeqPoolType::kAvg) {
|
||||
base += "_Avg";
|
||||
} else if (type_ == SeqPoolType::kSqrt) {
|
||||
base += "_Sqrt";
|
||||
}
|
||||
base += ("_W" + std::to_string(w_));
|
||||
return base.c_str();
|
||||
}
|
||||
void genCode() override;
|
||||
|
||||
protected:
|
||||
template <typename JMM>
|
||||
void pool_height(int w_offset, int block, int max_num_regs) {
|
||||
int offset = w_offset;
|
||||
for (int i = 0; i < max_num_regs; ++i) {
|
||||
vmovups(JMM(i), ptr[param_src + offset]);
|
||||
offset += sizeof(float) * block;
|
||||
}
|
||||
cmp(reg32_int_h, 1);
|
||||
Label l_next_h, l_h_done;
|
||||
jle(l_h_done, T_NEAR);
|
||||
mov(reg_h_i, 1);
|
||||
mov(reg_tmp, param_src);
|
||||
add(reg_tmp, w_ * sizeof(float) + w_offset);
|
||||
L(l_next_h);
|
||||
{
|
||||
mov(reg_ptr_src_i, reg_tmp);
|
||||
for (int i = 0; i < max_num_regs; ++i) {
|
||||
vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]);
|
||||
// sum anyway
|
||||
vaddps(JMM(i), JMM(i), JMM(i + max_num_regs));
|
||||
add(reg_ptr_src_i, sizeof(float) * block);
|
||||
}
|
||||
inc(reg_h_i);
|
||||
add(reg_tmp, w_ * sizeof(float));
|
||||
cmp(reg_h_i, reg32_int_h);
|
||||
jl(l_next_h, T_NEAR);
|
||||
}
|
||||
L(l_h_done);
|
||||
// save right now
|
||||
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
|
||||
mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
|
||||
vbroadcastss(JMM(max_num_regs), ptr[reg_tmp]);
|
||||
}
|
||||
offset = w_offset;
|
||||
for (int i = 0; i < max_num_regs; ++i) {
|
||||
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
|
||||
vmulps(JMM(i), JMM(i), JMM(max_num_regs));
|
||||
}
|
||||
vmovups(ptr[param_dst + offset], JMM(i));
|
||||
offset += sizeof(float) * block;
|
||||
}
|
||||
}
|
||||
|
||||
void pool_height_of_rest_width(int rest, int w_offset, int max_num_regs) {
|
||||
const int rest_used_num_regs = load_rest(rest, w_offset, 0);
|
||||
const bool has_block4 = rest / 4 > 0;
|
||||
const bool has_block2 = (rest % 4) / 2 > 0;
|
||||
const bool has_block1 = (rest % 2) == 1;
|
||||
cmp(reg32_int_h, 1);
|
||||
Label l_next_h, l_h_done;
|
||||
jle(l_h_done, T_NEAR);
|
||||
mov(reg_h_i, 1);
|
||||
mov(reg_tmp, param_src);
|
||||
add(reg_tmp, w_ * sizeof(float) + w_offset);
|
||||
L(l_next_h);
|
||||
{
|
||||
int reg_idx = 0;
|
||||
mov(reg_ptr_src_i, reg_tmp);
|
||||
if (has_block4) {
|
||||
vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
|
||||
add(reg_ptr_src_i, sizeof(float) * 4);
|
||||
reg_idx++;
|
||||
}
|
||||
if (has_block2) {
|
||||
vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
|
||||
add(reg_ptr_src_i, sizeof(float) * 2);
|
||||
reg_idx++;
|
||||
}
|
||||
if (has_block1) {
|
||||
vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
|
||||
reg_idx++;
|
||||
}
|
||||
PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs,
|
||||
"All heights should use same regs");
|
||||
for (int i = 0; i < reg_idx; ++i) {
|
||||
vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
|
||||
}
|
||||
inc(reg_h_i);
|
||||
add(reg_tmp, w_ * sizeof(float));
|
||||
cmp(reg_h_i, reg32_int_h);
|
||||
jl(l_next_h, T_NEAR);
|
||||
}
|
||||
L(l_h_done);
|
||||
// save right now
|
||||
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
|
||||
mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
|
||||
vbroadcastss(xmm_t(max_num_regs), ptr[reg_tmp]);
|
||||
for (int i = 0; i < rest_used_num_regs; ++i) {
|
||||
vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs));
|
||||
}
|
||||
}
|
||||
save_rest(rest, w_offset);
|
||||
}
|
||||
|
||||
// return the number of used regs, use start from reg 0
|
||||
int load_rest(int rest, int w_offset, const int num_shift_regs,
|
||||
const int reg_start = 0) {
|
||||
const bool has_block4 = rest / 4 > 0;
|
||||
const bool has_block2 = (rest % 4) / 2 > 0;
|
||||
const bool has_block1 = (rest % 2) == 1;
|
||||
int reg_idx = reg_start;
|
||||
if (has_block4) {
|
||||
vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
|
||||
w_offset += sizeof(float) * 4;
|
||||
reg_idx++;
|
||||
}
|
||||
if (has_block2) {
|
||||
vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
|
||||
w_offset += sizeof(float) * 2;
|
||||
reg_idx++;
|
||||
}
|
||||
if (has_block1) {
|
||||
vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
|
||||
reg_idx++;
|
||||
}
|
||||
return reg_idx;
|
||||
}
|
||||
|
||||
// use reg start from 0
|
||||
void save_rest(int rest, int w_offset, int reg_start = 0) {
|
||||
const bool has_block4 = rest / 4 > 0;
|
||||
const bool has_block2 = (rest % 4) / 2 > 0;
|
||||
const bool has_block1 = (rest % 2) == 1;
|
||||
int reg_idx = reg_start;
|
||||
if (has_block4) {
|
||||
vmovups(ptr[param_dst + w_offset], xmm_t(reg_idx));
|
||||
w_offset += sizeof(float) * 4;
|
||||
reg_idx++;
|
||||
}
|
||||
if (has_block2) {
|
||||
vmovq(ptr[param_dst + w_offset], xmm_t(reg_idx));
|
||||
w_offset += sizeof(float) * 2;
|
||||
reg_idx++;
|
||||
}
|
||||
if (has_block1) {
|
||||
vmovss(ptr[param_dst + w_offset], xmm_t(reg_idx));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
float ALIGN32_BEG fp_h_[1] ALIGN32_END;
|
||||
int w_;
|
||||
SeqPoolType type_;
|
||||
reg64_t param_src{abi_param1};
|
||||
reg64_t param_dst{abi_param2};
|
||||
reg64_t param_attr{abi_param3};
|
||||
reg64_t reg_tmp{rax};
|
||||
|
||||
reg32_t reg32_int_h{r8d};
|
||||
reg32_t reg32_fp_h{r9d};
|
||||
|
||||
reg64_t reg_h_i{r10};
|
||||
reg64_t reg_ptr_src_i{r11};
|
||||
};
|
||||
|
||||
} // namespace gen
|
||||
} // namespace jit
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue