Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-dist-sparse-decay

test=develop
7 years ago · d0e3b24002
parent c3b9edf958 223cc89f0b
commit d0e3b24002
47 changed files with 1239 additions and 197 deletions
--- a/cmake/FindJeMalloc.cmake
+++ b/cmake/FindJeMalloc.cmake
@ -19,3 +19,10 @@ find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALL
 mark_as_advanced(
  JEMALLOC_LIBRARIES
  JEMALLOC_INCLUDE_DIR)
 # Expose the located jemalloc library as the imported target
 # jemalloc::jemalloc so consumers can link the target instead of the raw
 # JEMALLOC_LIBRARIES / JEMALLOC_INCLUDE_DIR variables.
 # The TARGET guard keeps this module safe to include more than once:
 # add_library() fails if the target already exists.
 if (JEMALLOC_FOUND AND NOT TARGET jemalloc::jemalloc)
  add_library(jemalloc::jemalloc UNKNOWN IMPORTED)
  # Both values are quoted so that an empty or list-valued variable cannot be
  # split into extra property name/value arguments.
  set_target_properties(jemalloc::jemalloc PROPERTIES
    IMPORTED_LOCATION "${JEMALLOC_LIBRARIES}"
    INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}")
 endif()
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@ -2,9 +2,11 @@ if(NOT WITH_GPU)
    return()
 endif()
-set(paddle_known_gpu_archs "30 35 50 52 60 61 70 75")
+set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
 set(paddle_known_gpu_archs7 "30 35 50 52")
 set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
 set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
 set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
 ######################################################################################
 # A function for automatic detection of GPUs installed  (if autodetection is enabled)
@ -155,6 +157,16 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
  # warning for now.
  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
  add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
 elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
  add_definitions("-DPADDLE_CUDA_BINVER=\"90\"")
 elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
  add_definitions("-DPADDLE_CUDA_BINVER=\"100\"")
 endif()
 include_directories(${CUDA_INCLUDE_DIRS})
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@ -23,11 +23,8 @@ set(BOOST_PROJECT       "extern_boost")
 # checked that the devtools package of CentOS 6 installs boost 1.41.0.
 # So we use 1.41.0 here.
 set(BOOST_VER           "1.41.0")
-if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
+set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
-    message(STATUS "use pre defined download url")
+set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
    set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
    set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
 endif()
 MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -55,7 +55,7 @@ ExternalProject_Add(
    ${MKLDNN_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
-    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
+    GIT_REPOSITORY      "https://github.com/intel/mkl-dnn.git"
    GIT_TAG             "830a10059a018cd2634d94195140cf2d8790a75a"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@ -16,6 +16,12 @@ IF(NOT ${WITH_MKLML})
  return()
 ENDIF(NOT ${WITH_MKLML})
 # MKLML is not supported on macOS: warn, force the WITH_MKLML cache entry to
 # OFF (FORCE overwrites any value already stored in the cache) and stop
 # processing the rest of this file.
 IF(APPLE)
    MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
    SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
    return()
 ENDIF()
 INCLUDE(ExternalProject)
 SET(MKLML_DST_DIR       "mklml")
 SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
@ -23,32 +29,24 @@ SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
 SET(MKLML_ROOT          ${MKLML_INSTALL_DIR})
 SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
 SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
-if(WIN32)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
 SET(TIME_VERSION "2019.0.1.20181227")
 IF(WIN32)
    SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
    SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/mklml.lib)
    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5md.lib)
    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/mklml.dll)
    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5md.dll)
-else()
+ELSE()  
    SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
    SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/libmklml_intel.so)
    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5.so)
    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/libmklml_intel.so)
    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5.so)
-endif()
+ENDIF()
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
 IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
    MESSAGE(STATUS "use pre defined download url")
    if(WIN32)
        SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE)
        SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
    elseif(APPLE)
        SET(MKLML_VER "mklml_mac_2019.0.1.20180928" CACHE STRING "" FORCE)
        SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
    else()
        SET(MKLML_VER "mklml_lnx_2019.0.1.20180928" CACHE STRING "" FORCE)
        SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
    ENDIF()
 endif()
 SET(MKLML_PROJECT       "extern_mklml")
 MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -117,7 +117,7 @@ function(common_link TARGET_NAME)
  endif()
  if (WITH_JEMALLOC)
-    target_link_libraries(${TARGET_NAME} ${JEMALLOC_LIBRARIES})
+    target_link_libraries(${TARGET_NAME} jemalloc::jemalloc)
  endif()
 endfunction()
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@ -94,4 +94,4 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
        graph_viz_pass multi_devices_graph_pass
        multi_devices_graph_print_pass multi_devices_graph_check_pass
        fuse_elewise_add_act_pass multi_batch_merge_pass
-        memory_optimize_pass)
+        memory_optimize_pass lock_free_optimize_pass)
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@ -232,3 +232,4 @@ USE_PASS(analysis_var_pass);
 USE_PASS(sequential_execution_pass);
 USE_PASS(all_reduce_deps_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);
 USE_PASS(lock_free_optimize_pass);
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@ -31,6 +31,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(lock_free_optimize_pass base)
 pass_library(fc_fuse_pass inference)
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc
+++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h
+++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
@ -0,0 +1,130 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #ifndef PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
 #define PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
 #include <string>
 #include <vector>
 #include <boost/algorithm/string/predicate.hpp>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
 namespace paddle {
 namespace framework {
 namespace ir {
 class Node;
 /*
 * Remove the sum op of all gradients of the backward op.
 * And remove the dependencies of the optimizer related to the
 * same backward op.
 *
 * Before this pass:
 *
 * forward_op1 forward_op2
 *     |            |
 *  grad_op1    grad_op2
 *        \      /
 *          \  /
 *         sum_op
 *           |
 *         sgd_op
 *
 * After this pass:
 * forward_op1 forward_op2
 *     |            |
 *  grad_op1    grad_op2
 *     |            |
 *  sgd_op1      sgd_op2
 *
 * sgd_op1 and sgd_op2 will update the same weight, which is held in the same
 * memory, so we can benefit from the acceleration
 */
 // Graph pass that rewrites the "grad ops -> sum op -> optimizer op" pattern
 // so that each gradient feeds its own optimizer op (see the diagram in the
 // file comment). Only declarations and small node predicates live here.
 class LockFreeOptimizePass : public Pass {
 public:
  virtual ~LockFreeOptimizePass() {}
 protected:
  // Entry point of the pass: takes ownership of the graph and returns the
  // graph after the pass has been applied.
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
 private:
  // Create a new sgd node via current optimizer node
  ir::Node* CreateNewSGDNode(ir::Graph* graph, ir::Node* forward_node,
                             ir::Node* backward_node, ir::Node* grad_sum_node,
                             ir::Node* optimize_node) const;
  // Replace the input weight's optimizers
  void ReplaceUpstreamNode(ir::Node* upstream_node,
                           ir::Node* old_optimizer_node,
                           ir::Node* new_optimizer_node) const;
  // Replace the output weight's optimizers
  void ReplaceAllDownstreamNode(ir::Node* old_optimizer_node,
                                ir::Node* new_optimizer_node) const;
  // Find all weight variables in graph
  bool FindAllWeightVars(ir::Graph* graph) const;
  // Find the forward_op node via the backward_op node
  ir::Node* FindForwardOpViaBackwardOp(ir::Graph* graph,
                                       ir::Node* backward_node) const;
  // NOTE(review): presumably returns the nodes that link upstream_node to
  // downstream_node -- confirm against the implementation.
  std::vector<ir::Node*> FindConnectedNode(ir::Node* upstream_node,
                                           ir::Node* downstream_node) const;
  // True when `node` is an operation node whose name equals `name`.
  inline bool IsOpNamed(ir::Node* node, const std::string& name) const {
    PADDLE_ENFORCE(node);
    if (node->NodeType() != Node::Type::kOperation) {
      return false;
    }
    return node->Name() == name;
  }
  // True when `node` is a variable node whose name equals `name`.
  inline bool IsVarNamed(ir::Node* node, const std::string& name) const {
    PADDLE_ENFORCE(node);
    if (node->NodeType() != Node::Type::kVariable) {
      return false;
    }
    return node->Name() == name;
  }
  // True when `node` is a variable node whose name ends with `name`.
  inline bool IsVarNameEndsWith(ir::Node* node, const std::string& name) const {
    PADDLE_ENFORCE(node);
    if (node->NodeType() != Node::Type::kVariable) {
      return false;
    }
    return boost::algorithm::ends_with(node->Name(), name);
  }
  // True when `node` is a variable node whose name contains `name`.
  inline bool IsVarNameContains(ir::Node* node, const std::string& name) const {
    PADDLE_ENFORCE(node);
    if (node->NodeType() != Node::Type::kVariable) {
      return false;
    }
    return node->Name().find(name) != std::string::npos;
  }
  // True when `ctrl_dep_node` is a control-dependency variable whose first
  // input is `node`.
  inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const {
    PADDLE_ENFORCE(ctrl_dep_node);
    PADDLE_ENFORCE(node);
    if (!IsControlDepVar(*ctrl_dep_node)) {
      return false;
    }
    if (ctrl_dep_node->inputs.size() < 1u) {
      return false;
    }
    return ctrl_dep_node->inputs[0] == node;
  }
 };
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
 #endif  // PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@ -87,11 +87,12 @@ Variable* Scope::Var(const std::string& name) {
 }
 Variable* Scope::Var(std::string* name) {
-  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
+  SCOPE_VARS_WRITER_LOCK
  auto new_name = std::to_string(reinterpret_cast<uintptr_t>(this)) + "." +
                  std::to_string(vars_.size());
  if (name != nullptr) {
    *name = new_name;
  }
  SCOPE_VARS_WRITER_LOCK
  return VarInternal(new_name);
 }
--- a/paddle/fluid/framework/var_type_traits.cc
+++ b/paddle/fluid/framework/var_type_traits.cc
@ -105,13 +105,15 @@ struct VarIdToTypeIndexMapHolder {
 }  // namespace detail
-const std::type_index &ToTypeIndex(int var_id) {
+const std::type_index &VarTraitIdToTypeIndex(int var_id) {
  return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id);
 }
-const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); }
+const char *ToTypeName(int var_id) {
  return VarTraitIdToTypeIndex(var_id).name();
 }
-int ToTypeId(const std::type_index &type) {
+int TypeIndexToVarTraitId(const std::type_index &type) {
  return detail::VarIdToTypeIndexMapHolder::ToTypeId(type);
 }
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@ -66,8 +66,8 @@ namespace paddle {
 namespace framework {
 const char *ToTypeName(int var_id);
-const std::type_index &ToTypeIndex(int var_id);
+const std::type_index &VarTraitIdToTypeIndex(int var_id);
-int ToTypeId(const std::type_index &type);
+int TypeIndexToVarTraitId(const std::type_index &type);
 namespace detail {
--- a/paddle/fluid/framework/var_type_traits_test.cc
+++ b/paddle/fluid/framework/var_type_traits_test.cc
@ -45,10 +45,11 @@ struct TypeIndexChecker {
    constexpr auto kId = VarTypeTrait<Type>::kId;
    std::type_index actual_type(typeid(Type));
    EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name()));
-    EXPECT_EQ(ToTypeIndex(kId), actual_type);
+    EXPECT_EQ(VarTraitIdToTypeIndex(kId), actual_type);
-    EXPECT_EQ(ToTypeId(actual_type), kId);
+    EXPECT_EQ(TypeIndexToVarTraitId(actual_type), kId);
-    EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type);
+    EXPECT_EQ(VarTraitIdToTypeIndex(TypeIndexToVarTraitId(actual_type)),
-    EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId);
+              actual_type);
    EXPECT_EQ(TypeIndexToVarTraitId(VarTraitIdToTypeIndex(kId)), kId);
    EXPECT_TRUE(var_id_set->count(kId) == 0);              // NOLINT
    EXPECT_TRUE(type_index_set->count(actual_type) == 0);  // NOLINT
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@ -80,8 +80,8 @@ void TestWord2vecPrediction(const std::string& model_path) {
       i++) {
    LOG(INFO) << "data: " << static_cast<float*>(outputs.front().data.data())[i]
              << " result: " << result[i];
-    PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
+    EXPECT_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
-                   result[i]);
+                1e-3);
  }
 }
--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
@ -7,4 +7,5 @@ set(analysis_deps ${analysis_deps}
        ir_graph_build_pass
        ir_analysis_pass
        analysis_passes
        subgraph_detector
        CACHE INTERNAL "")
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@ -190,6 +190,26 @@ void BenchGRUKernel() {
  }
 }
 template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
 void BenchSeqPoolKernel() {
  std::vector<jit::SeqPoolType> pool_types = {
      jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
  for (auto type : pool_types) {
    for (int w : TestSizes()) {
      jit::seq_pool_attr_t attr(w, type);
      for (int h : TestSizes()) {
        attr.h = h;
        std::vector<T> x(h * w), y(w);
        RandomVec<T>(h * w, x.data(), -2.f, 2.f);
        const T* x_data = x.data();
        T* y_data = y.data();
        BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
                                                            y_data, &attr);
      }
    }
  }
 }
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]
 // Options:
@ -228,4 +248,7 @@ int main(int argc, char* argv[]) {
  BenchGRUKernel<jit::kGRUH1, T, PlaceType>();
  BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
  BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
  // seq pool function
  BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>();
 }
--- a/paddle/fluid/operators/jit/gen/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt
@ -26,3 +26,4 @@ USE_JITKERNEL_GEN(kGRUH1)
 USE_JITKERNEL_GEN(kGRUHtPart1)
 USE_JITKERNEL_GEN(kGRUHtPart2)
 USE_JITKERNEL_GEN(kNCHW16CMulNC)
 USE_JITKERNEL_GEN(kSeqPool)
--- a/paddle/fluid/operators/jit/gen/seqpool.cc
+++ b/paddle/fluid/operators/jit/gen/seqpool.cc
@ -0,0 +1,85 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */
 #include "paddle/fluid/operators/jit/gen/seqpool.h"
 #include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 namespace paddle {
 namespace operators {
 namespace jit {
 namespace gen {
 // Emits the machine code of the sequence-pool kernel for a fixed width w_ and
 // pool type type_. The height is NOT baked in: it is read at run time from
 // the first 32-bit field of the attribute struct (param_attr).
 //
 // Layout of the generated code:
 //   1. load h; for avg/sqrt compute the scale 1/h or 1/sqrt(h) into fp_h_
 //   2. full-register column groups (up to max_num_regs registers each)
 //   3. the remaining columns (w_ % block), only when there are any
 void SeqPoolJitCode::genCode() {
  constexpr int block = YMM_FLOAT_BLOCK;
  constexpr int max_num_regs = 8;
  const int num_block = w_ / block;
  const int num_groups = num_block / max_num_regs;
  const int rest_num_regs = num_block % max_num_regs;
  // h is the first member of seq_pool_attr_t, so it sits at offset 0.
  mov(reg32_int_h, dword[param_attr]);
  if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
    // xmm1 <- 1.0f
    mov(reg_tmp, reinterpret_cast<size_t>(exp_float_consts));
    vmovups(xmm_t(1), ptr[reg_tmp + OFFSET_EXP_ONE]);
    // Convert the integer h to float through the x87 stack, using fp_h_ as
    // scratch memory, then load it into xmm0.
    mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
    fild(dword[param_attr]);
    fstp(dword[reg_tmp]);
    vmovss(xmm_t(0), ptr[reg_tmp]);
    if (type_ == SeqPoolType::kSqrt) {
      vsqrtps(xmm_t(0), xmm_t(0));
    }
    // fp_h_[0] <- 1/h (avg) or 1/sqrt(h) (sqrt); only the low lane is stored.
    vdivps(xmm_t(1), xmm_t(1), xmm_t(0));
    vmovss(ptr[reg_tmp], xmm_t(1));
  }
  // Byte width covered by one full group of max_num_regs registers.
  const int group_len = max_num_regs * block * sizeof(float);
  for (int g = 0; g < num_groups; ++g) {
    pool_height<ymm_t>(g * group_len, block, max_num_regs);
  }
  if (rest_num_regs > 0) {
    pool_height<ymm_t>(num_groups * group_len, block, rest_num_regs);
  }
  // part of rest_w * height
  const int rest = w_ % block;
  if (rest > 0) {
    // Skip the tail entirely when w_ is a multiple of the block width;
    // otherwise an empty row loop that still iterates h times is emitted.
    pool_height_of_rest_width(rest, (w_ - rest) * sizeof(float), max_num_regs);
  }
  ret();
 }
 // Factory registered for kSeqPool: decides whether the JIT implementation can
 // be used, estimates the code buffer size and builds the SeqPoolJitCode.
 class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
 public:
  // The generated code only needs AVX; the attribute is not inspected.
  bool UseMe(const seq_pool_attr_t& attr) const override {
    return platform::MayIUse(platform::avx);
  }
  // Size estimate for the code buffer, derived from the kernel width.
  size_t CodeSize(const seq_pool_attr_t& attr) const override {
    const auto num_blocks = attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */;
    const auto num_insts = num_blocks * 4 /* load, mul and save */ + 256;
    return 96 + num_insts * 8;
  }
  // Validates that width and height are positive, then generates the code.
  std::unique_ptr<GenBase> CreateJitCode(
      const seq_pool_attr_t& attr) const override {
    PADDLE_ENFORCE_GT(attr.w, 0);
    PADDLE_ENFORCE_GT(attr.h, 0);
    const size_t code_size = CodeSize(attr);
    return make_unique<SeqPoolJitCode>(attr, code_size);
  }
 };
 }  // namespace gen
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
 namespace gen = paddle::operators::jit::gen;
 REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator);
--- a/paddle/fluid/operators/jit/gen/seqpool.h
+++ b/paddle/fluid/operators/jit/gen/seqpool.h
@ -0,0 +1,214 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */
 #pragma once
 #include <string>
 #include "glog/logging.h"
 #include "paddle/fluid/operators/jit/gen/jitcode.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
 namespace operators {
 namespace jit {
 namespace gen {
 class SeqPoolJitCode : public JitCode {
 public:
  explicit SeqPoolJitCode(const seq_pool_attr_t& attr,
                          size_t code_size = 256 * 1024,
                          void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) {
    if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg ||
          type_ == SeqPoolType::kSqrt)) {
      LOG(FATAL) << "Only support sum pool yet ";
    }
    fp_h_[0] = 1.f;
    this->genCode();
  }
  virtual const char* name() const {
    std::string base = "SeqPoolJitCode";
    if (type_ == SeqPoolType::kSum) {
      base += "_Sum";
    } else if (type_ == SeqPoolType::kAvg) {
      base += "_Avg";
    } else if (type_ == SeqPoolType::kSqrt) {
      base += "_Sqrt";
    }
    base += ("_W" + std::to_string(w_));
    return base.c_str();
  }
  void genCode() override;
 protected:
  template <typename JMM>
  void pool_height(int w_offset, int block, int max_num_regs) {
    int offset = w_offset;
    for (int i = 0; i < max_num_regs; ++i) {
      vmovups(JMM(i), ptr[param_src + offset]);
      offset += sizeof(float) * block;
    }
    cmp(reg32_int_h, 1);
    Label l_next_h, l_h_done;
    jle(l_h_done, T_NEAR);
    mov(reg_h_i, 1);
    mov(reg_tmp, param_src);
    add(reg_tmp, w_ * sizeof(float) + w_offset);
    L(l_next_h);
    {
      mov(reg_ptr_src_i, reg_tmp);
      for (int i = 0; i < max_num_regs; ++i) {
        vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]);
        // sum anyway
        vaddps(JMM(i), JMM(i), JMM(i + max_num_regs));
        add(reg_ptr_src_i, sizeof(float) * block);
      }
      inc(reg_h_i);
      add(reg_tmp, w_ * sizeof(float));
      cmp(reg_h_i, reg32_int_h);
      jl(l_next_h, T_NEAR);
    }
    L(l_h_done);
    // save right now
    if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
      mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
      vbroadcastss(JMM(max_num_regs), ptr[reg_tmp]);
    }
    offset = w_offset;
    for (int i = 0; i < max_num_regs; ++i) {
      if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
        vmulps(JMM(i), JMM(i), JMM(max_num_regs));
      }
      vmovups(ptr[param_dst + offset], JMM(i));
      offset += sizeof(float) * block;
    }
  }
  void pool_height_of_rest_width(int rest, int w_offset, int max_num_regs) {
    const int rest_used_num_regs = load_rest(rest, w_offset, 0);
    const bool has_block4 = rest / 4 > 0;
    const bool has_block2 = (rest % 4) / 2 > 0;
    const bool has_block1 = (rest % 2) == 1;
    cmp(reg32_int_h, 1);
    Label l_next_h, l_h_done;
    jle(l_h_done, T_NEAR);
    mov(reg_h_i, 1);
    mov(reg_tmp, param_src);
    add(reg_tmp, w_ * sizeof(float) + w_offset);
    L(l_next_h);
    {
      int reg_idx = 0;
      mov(reg_ptr_src_i, reg_tmp);
      if (has_block4) {
        vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
        add(reg_ptr_src_i, sizeof(float) * 4);
        reg_idx++;
      }
      if (has_block2) {
        vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
        add(reg_ptr_src_i, sizeof(float) * 2);
        reg_idx++;
      }
      if (has_block1) {
        vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
        reg_idx++;
      }
      PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs,
                        "All heights should use same regs");
      for (int i = 0; i < reg_idx; ++i) {
        vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
      }
      inc(reg_h_i);
      add(reg_tmp, w_ * sizeof(float));
      cmp(reg_h_i, reg32_int_h);
      jl(l_next_h, T_NEAR);
    }
    L(l_h_done);
    // save right now
    if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
      mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
      vbroadcastss(xmm_t(max_num_regs), ptr[reg_tmp]);
      for (int i = 0; i < rest_used_num_regs; ++i) {
        vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs));
      }
    }
    save_rest(rest, w_offset);
  }
  // return the number of used regs, use start from reg 0
  int load_rest(int rest, int w_offset, const int num_shift_regs,
                const int reg_start = 0) {
    const bool has_block4 = rest / 4 > 0;
    const bool has_block2 = (rest % 4) / 2 > 0;
    const bool has_block1 = (rest % 2) == 1;
    int reg_idx = reg_start;
    if (has_block4) {
      vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
      w_offset += sizeof(float) * 4;
      reg_idx++;
    }
    if (has_block2) {
      vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
      w_offset += sizeof(float) * 2;
      reg_idx++;
    }
    if (has_block1) {
      vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
      reg_idx++;
    }
    return reg_idx;
  }
  // use reg start from 0
  void save_rest(int rest, int w_offset, int reg_start = 0) {
    const bool has_block4 = rest / 4 > 0;
    const bool has_block2 = (rest % 4) / 2 > 0;
    const bool has_block1 = (rest % 2) == 1;
    int reg_idx = reg_start;
    if (has_block4) {
      vmovups(ptr[param_dst + w_offset], xmm_t(reg_idx));
      w_offset += sizeof(float) * 4;
      reg_idx++;
    }
    if (has_block2) {
      vmovq(ptr[param_dst + w_offset], xmm_t(reg_idx));
      w_offset += sizeof(float) * 2;
      reg_idx++;
    }
    if (has_block1) {
      vmovss(ptr[param_dst + w_offset], xmm_t(reg_idx));
    }
  }
 private:
  float ALIGN32_BEG fp_h_[1] ALIGN32_END;
  int w_;
  SeqPoolType type_;
  reg64_t param_src{abi_param1};
  reg64_t param_dst{abi_param2};
  reg64_t param_attr{abi_param3};
  reg64_t reg_tmp{rax};
  reg32_t reg32_int_h{r8d};
  reg32_t reg32_fp_h{r9d};
  reg64_t reg_h_i{r10};
  reg64_t reg_ptr_src_i{r11};
 };
 }  // namespace gen
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/jit/helper.cc
+++ b/paddle/fluid/operators/jit/helper.cc
@ -26,6 +26,7 @@ namespace jit {
 const char* to_string(KernelType kt) {
  switch (kt) {
    ONE_CASE(kNone);
    ONE_CASE(kVMul);
    ONE_CASE(kVAdd);
    ONE_CASE(kVAddRelu);
@ -45,12 +46,26 @@ const char* to_string(KernelType kt) {
    ONE_CASE(kCRFDecoding);
    ONE_CASE(kLayerNorm);
    ONE_CASE(kNCHW16CMulNC);
    ONE_CASE(kSeqPool);
    default:
      PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
      return "NOT JITKernel";
  }
  return nullptr;
 }
 // Maps a SeqPoolType value to a C string for logging.
 // ONE_CASE is defined earlier in this file (not visible here); presumably it
 // expands to a `case` that returns the enumerator's name -- TODO confirm.
 // Values without a case hit PADDLE_THROW.
 const char* to_string(SeqPoolType tp) {
  switch (tp) {
    ONE_CASE(kNonePoolType);
    ONE_CASE(kSum);
    ONE_CASE(kAvg);
    ONE_CASE(kSqrt);
    default:
      PADDLE_THROW("Not support type: %d, or forget to add it.", tp);
      // Only reached if PADDLE_THROW returns.
      return "NOT PoolType";
  }
  return nullptr;
 }
 #undef ONE_CASE
 KernelType to_kerneltype(const std::string& act) {
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@ -119,6 +119,7 @@ typename KernelTuples::func_type Get(
 }
 const char* to_string(KernelType kt);
 const char* to_string(SeqPoolType kt);
 KernelType to_kerneltype(const std::string& act);
@ -134,6 +135,11 @@ inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) {
     << "],act_cand[" << to_string(attr.act_cand) << "]";
  return os;
 }
 // Streams a seq_pool_attr_t as
 // "height_size[h],width_size[w],pool_type[name]".
 inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
  os << "height_size[" << attr.h;
  os << "],width_size[" << attr.w;
  os << "],pool_type[" << to_string(attr.type) << "]";
  return os;
 }
 }  // namespace jit
 }  // namespace operators
--- a/paddle/fluid/operators/jit/kernel_base.h
+++ b/paddle/fluid/operators/jit/kernel_base.h
@ -41,8 +41,16 @@ typedef enum {
  kCRFDecoding,
  kLayerNorm,
  kNCHW16CMulNC,
  kSeqPool,
 } KernelType;
 // Reduction applied by the sequence-pool kernel. The numeric values are
 // folded into the JIT cache key, so keep them small and stable.
 typedef enum {
  kNonePoolType = 0,
  kSum = 1,
  kAvg,
  kSqrt,
 } SeqPoolType;
 template <typename T>
 struct XYZNTuples {
  typedef T data_type;
@ -112,6 +120,21 @@ struct GRUTuples {
  typedef void (*func_type)(gru_t*, const gru_attr_t*);
 };
 // Attributes of one sequence-pool call: input height/width and pool type.
 typedef struct seq_pool_attr_s {
  // h must stay the first member: the generated JIT code reads the height
  // as the 32-bit value at offset 0 of this struct.
  int h, w;
  SeqPoolType type;
  // Default construction leaves all members uninitialized.
  seq_pool_attr_s() = default;
  // Note the argument order (width, type, height); height defaults to 1.
  explicit seq_pool_attr_s(int width, SeqPoolType pool_type, int height = 1)
      : h(height), w(width), type(pool_type) {}
 } seq_pool_attr_t;
 // Type bundle describing the sequence-pool kernel for element type T:
 // the data type, the attribute type, and the kernel function signature
 // (const T* input, T* output, const seq_pool_attr_t* attributes).
 template <typename T>
 struct SeqPoolTuples {
  typedef T data_type;
  typedef seq_pool_attr_t attr_type;
  typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
 };
 template <typename T>
 struct CRFDecodingTuples {
  typedef T data_type;
--- a/paddle/fluid/operators/jit/kernel_key.cc
+++ b/paddle/fluid/operators/jit/kernel_key.cc
@ -42,6 +42,13 @@ size_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
         (static_cast<int>(attr.act_cand) << act_type_shift);
 }
 // Cache key for generated seq-pool code: the width shifted left by three
 // bits, plus the pool type. The height is not part of the key.
 template <>
 size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
  constexpr int pool_type_shift = 3;
  const size_t width_bits = static_cast<size_t>(attr.w) << pool_type_shift;
  return width_bits + static_cast<int>(attr.type);
 }
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
--- a/Show More
+++ b/Show More