Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-dist-sparse-decay

test=develop
revert-15207-remove_op_handle_lock_and_fix_var
Qiao Longfei 6 years ago
commit d0e3b24002

@ -19,3 +19,10 @@ find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALL
mark_as_advanced( mark_as_advanced(
JEMALLOC_LIBRARIES JEMALLOC_LIBRARIES
JEMALLOC_INCLUDE_DIR) JEMALLOC_INCLUDE_DIR)
if (JEMALLOC_FOUND)
add_library(jemalloc::jemalloc UNKNOWN IMPORTED)
set_target_properties(jemalloc::jemalloc PROPERTIES
IMPORTED_LOCATION ${JEMALLOC_LIBRARIES}
INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}")
endif()

@ -2,9 +2,11 @@ if(NOT WITH_GPU)
return() return()
endif() endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70 75") set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52") set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61") set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
###################################################################################### ######################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled) # A function for automatic detection of GPUs installed (if autodetection is enabled)
@ -155,6 +157,16 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
# warning for now. # warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
add_definitions("-DPADDLE_CUDA_BINVER=\"90\"")
elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
add_definitions("-DPADDLE_CUDA_BINVER=\"100\"")
endif() endif()
include_directories(${CUDA_INCLUDE_DIRS}) include_directories(${CUDA_INCLUDE_DIRS})

@ -23,11 +23,8 @@ set(BOOST_PROJECT "extern_boost")
# checked that the devtools package of CentOS 6 installs boost 1.41.0. # checked that the devtools package of CentOS 6 installs boost 1.41.0.
# So we use 1.41.0 here. # So we use 1.41.0 here.
set(BOOST_VER "1.41.0") set(BOOST_VER "1.41.0")
if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL)) set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
message(STATUS "use pre defined download url") set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
endif()
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")

@ -55,7 +55,7 @@ ExternalProject_Add(
${MKLDNN_PROJECT} ${MKLDNN_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS} DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git"
GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a" GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a"
PREFIX ${MKLDNN_SOURCES_DIR} PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""

@ -16,6 +16,12 @@ IF(NOT ${WITH_MKLML})
return() return()
ENDIF(NOT ${WITH_MKLML}) ENDIF(NOT ${WITH_MKLML})
IF(APPLE)
MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(MKLML_DST_DIR "mklml") SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
@ -23,32 +29,24 @@ SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
SET(MKLML_ROOT ${MKLML_INSTALL_DIR}) SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
if(WIN32) SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
SET(TIME_VERSION "2019.0.1.20181227")
IF(WIN32)
SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
else() ELSE()
SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
endif() ENDIF()
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
MESSAGE(STATUS "use pre defined download url")
if(WIN32)
SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
elseif(APPLE)
SET(MKLML_VER "mklml_mac_2019.0.1.20180928" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
else()
SET(MKLML_VER "mklml_lnx_2019.0.1.20180928" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
ENDIF()
endif()
SET(MKLML_PROJECT "extern_mklml") SET(MKLML_PROJECT "extern_mklml")
MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")

@ -117,7 +117,7 @@ function(common_link TARGET_NAME)
endif() endif()
if (WITH_JEMALLOC) if (WITH_JEMALLOC)
target_link_libraries(${TARGET_NAME} ${JEMALLOC_LIBRARIES}) target_link_libraries(${TARGET_NAME} jemalloc::jemalloc)
endif() endif()
endfunction() endfunction()

@ -94,4 +94,4 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass fuse_elewise_add_act_pass multi_batch_merge_pass
memory_optimize_pass) memory_optimize_pass lock_free_optimize_pass)

@ -232,3 +232,4 @@ USE_PASS(analysis_var_pass);
USE_PASS(sequential_execution_pass); USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass); USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass); USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(lock_free_optimize_pass);

@ -31,6 +31,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
pass_library(graph_to_program_pass base) pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base) pass_library(graph_viz_pass base)
pass_library(lock_free_optimize_pass base)
pass_library(fc_fuse_pass inference) pass_library(fc_fuse_pass inference)
pass_library(attention_lstm_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference) pass_library(infer_clean_graph_pass inference)

File diff suppressed because it is too large Load Diff

@ -0,0 +1,130 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
#define PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
#include <string>
#include <vector>
#include <boost/algorithm/string/predicate.hpp>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
/*
* Remove the sum op of all gradients of the backward op.
* And remove the dependecies of the optimizer related to the
* same backward op.
*
* Before this pass:
*
* forward_op1 forward_op2
* | |
* grad_op1 grad_op2
* \ /
* \ /
* sum_op
* |
* sgd_op
*
* After this pass:
* forward_op1 forward_op2
* | |
* grad_op1 grad_op2
* | |
* sgd_op1 sgd_op2
*
* sgd_op1 and sgd_op2 will update the same weight which holds the same
* memory, so we could benefits from the acceleration
*/
class LockFreeOptimizePass : public Pass {
public:
virtual ~LockFreeOptimizePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
private:
// Create a new sgd node via current optimizer node
ir::Node* CreateNewSGDNode(ir::Graph* graph, ir::Node* forward_node,
ir::Node* backward_node, ir::Node* grad_sum_node,
ir::Node* optimize_node) const;
// Replace the input weight's optimizers
void ReplaceUpstreamNode(ir::Node* upstream_node,
ir::Node* old_optimizer_node,
ir::Node* new_optimizer_node) const;
// Replace the output weight's optimizers
void ReplaceAllDownstreamNode(ir::Node* old_optimizer_node,
ir::Node* new_optimizer_node) const;
// Find all weight variables in graph
bool FindAllWeightVars(ir::Graph* graph) const;
// Find the forward_op node via the backward_op node
ir::Node* FindForwardOpViaBackwardOp(ir::Graph* graph,
ir::Node* backward_node) const;
std::vector<ir::Node*> FindConnectedNode(ir::Node* upstream_node,
ir::Node* downstream_node) const;
inline bool IsOpNamed(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kOperation && node->Name() == name;
}
inline bool IsVarNamed(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kVariable && node->Name() == name;
}
inline bool IsVarNameEndsWith(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kVariable &&
boost::algorithm::ends_with(node->Name(), name);
}
inline bool IsVarNameContains(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kVariable &&
node->Name().find(name) != std::string::npos;
}
inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const {
PADDLE_ENFORCE(ctrl_dep_node);
PADDLE_ENFORCE(node);
return IsControlDepVar(*ctrl_dep_node) &&
ctrl_dep_node->inputs.size() >= 1u &&
ctrl_dep_node->inputs[0] == node;
}
};
} // namespace ir
} // namespace framework
} // namespace paddle
#endif // PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_

@ -87,11 +87,12 @@ Variable* Scope::Var(const std::string& name) {
} }
Variable* Scope::Var(std::string* name) { Variable* Scope::Var(std::string* name) {
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); SCOPE_VARS_WRITER_LOCK
auto new_name = std::to_string(reinterpret_cast<uintptr_t>(this)) + "." +
std::to_string(vars_.size());
if (name != nullptr) { if (name != nullptr) {
*name = new_name; *name = new_name;
} }
SCOPE_VARS_WRITER_LOCK
return VarInternal(new_name); return VarInternal(new_name);
} }

@ -105,13 +105,15 @@ struct VarIdToTypeIndexMapHolder {
} // namespace detail } // namespace detail
const std::type_index &ToTypeIndex(int var_id) { const std::type_index &VarTraitIdToTypeIndex(int var_id) {
return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id);
} }
const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } const char *ToTypeName(int var_id) {
return VarTraitIdToTypeIndex(var_id).name();
}
int ToTypeId(const std::type_index &type) { int TypeIndexToVarTraitId(const std::type_index &type) {
return detail::VarIdToTypeIndexMapHolder::ToTypeId(type); return detail::VarIdToTypeIndexMapHolder::ToTypeId(type);
} }

@ -66,8 +66,8 @@ namespace paddle {
namespace framework { namespace framework {
const char *ToTypeName(int var_id); const char *ToTypeName(int var_id);
const std::type_index &ToTypeIndex(int var_id); const std::type_index &VarTraitIdToTypeIndex(int var_id);
int ToTypeId(const std::type_index &type); int TypeIndexToVarTraitId(const std::type_index &type);
namespace detail { namespace detail {

@ -45,10 +45,11 @@ struct TypeIndexChecker {
constexpr auto kId = VarTypeTrait<Type>::kId; constexpr auto kId = VarTypeTrait<Type>::kId;
std::type_index actual_type(typeid(Type)); std::type_index actual_type(typeid(Type));
EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name()));
EXPECT_EQ(ToTypeIndex(kId), actual_type); EXPECT_EQ(VarTraitIdToTypeIndex(kId), actual_type);
EXPECT_EQ(ToTypeId(actual_type), kId); EXPECT_EQ(TypeIndexToVarTraitId(actual_type), kId);
EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type); EXPECT_EQ(VarTraitIdToTypeIndex(TypeIndexToVarTraitId(actual_type)),
EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId); actual_type);
EXPECT_EQ(TypeIndexToVarTraitId(VarTraitIdToTypeIndex(kId)), kId);
EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT
EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT

@ -80,8 +80,8 @@ void TestWord2vecPrediction(const std::string& model_path) {
i++) { i++) {
LOG(INFO) << "data: " << static_cast<float*>(outputs.front().data.data())[i] LOG(INFO) << "data: " << static_cast<float*>(outputs.front().data.data())[i]
<< " result: " << result[i]; << " result: " << result[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i], EXPECT_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
result[i]); 1e-3);
} }
} }

@ -7,4 +7,5 @@ set(analysis_deps ${analysis_deps}
ir_graph_build_pass ir_graph_build_pass
ir_analysis_pass ir_analysis_pass
analysis_passes analysis_passes
subgraph_detector
CACHE INTERNAL "") CACHE INTERNAL "")

@ -190,6 +190,26 @@ void BenchGRUKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchSeqPoolKernel() {
std::vector<jit::SeqPoolType> pool_types = {
jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
for (auto type : pool_types) {
for (int w : TestSizes()) {
jit::seq_pool_attr_t attr(w, type);
for (int h : TestSizes()) {
attr.h = h;
std::vector<T> x(h * w), y(w);
RandomVec<T>(h * w, x.data(), -2.f, 2.f);
const T* x_data = x.data();
T* y_data = y.data();
BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
y_data, &attr);
}
}
}
}
// Benchmark all jit kernels including jitcode, mkl and refer. // Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...] // To use this tool, run command: ./benchmark [options...]
// Options: // Options:
@ -228,4 +248,7 @@ int main(int argc, char* argv[]) {
BenchGRUKernel<jit::kGRUH1, T, PlaceType>(); BenchGRUKernel<jit::kGRUH1, T, PlaceType>();
BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>(); BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>(); BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
// seq pool function
BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>();
} }

@ -26,3 +26,4 @@ USE_JITKERNEL_GEN(kGRUH1)
USE_JITKERNEL_GEN(kGRUHtPart1) USE_JITKERNEL_GEN(kGRUHtPart1)
USE_JITKERNEL_GEN(kGRUHtPart2) USE_JITKERNEL_GEN(kGRUHtPart2)
USE_JITKERNEL_GEN(kNCHW16CMulNC) USE_JITKERNEL_GEN(kNCHW16CMulNC)
USE_JITKERNEL_GEN(kSeqPool)

@ -0,0 +1,85 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/seqpool.h"
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
void SeqPoolJitCode::genCode() {
constexpr int block = YMM_FLOAT_BLOCK;
constexpr int max_num_regs = 8;
const int num_block = w_ / block;
const int num_groups = num_block / max_num_regs;
int rest_num_regs = num_block % max_num_regs;
mov(reg32_int_h, dword[param_attr]);
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
mov(reg_tmp, reinterpret_cast<size_t>(exp_float_consts));
vmovups(xmm_t(1), ptr[reg_tmp + OFFSET_EXP_ONE]);
mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
fild(dword[param_attr]);
fstp(dword[reg_tmp]);
vmovss(xmm_t(0), ptr[reg_tmp]);
if (type_ == SeqPoolType::kSqrt) {
vsqrtps(xmm_t(0), xmm_t(0));
}
vdivps(xmm_t(1), xmm_t(1), xmm_t(0));
vmovss(ptr[reg_tmp], xmm_t(1));
}
const int group_len = max_num_regs * block * sizeof(float);
for (int g = 0; g < num_groups; ++g) {
pool_height<ymm_t>(g * group_len, block, max_num_regs);
}
if (rest_num_regs > 0) {
pool_height<ymm_t>(num_groups * group_len, block, rest_num_regs);
}
// part of rest_w * height
const int rest = w_ % block;
pool_height_of_rest_width(rest, (w_ - rest) * sizeof(float), max_num_regs);
ret();
}
class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
public:
bool UseMe(const seq_pool_attr_t& attr) const override {
return platform::MayIUse(platform::avx);
}
size_t CodeSize(const seq_pool_attr_t& attr) const override {
return 96 +
((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) *
4 /* load, mul and save */ +
256) *
8;
}
std::unique_ptr<GenBase> CreateJitCode(
const seq_pool_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.w, 0);
PADDLE_ENFORCE_GT(attr.h, 0);
return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
}
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator);

@ -0,0 +1,214 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
class SeqPoolJitCode : public JitCode {
public:
explicit SeqPoolJitCode(const seq_pool_attr_t& attr,
size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) {
if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg ||
type_ == SeqPoolType::kSqrt)) {
LOG(FATAL) << "Only support sum pool yet ";
}
fp_h_[0] = 1.f;
this->genCode();
}
virtual const char* name() const {
std::string base = "SeqPoolJitCode";
if (type_ == SeqPoolType::kSum) {
base += "_Sum";
} else if (type_ == SeqPoolType::kAvg) {
base += "_Avg";
} else if (type_ == SeqPoolType::kSqrt) {
base += "_Sqrt";
}
base += ("_W" + std::to_string(w_));
return base.c_str();
}
void genCode() override;
protected:
template <typename JMM>
void pool_height(int w_offset, int block, int max_num_regs) {
int offset = w_offset;
for (int i = 0; i < max_num_regs; ++i) {
vmovups(JMM(i), ptr[param_src + offset]);
offset += sizeof(float) * block;
}
cmp(reg32_int_h, 1);
Label l_next_h, l_h_done;
jle(l_h_done, T_NEAR);
mov(reg_h_i, 1);
mov(reg_tmp, param_src);
add(reg_tmp, w_ * sizeof(float) + w_offset);
L(l_next_h);
{
mov(reg_ptr_src_i, reg_tmp);
for (int i = 0; i < max_num_regs; ++i) {
vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]);
// sum anyway
vaddps(JMM(i), JMM(i), JMM(i + max_num_regs));
add(reg_ptr_src_i, sizeof(float) * block);
}
inc(reg_h_i);
add(reg_tmp, w_ * sizeof(float));
cmp(reg_h_i, reg32_int_h);
jl(l_next_h, T_NEAR);
}
L(l_h_done);
// save right now
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
vbroadcastss(JMM(max_num_regs), ptr[reg_tmp]);
}
offset = w_offset;
for (int i = 0; i < max_num_regs; ++i) {
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
vmulps(JMM(i), JMM(i), JMM(max_num_regs));
}
vmovups(ptr[param_dst + offset], JMM(i));
offset += sizeof(float) * block;
}
}
void pool_height_of_rest_width(int rest, int w_offset, int max_num_regs) {
const int rest_used_num_regs = load_rest(rest, w_offset, 0);
const bool has_block4 = rest / 4 > 0;
const bool has_block2 = (rest % 4) / 2 > 0;
const bool has_block1 = (rest % 2) == 1;
cmp(reg32_int_h, 1);
Label l_next_h, l_h_done;
jle(l_h_done, T_NEAR);
mov(reg_h_i, 1);
mov(reg_tmp, param_src);
add(reg_tmp, w_ * sizeof(float) + w_offset);
L(l_next_h);
{
int reg_idx = 0;
mov(reg_ptr_src_i, reg_tmp);
if (has_block4) {
vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
add(reg_ptr_src_i, sizeof(float) * 4);
reg_idx++;
}
if (has_block2) {
vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
add(reg_ptr_src_i, sizeof(float) * 2);
reg_idx++;
}
if (has_block1) {
vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
reg_idx++;
}
PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs,
"All heights should use same regs");
for (int i = 0; i < reg_idx; ++i) {
vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
}
inc(reg_h_i);
add(reg_tmp, w_ * sizeof(float));
cmp(reg_h_i, reg32_int_h);
jl(l_next_h, T_NEAR);
}
L(l_h_done);
// save right now
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
vbroadcastss(xmm_t(max_num_regs), ptr[reg_tmp]);
for (int i = 0; i < rest_used_num_regs; ++i) {
vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs));
}
}
save_rest(rest, w_offset);
}
// return the number of used regs, use start from reg 0
int load_rest(int rest, int w_offset, const int num_shift_regs,
const int reg_start = 0) {
const bool has_block4 = rest / 4 > 0;
const bool has_block2 = (rest % 4) / 2 > 0;
const bool has_block1 = (rest % 2) == 1;
int reg_idx = reg_start;
if (has_block4) {
vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
w_offset += sizeof(float) * 4;
reg_idx++;
}
if (has_block2) {
vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
w_offset += sizeof(float) * 2;
reg_idx++;
}
if (has_block1) {
vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
reg_idx++;
}
return reg_idx;
}
// use reg start from 0
void save_rest(int rest, int w_offset, int reg_start = 0) {
const bool has_block4 = rest / 4 > 0;
const bool has_block2 = (rest % 4) / 2 > 0;
const bool has_block1 = (rest % 2) == 1;
int reg_idx = reg_start;
if (has_block4) {
vmovups(ptr[param_dst + w_offset], xmm_t(reg_idx));
w_offset += sizeof(float) * 4;
reg_idx++;
}
if (has_block2) {
vmovq(ptr[param_dst + w_offset], xmm_t(reg_idx));
w_offset += sizeof(float) * 2;
reg_idx++;
}
if (has_block1) {
vmovss(ptr[param_dst + w_offset], xmm_t(reg_idx));
}
}
private:
float ALIGN32_BEG fp_h_[1] ALIGN32_END;
int w_;
SeqPoolType type_;
reg64_t param_src{abi_param1};
reg64_t param_dst{abi_param2};
reg64_t param_attr{abi_param3};
reg64_t reg_tmp{rax};
reg32_t reg32_int_h{r8d};
reg32_t reg32_fp_h{r9d};
reg64_t reg_h_i{r10};
reg64_t reg_ptr_src_i{r11};
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle

@ -26,6 +26,7 @@ namespace jit {
const char* to_string(KernelType kt) { const char* to_string(KernelType kt) {
switch (kt) { switch (kt) {
ONE_CASE(kNone);
ONE_CASE(kVMul); ONE_CASE(kVMul);
ONE_CASE(kVAdd); ONE_CASE(kVAdd);
ONE_CASE(kVAddRelu); ONE_CASE(kVAddRelu);
@ -45,12 +46,26 @@ const char* to_string(KernelType kt) {
ONE_CASE(kCRFDecoding); ONE_CASE(kCRFDecoding);
ONE_CASE(kLayerNorm); ONE_CASE(kLayerNorm);
ONE_CASE(kNCHW16CMulNC); ONE_CASE(kNCHW16CMulNC);
ONE_CASE(kSeqPool);
default: default:
PADDLE_THROW("Not support type: %d, or forget to add it.", kt); PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
return "NOT JITKernel"; return "NOT JITKernel";
} }
return nullptr; return nullptr;
} }
const char* to_string(SeqPoolType tp) {
switch (tp) {
ONE_CASE(kNonePoolType);
ONE_CASE(kSum);
ONE_CASE(kAvg);
ONE_CASE(kSqrt);
default:
PADDLE_THROW("Not support type: %d, or forget to add it.", tp);
return "NOT PoolType";
}
return nullptr;
}
#undef ONE_CASE #undef ONE_CASE
KernelType to_kerneltype(const std::string& act) { KernelType to_kerneltype(const std::string& act) {

@ -119,6 +119,7 @@ typename KernelTuples::func_type Get(
} }
const char* to_string(KernelType kt); const char* to_string(KernelType kt);
const char* to_string(SeqPoolType kt);
KernelType to_kerneltype(const std::string& act); KernelType to_kerneltype(const std::string& act);
@ -134,6 +135,11 @@ inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) {
<< "],act_cand[" << to_string(attr.act_cand) << "]"; << "],act_cand[" << to_string(attr.act_cand) << "]";
return os; return os;
} }
inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type["
<< to_string(attr.type) << "]";
return os;
}
} // namespace jit } // namespace jit
} // namespace operators } // namespace operators

@ -41,8 +41,16 @@ typedef enum {
kCRFDecoding, kCRFDecoding,
kLayerNorm, kLayerNorm,
kNCHW16CMulNC, kNCHW16CMulNC,
kSeqPool,
} KernelType; } KernelType;
typedef enum {
kNonePoolType = 0,
kSum = 1,
kAvg,
kSqrt,
} SeqPoolType;
template <typename T> template <typename T>
struct XYZNTuples { struct XYZNTuples {
typedef T data_type; typedef T data_type;
@ -112,6 +120,21 @@ struct GRUTuples {
typedef void (*func_type)(gru_t*, const gru_attr_t*); typedef void (*func_type)(gru_t*, const gru_attr_t*);
}; };
typedef struct seq_pool_attr_s {
int h, w; // h should always be the first one
SeqPoolType type;
seq_pool_attr_s() = default;
explicit seq_pool_attr_s(int width, SeqPoolType pool_type, int height = 1)
: h(height), w(width), type(pool_type) {}
} seq_pool_attr_t;
template <typename T>
struct SeqPoolTuples {
typedef T data_type;
typedef seq_pool_attr_t attr_type;
typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
};
template <typename T> template <typename T>
struct CRFDecodingTuples { struct CRFDecodingTuples {
typedef T data_type; typedef T data_type;

@ -42,6 +42,13 @@ size_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
(static_cast<int>(attr.act_cand) << act_type_shift); (static_cast<int>(attr.act_cand) << act_type_shift);
} }
template <>
size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
size_t key = attr.w;
constexpr int pool_type_shift = 3;
return (key << pool_type_shift) + static_cast<int>(attr.type);
}
} // namespace jit } // namespace jit
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save