You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
150 lines
5.1 KiB
150 lines
5.1 KiB
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */
|
|
|
|
#include "paddle/fluid/operators/jit/gen/embseqpool.h"
|
|
#include <stddef.h> // offsetof
|
|
#include <vector>
|
|
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
|
|
#include "paddle/fluid/operators/jit/registry.h"
|
|
#include "paddle/fluid/platform/cpu_info.h"
|
|
|
|
namespace paddle {
|
|
namespace operators {
|
|
namespace jit {
|
|
namespace gen {
|
|
|
|
void EmbSeqPoolJitCode::genCode() {
|
|
preCode();
|
|
constexpr int block = YMM_FLOAT_BLOCK;
|
|
constexpr int max_num_regs = 8;
|
|
const int num_block = tbl_w_ / block;
|
|
const int num_groups = num_block / max_num_regs;
|
|
const size_t block_size = sizeof(float) * block;
|
|
std::vector<int> groups(num_groups, max_num_regs);
|
|
int rest_num_regs = num_block % max_num_regs;
|
|
if (rest_num_regs > 0) {
|
|
groups.push_back(rest_num_regs);
|
|
}
|
|
|
|
// protect param_dst
|
|
mov(reg_ptr_param_dst, param_dst);
|
|
mov(reg_idx_width_in_byte,
|
|
qword[param_attr + offsetof(emb_seq_pool_attr_t, index_width)]);
|
|
mov(reg_idx_height,
|
|
qword[param_attr + offsetof(emb_seq_pool_attr_t, index_height)]);
|
|
mov(rax, sizeof(int64_t));
|
|
mul(reg_idx_width_in_byte);
|
|
mov(reg_idx_width_in_byte, rax);
|
|
const size_t tbl_width_in_byte = sizeof(float) * tbl_w_;
|
|
int acc_num_regs = 0;
|
|
for (int num_regs : groups) {
|
|
Label l_next_idx_w, l_next_idx_h, l_save_now;
|
|
xor_(reg_idx_w_i_in_byte, reg_idx_w_i_in_byte);
|
|
mov(reg_ptr_dst_i, reg_ptr_param_dst);
|
|
add(reg_ptr_dst_i, acc_num_regs * block_size);
|
|
|
|
L(l_next_idx_w);
|
|
{
|
|
// h == 0
|
|
mov(reg_ptr_idx_i, param_idx);
|
|
add(reg_ptr_idx_i, reg_idx_w_i_in_byte);
|
|
mov(reg_idx, qword[reg_ptr_idx_i]);
|
|
mov(rax, tbl_width_in_byte);
|
|
mul(reg_idx);
|
|
mov(reg_ptr_tbl_i, rax); // reg is offset now
|
|
add(reg_ptr_tbl_i, param_tbl); // reg is ptr_i now
|
|
size_t w_offset = 0;
|
|
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
|
|
vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_tbl_i + w_offset]);
|
|
w_offset += block_size;
|
|
}
|
|
add(reg_ptr_idx_i, reg_idx_width_in_byte);
|
|
|
|
// end condition of idx h
|
|
mov(reg_idx_h_end, reg_idx_height);
|
|
mov(rax, reg_idx_width_in_byte);
|
|
mul(reg_idx_h_end);
|
|
mov(reg_idx_h_end, rax);
|
|
add(reg_idx_h_end, reg_idx_w_i_in_byte);
|
|
add(reg_idx_h_end, param_idx);
|
|
|
|
cmp(reg_ptr_idx_i, reg_idx_h_end);
|
|
jge(l_save_now, T_NEAR);
|
|
L(l_next_idx_h);
|
|
{
|
|
mov(reg_idx, qword[reg_ptr_idx_i]);
|
|
mov(reg_ptr_tbl_i, reg_idx);
|
|
mov(rax, tbl_width_in_byte);
|
|
mul(reg_idx);
|
|
mov(reg_ptr_tbl_i, rax);
|
|
add(reg_ptr_tbl_i, param_tbl);
|
|
size_t w_offset = 0;
|
|
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
|
|
vmovups(ymm_t(reg_i), ptr[reg_ptr_tbl_i + w_offset]);
|
|
vaddps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs),
|
|
ymm_t(reg_i));
|
|
w_offset += block_size;
|
|
}
|
|
add(reg_ptr_idx_i, reg_idx_width_in_byte);
|
|
cmp(reg_ptr_idx_i, reg_idx_h_end);
|
|
jl(l_next_idx_h, T_NEAR);
|
|
} // end of idx h
|
|
L(l_save_now);
|
|
// avg or sqrt here, if needed
|
|
w_offset = 0;
|
|
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
|
|
vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i + num_regs));
|
|
w_offset += block_size;
|
|
}
|
|
add(reg_ptr_dst_i, tbl_width_in_byte);
|
|
add(reg_idx_w_i_in_byte, sizeof(int64_t));
|
|
cmp(reg_idx_w_i_in_byte, reg_idx_width_in_byte);
|
|
jl(l_next_idx_w, T_NEAR);
|
|
} // end of idx w
|
|
|
|
acc_num_regs += num_regs;
|
|
add(param_tbl, num_regs * block_size); // do not use acc_num_regs
|
|
} // end of groups
|
|
postCode();
|
|
}
|
|
|
|
// Creator that decides whether the generated EmbSeqPool kernel can be used
// for a given attribute set and, if so, builds the EmbSeqPoolJitCode object.
class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
 public:
  // True when platform::MayIUse reports AVX and the table width is a whole
  // number of YMM float blocks.
  bool UseMe(const emb_seq_pool_attr_t& attr) const override {
    if (!platform::MayIUse(platform::avx)) {
      return false;
    }
    return attr.table_width % YMM_FLOAT_BLOCK == 0;
  }

  // Code buffer size requested for `attr`: a fixed 96 bytes plus 96 * 8 bytes
  // for every YMM block of the table width.
  size_t CodeSize(const emb_seq_pool_attr_t& attr) const override {
    const auto ymm_blocks = attr.table_width / YMM_FLOAT_BLOCK;
    return 96 + ymm_blocks * 96 * 8;
  }

  // Checks that every dimension in `attr` is positive, then constructs the
  // code generator with the buffer size returned by CodeSize().
  std::unique_ptr<GenBase> CreateJitCode(
      const emb_seq_pool_attr_t& attr) const override {
    PADDLE_ENFORCE_GT(attr.table_height, 0);
    PADDLE_ENFORCE_GT(attr.table_width, 0);
    PADDLE_ENFORCE_GT(attr.index_height, 0);
    PADDLE_ENFORCE_GT(attr.index_width, 0);
    PADDLE_ENFORCE_GT(attr.out_width, 0);
    const size_t code_size = CodeSize(attr);
    return make_unique<EmbSeqPoolJitCode>(attr, code_size);
  }
};
|
|
|
|
} // namespace gen
|
|
} // namespace jit
|
|
} // namespace operators
|
|
} // namespace paddle
|
|
|
|
// File-scope alias so the registration at the end of this file can refer to
// the creator class as gen::EmbSeqPoolCreator.
namespace gen = paddle::operators::jit::gen;
|
|
|
|
// Registers gen::EmbSeqPoolCreator under the kEmbSeqPool key.
// NOTE(review): the macro is defined outside this file (presumably in
// jit/registry.h) - confirm its exact registration semantics there.
REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator);
|