Add the CUDA kernel for beam_search op (#15020)
* Refine the beam_search op and test. * A basic CUDA implementation of beam_search for small batch_size. * Implement CUDA kernel for beam_search_op. * Use multiple CUDA threads in the same block to select the top beam. * Update the python api of beam_search op. * Enable extend function in CPU kernel of beam_search op. * Unify the CUDA codes. test=develop * Unify the CPU kernel of beam_search op. * Ensure the selected items of beam_search_op's CPU kernel are sorted by scores. * Update the description of beam_search in API.spec. * Enable the use of CUDA kernel in beam_search op. * Exclude the beam_search's CUDA unittest when there is no CUDA gpu, and delete some debugging statements. test=develop * Follow comments. test=develop * Call the CPU kernel for beam_search op when batch_size > 4. test=develop * Remove the except of is_empty op in PrepareData. test=develop (branch: inference-pre-release-gpu)
parent
ed1726eaaa
commit
3008fa1261
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,24 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/fluid/operators/beam_search_op.h"
#include "paddle/fluid/framework/op_registry.h"

namespace ops = paddle::operators;

// Register the CUDA kernels of the beam_search operator for every
// element type the op supports.
using CUDACtx = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(beam_search,
                        ops::BeamSearchOpKernel<CUDACtx, float>,
                        ops::BeamSearchOpKernel<CUDACtx, double>,
                        ops::BeamSearchOpKernel<CUDACtx, int>,
                        ops::BeamSearchOpKernel<CUDACtx, int64_t>);
@ -1,92 +0,0 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/fluid/operators/beam_search_op.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <vector>
|
||||
|
||||
namespace paddle {
|
||||
namespace test {
|
||||
|
||||
using std::vector;
|
||||
using framework::LoDTensor;
|
||||
using framework::LoD;
|
||||
using operators::BeamSearch;
|
||||
using paddle::platform::CPUPlace;
|
||||
using std::cout;
|
||||
using std::endl;
|
||||
|
||||
// Builds the `ids`/`scores` input pair for the beam_search test: two
// 4x3 tensors sharing a two-level LoD ([0, 2, 4] over [0, 1, 2, 3, 4]),
// filled with fixed candidate ids and their scores.
void CreateInput(LoDTensor* ids, LoDTensor* scores) {
  // Both tensors carry the same two-level LoD.
  LoD lod;
  lod.push_back(vector<size_t>({0, 2, 4}));
  lod.push_back(vector<size_t>({0, 1, 2, 3, 4}));
  ids->set_lod(lod);
  scores->set_lod(lod);

  auto dims = framework::make_ddim(vector<int64_t>({4, 3}));
  ids->Resize(dims);
  scores->Resize(dims);
  CPUPlace place;

  auto* ids_data = ids->mutable_data<int64_t>(place);
  auto* scores_data = scores->mutable_data<float>(place);
  const vector<int64_t> raw_ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
  const vector<float> raw_scores(
      {0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f});

  for (size_t i = 0; i < raw_ids.size(); ++i) {
    ids_data[i] = raw_ids[i];
    scores_data[i] = raw_scores[i];
  }
}
|
||||
|
||||
// It seems that beam_search_op has bugs.
|
||||
TEST(DISABLED_beam_search_op, run) {
|
||||
CPUPlace place;
|
||||
LoDTensor ids, scores;
|
||||
CreateInput(&ids, &scores);
|
||||
|
||||
LoDTensor pre_ids;
|
||||
pre_ids.Resize(framework::make_ddim(vector<int64_t>(4, 1)));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
pre_ids.mutable_data<int64_t>(place)[i] = i + 1;
|
||||
}
|
||||
LoDTensor pre_scores;
|
||||
pre_scores.Resize(framework::make_ddim(vector<int64_t>(4, 1)));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
pre_scores.mutable_data<float>(place)[i] = 0.1 * (i + 1);
|
||||
}
|
||||
|
||||
BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0);
|
||||
LoDTensor sids, sscores;
|
||||
beamsearch(pre_ids, pre_scores, &sids, &sscores);
|
||||
|
||||
LOG(INFO) << "score: " << sscores << endl;
|
||||
|
||||
ASSERT_EQ(sids.lod(), sscores.lod());
|
||||
|
||||
vector<int> tids({4, 2, 3, 8});
|
||||
vector<float> tscores({0.5f, 0.6f, 0.9f, 0.7f});
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
ASSERT_EQ(tids[i], sids.data<int64_t>()[i]);
|
||||
ASSERT_EQ(tscores[i], sscores.data<float>()[i]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace paddle
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,119 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "paddle/fluid/framework/lod_tensor.h"
|
||||
#include "paddle/fluid/platform/device_context.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
namespace math {
|
||||
|
||||
/*
|
||||
* This is an implementation of beam search.
|
||||
*
|
||||
* To explain the details, lets take machine translation task for example, in
|
||||
* this task, one source sentence is translated to multiple target sentences,
|
||||
* during this period, one sentence will be translated to multiple translation
|
||||
* prefixes(target sentence that have not ended), in each time step a prefix
|
||||
* will have some candidates, input the candidate ids and their corresponding
|
||||
* scores (probabilities), it will sort and select the top beam_size candidates
|
||||
* for each source sentence, and store the selected candidates's score and their
|
||||
* corresponding ids to LoDTensors.
|
||||
*
|
||||
* A detailed example:
|
||||
*
|
||||
* Input
|
||||
*
|
||||
* ids:
|
||||
* - LoD (should have 2 levels)
|
||||
* - first level: [0, 1, 4]
|
||||
* - second level: [0, 1, 2, 3, 4]
|
||||
* - tensor's data:
|
||||
* [[4, 2, 5]
|
||||
* [2, 1, 3]
|
||||
* [3, 5, 2]
|
||||
* [8, 2, 1]]
|
||||
*
|
||||
* scores:
|
||||
* - LoD same as `ids`
|
||||
* - tensor's data
|
||||
* [[0.5, 0.3, 0.2]
|
||||
* [0.6, 0.3, 0.1]
|
||||
* [0.9, 0.5, 0.1]
|
||||
* [0.7, 0.5, 0.1]]
|
||||
*
|
||||
 * The inputs mean that there are 2 source sentences to translate; the
 * first source has 1 prefix, the second source has 2 prefixes.
||||
*
|
||||
* Lets assume beam size is 2, and the beam search's output should be
|
||||
* - LoD
|
||||
* - first level: [0, 1, 2]
|
||||
* - second level: [0, 2, 4]
|
||||
* - id tensor's data
|
||||
* [[4,
|
||||
* 1,
|
||||
* 3,
|
||||
* 8]]
|
||||
* - score tensor's data
|
||||
* [[0.5,
|
||||
* 0.3,
|
||||
* 0.9,
|
||||
* 0.7]]
|
||||
*
|
||||
* TODO all the prune operations should be in the beam search, so it is better
|
||||
* to split the beam search algorithm into a sequence of smaller operators, and
|
||||
* the prune operators can be inserted in this sequence.
|
||||
*/
|
||||
template <typename DeviceContext, typename T>
class BeamSearchFunctor {
 public:
  /*
   * The main function of beam search: sorts the candidates in `ids`/`scores`
   * and keeps the top `beam_size` per source sentence, writing the winners
   * into `selected_ids`/`selected_scores`.
   *
   * @pre_ids / @pre_scores: the ids and (accumulated) scores selected at the
   * previous step — presumably one row per live beam; TODO confirm shape
   * against the caller.
   *
   * @selected_ids: a [None, 1]-shaped tensor with LoD.
   * In a machine translation model, it might be the candidate term id sets,
   * each set stored as a variable-length sequence.
   * The format might be described with a two-level LoD
   * - [[0 1],
   *    [0 1 2]]
   * - [[]
   *    [0 1]]
   * the first level of LoD tells that there are two source sentences. The
   * second level describes the details of the candidate id set's offsets in
   * the source sentences.
   *
   * @selected_scores: a LoD tensor with the same shape and LoD as
   * selected_ids. It stores the corresponding scores of candidate ids in
   * selected_ids.
   *
   * @level: which LoD level of `ids` denotes the source sentences.
   * @beam_size: number of candidates kept per source sentence.
   * @end_id: token id that marks a finished sequence.
   * @is_accumulated: whether `scores` already holds accumulated scores.
   *
   * NOTE(review): a legacy comment claimed this returned false when all
   * input tensors are empty, but the functor returns void — confirm with
   * the implementation how the "no candidates" case is signalled.
   */
  void operator()(const DeviceContext& context,
                  const framework::LoDTensor* pre_ids,
                  const framework::LoDTensor* pre_scores,
                  const framework::LoDTensor* ids,
                  const framework::LoDTensor* scores,
                  framework::LoDTensor* selected_ids,
                  framework::LoDTensor* selected_scores, size_t level,
                  size_t beam_size, int end_id, bool is_accumulated);
};
|
||||
|
||||
} // namespace math
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
@ -0,0 +1,141 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/fluid/operators/math/beam_search.h"
|
||||
#include <gtest/gtest.h>
|
||||
#include <vector>
|
||||
|
||||
// Builds the CPU-side fixture for the beam-search functor test:
// `ids`/`scores` are 4x3 tensors sharing a two-level LoD
// ([0, 2, 4] over [0, 1, 2, 3, 4]); `pre_ids`/`pre_scores` are the
// per-beam [4, 1] tensors from the "previous" step.
void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
                       paddle::framework::LoDTensor* scores,
                       paddle::framework::LoDTensor* pre_ids,
                       paddle::framework::LoDTensor* pre_scores) {
  // ids and scores share one two-level LoD.
  paddle::framework::LoD lod;
  lod.push_back(std::vector<size_t>({0, 2, 4}));
  lod.push_back(std::vector<size_t>({0, 1, 2, 3, 4}));
  ids->set_lod(lod);
  scores->set_lod(lod);

  const auto dims = paddle::framework::make_ddim({4, 3});
  ids->Resize(dims);
  scores->Resize(dims);

  paddle::platform::CPUPlace place;
  auto* ids_data = ids->mutable_data<int64_t>(place);
  auto* scores_data = scores->mutable_data<float>(place);
  const std::vector<int64_t> raw_ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
  const std::vector<float> raw_scores(
      {0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f});

  CHECK_EQ(static_cast<size_t>(ids->numel()), raw_ids.size());
  CHECK_EQ(static_cast<size_t>(ids->numel()), raw_scores.size());

  for (size_t i = 0; i < raw_ids.size(); ++i) {
    ids_data[i] = raw_ids[i];
    scores_data[i] = raw_scores[i];
  }

  // pre_ids: previously selected id per beam (1, 2, 3, 4).
  pre_ids->Resize(paddle::framework::make_ddim({4, 1}));
  auto* pre_ids_data = pre_ids->mutable_data<int64_t>(place);
  for (int i = 0; i < 4; ++i) {
    pre_ids_data[i] = i + 1;
  }

  // pre_scores: score carried over for each beam (0.1, 0.2, 0.3, 0.4).
  pre_scores->Resize(paddle::framework::make_ddim({4, 1}));
  auto* pre_scores_data = pre_scores->mutable_data<float>(place);
  for (int i = 0; i < 4; ++i) {
    pre_scores_data[i] = 0.1 * (i + 1);
  }
}
|
||||
|
||||
template <typename DeviceContext, typename Place>
|
||||
void TestBeamSearch() {
|
||||
paddle::framework::LoDTensor ids;
|
||||
paddle::framework::LoDTensor scores;
|
||||
paddle::framework::LoDTensor pre_ids;
|
||||
paddle::framework::LoDTensor pre_scores;
|
||||
|
||||
auto* place = new Place();
|
||||
DeviceContext* context = new DeviceContext(*place);
|
||||
if (paddle::platform::is_cpu_place(*place)) {
|
||||
PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores);
|
||||
} else {
|
||||
paddle::framework::LoDTensor cpu_ids;
|
||||
paddle::framework::LoDTensor cpu_scores;
|
||||
paddle::framework::LoDTensor cpu_pre_ids;
|
||||
paddle::framework::LoDTensor cpu_pre_scores;
|
||||
|
||||
PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores);
|
||||
|
||||
TensorCopySync(cpu_ids, *place, &ids);
|
||||
TensorCopySync(cpu_scores, *place, &scores);
|
||||
TensorCopySync(cpu_pre_ids, *place, &pre_ids);
|
||||
TensorCopySync(cpu_pre_scores, *place, &pre_scores);
|
||||
|
||||
ids.set_lod(cpu_ids.lod());
|
||||
scores.set_lod(cpu_scores.lod());
|
||||
pre_ids.set_lod(cpu_pre_ids.lod());
|
||||
pre_scores.set_lod(cpu_pre_scores.lod());
|
||||
}
|
||||
|
||||
paddle::framework::LoDTensor selected_ids;
|
||||
paddle::framework::LoDTensor selected_scores;
|
||||
|
||||
size_t level = 0;
|
||||
size_t beam_size = 2;
|
||||
int end_id = 0;
|
||||
paddle::operators::math::BeamSearchFunctor<DeviceContext, float> beamsearch;
|
||||
beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
|
||||
&selected_scores, level, beam_size, end_id, true);
|
||||
|
||||
ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
|
||||
|
||||
paddle::framework::LoDTensor cpu_selected_ids;
|
||||
paddle::framework::LoDTensor cpu_selected_scores;
|
||||
if (paddle::platform::is_cpu_place(*place)) {
|
||||
cpu_selected_ids = selected_ids;
|
||||
cpu_selected_scores = selected_scores;
|
||||
} else {
|
||||
TensorCopySync(selected_ids, paddle::platform::CPUPlace(),
|
||||
&cpu_selected_ids);
|
||||
TensorCopySync(selected_scores, paddle::platform::CPUPlace(),
|
||||
&cpu_selected_scores);
|
||||
cpu_selected_ids.set_lod(selected_ids.lod());
|
||||
cpu_selected_scores.set_lod(selected_scores.lod());
|
||||
}
|
||||
|
||||
std::vector<int64_t> expected_ids({4, 5, 3, 8});
|
||||
std::vector<float> expected_scores({0.6f, 0.5f, 0.9f, 0.7f});
|
||||
for (int i = 0; i < 4; i++) {
|
||||
ASSERT_EQ(expected_ids[i], cpu_selected_ids.data<int64_t>()[i]);
|
||||
ASSERT_EQ(expected_scores[i], cpu_selected_scores.data<float>()[i]);
|
||||
}
|
||||
|
||||
delete place;
|
||||
delete context;
|
||||
}
|
||||
|
||||
// Exercises the CPU implementation of BeamSearchFunctor.
TEST(BeamSearch, CPU) {
  TestBeamSearch<paddle::platform::CPUDeviceContext,
                 paddle::platform::CPUPlace>();
}
|
||||
|
||||
#ifdef PADDLE_WITH_CUDA
|
||||
TEST(BeamSearch, GPU) {
|
||||
TestBeamSearch<paddle::platform::CUDADeviceContext,
|
||||
paddle::platform::CUDAPlace>();
|
||||
}
|
||||
#endif
|
Loading…
Reference in new issue