|
|
|
|
@ -9,8 +9,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
|
limitations under the License. */
|
|
|
|
|
|
|
|
|
|
#include <paddle/fluid/memory/allocation/allocator.h>
|
|
|
|
|
#ifdef __NVCC__
|
|
|
|
|
#include "cub/cub.cuh"
|
|
|
|
|
#endif
|
|
|
|
|
#ifdef __HIPCC__
|
|
|
|
|
#include <hipcub/hipcub.hpp>
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#include <paddle/fluid/memory/allocation/allocator.h>
|
|
|
|
|
#include "paddle/fluid/framework/mixed_vector.h"
|
|
|
|
|
#include "paddle/fluid/framework/op_registry.h"
|
|
|
|
|
#include "paddle/fluid/memory/memcpy.h"
|
|
|
|
|
@ -135,17 +141,29 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
|
|
|
|
|
|
|
|
|
|
// Determine temporary device storage requirements
|
|
|
|
|
size_t temp_storage_bytes = 0;
|
|
|
|
|
#ifdef PADDLE_WITH_HIP
|
|
|
|
|
hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
|
|
|
|
|
nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
|
|
|
|
|
idx_out, total_roi_num);
|
|
|
|
|
#else
|
|
|
|
|
cub::DeviceRadixSort::SortPairsDescending<T, int>(
|
|
|
|
|
nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
|
|
|
|
|
idx_out, total_roi_num);
|
|
|
|
|
#endif
|
|
|
|
|
// Allocate temporary storage
|
|
|
|
|
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
|
|
|
|
|
|
|
|
|
|
// Run the sorting operation: sort the scores in descending order to get the
// corresponding indices.
|
|
|
|
|
#ifdef PADDLE_WITH_HIP
|
|
|
|
|
hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
|
|
|
|
|
d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
|
|
|
|
|
keys_out, idx_in, idx_out, total_roi_num);
|
|
|
|
|
#else
|
|
|
|
|
cub::DeviceRadixSort::SortPairsDescending<T, int>(
|
|
|
|
|
d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
|
|
|
|
|
keys_out, idx_in, idx_out, total_roi_num);
|
|
|
|
|
#endif
|
|
|
|
|
index_out_t.Resize({real_post_num});
|
|
|
|
|
Tensor sorted_rois;
|
|
|
|
|
sorted_rois.mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
|
|
|
|
|
@ -167,17 +185,29 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
|
|
|
|
|
out_id_t.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
|
|
|
|
|
// Determine temporary device storage requirements
|
|
|
|
|
temp_storage_bytes = 0;
|
|
|
|
|
#ifdef PADDLE_WITH_HIP
|
|
|
|
|
hipcub::DeviceRadixSort::SortPairs<int, int>(
|
|
|
|
|
nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
|
|
|
|
|
batch_idx_in, index_out_t.data<int>(), real_post_num);
|
|
|
|
|
#else
|
|
|
|
|
cub::DeviceRadixSort::SortPairs<int, int>(
|
|
|
|
|
nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
|
|
|
|
|
batch_idx_in, index_out_t.data<int>(), real_post_num);
|
|
|
|
|
#endif
|
|
|
|
|
// Allocate temporary storage
|
|
|
|
|
d_temp_storage = memory::Alloc(place, temp_storage_bytes);
|
|
|
|
|
|
|
|
|
|
// Run the sorting operation: sort by batch_id to get the corresponding
// indices.
|
|
|
|
|
#ifdef PADDLE_WITH_HIP
|
|
|
|
|
hipcub::DeviceRadixSort::SortPairs<int, int>(
|
|
|
|
|
d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
|
|
|
|
|
out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
|
|
|
|
|
#else
|
|
|
|
|
cub::DeviceRadixSort::SortPairs<int, int>(
|
|
|
|
|
d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
|
|
|
|
|
out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
|
|
|
|
|
|
|
|
|
|
|