Optimize FasterRCNN on GPU; fix RandomChoiceWithMask and TopK kernels

pull/11802/head
RobinGrosman 4 years ago
parent e2907c1280
commit 5d5225f2ee

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,9 +14,10 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_TOPK_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_TOPK_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_TOPK_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_TOPK_GPU_KERNEL_H_
#include <limits>
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
@@ -27,7 +28,7 @@ namespace kernel {
template <typename T, typename S>
class TopKGpuKernel : public GpuKernel {
public:
TopKGpuKernel() : sorted_(false), outer_size_(1), inner_size_(1), k_(1), use_share_mem_(true), ceil_power2_(0) {}
TopKGpuKernel() : sorted_(false), outer_size_(1), inner_size_(1), k_(1), input_shape_size_(0) {}
~TopKGpuKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -40,26 +41,17 @@ class TopKGpuKernel : public GpuKernel {
S *k = GetDeviceAddress<S>(inputs, 1);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
S *indices = GetDeviceAddress<S>(outputs, 1);
T *data_buff = nullptr;
S *index_buff = nullptr;
if (use_share_mem_ == false) {
data_buff = GetDeviceAddress<T>(workspaces, 0);
index_buff = GetDeviceAddress<S>(workspaces, 1);
}
TopK(outer_size_, inner_size_, input_addr, k, output_addr, indices, data_buff, index_buff,
reinterpret_cast<cudaStream_t>(stream_ptr));
const T init_k = std::numeric_limits<T>::lowest();
if (sorted_ == false) {
BitonicSortByKey(outer_size_, k_, output_addr, indices, data_buff, index_buff,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
FastTopK(outer_size_, inner_size_, input_addr, k, output_addr, indices, init_k,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
bool Init(const CNodePtr &kernel_node) override {
auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
input_shape_size_ = input_shapes.size();
for (size_t i = 0; i < input_shapes.size() - 1; i++) {
outer_size_ *= input_shapes[i];
}
@@ -68,13 +60,6 @@ class TopKGpuKernel : public GpuKernel {
sorted_ = GetAttr<bool>(kernel_node, "sorted");
ceil_power2_ = RoundUpPower2(inner_size_);
size_t buffer_size = ceil_power2_ * (sizeof(T) + sizeof(S));
if (buffer_size > SHARED_MEM_PER_BLOCK) {
use_share_mem_ = false;
MS_LOG(INFO) << "CUDA share memory not enough, sort with RAM";
}
InitSizeLists();
return true;
}
@@ -85,10 +70,6 @@ class TopKGpuKernel : public GpuKernel {
input_size_list_.push_back(sizeof(S));
output_size_list_.push_back(outer_size_ * k_ * sizeof(T));
output_size_list_.push_back(outer_size_ * k_ * sizeof(S));
if (use_share_mem_ == false) {
workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(T));
workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(S));
}
}
private:
@@ -96,8 +77,7 @@ class TopKGpuKernel : public GpuKernel {
size_t outer_size_;
size_t inner_size_;
size_t k_;
bool use_share_mem_;
size_t ceil_power2_;
int input_shape_size_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
@@ -106,4 +86,4 @@ class TopKGpuKernel : public GpuKernel {
} // namespace kernel
} // namespace mindspore
#endif // TopKpuKernel
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_TOPK_GPU_KERNEL_H_

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -23,6 +23,10 @@
#define BLOCKSIZE 256
#define MAX_DIMENSION 5
template <typename T, typename S, typename K>
void CalRandomChoiceWithMaskSmall(int input_size, int seedc, int count, K *input, S *output_index, K *output_mask,
cudaStream_t stream);
template <typename T, typename S>
void CalRandomChoiceWithMask(const int &input_size, const int &input_shape_size, const int &d1, const int &d2,
const int &d3, const int &d4, const int &d5, const int &seedc, const int &count,

@@ -0,0 +1,152 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/topk_lib.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/random_choice_with_mask_impl.cuh"
// Kernel started from here
#define L2_RCWM_HELPER(BLOCK, NUM_WARP_Q, NUM_THREAD_Q, IS_DESCEND) \
do { \
L2Rcwm<T, S, K, NUM_WARP_Q, NUM_THREAD_Q, BLOCK, IS_DESCEND> \
<<<1, BLOCK, 0, stream>>>(seedc, input_size, input, output_mask, output_index, k); \
} while (0)
#define LEFT_INSERT_THREAD_QUEUE(_k, _v) \
do { \
if (is_descend ? Cmp<T>::gt(_k, warp_K_top) : Cmp<T>::lt(_k, warp_K_top)) { \
{ \
_Pragma("unroll") for (int i = thread_queue - 1; i > 0; --i) { \
threadK[i] = threadK[i - 1]; \
threadV[i] = threadV[i - 1]; \
} \
} \
threadK[0] = _k; \
threadV[0] = _v; \
++num_vals; \
} \
} while (0)
template <typename T, typename S, typename K, int warp_queue, int thread_queue, int threads_per_block, bool is_descend>
__global__ void L2Rcwm(int seedc, int input_size, const K *input, K *output_mask, S *output_index, int k) {
constexpr int kNumWarps = threads_per_block / kWarpSize;
constexpr T init_K = static_cast<T>(-2.0);
constexpr S init_V = static_cast<S>(0);
__shared__ T shared_K[kNumWarps * warp_queue];
__shared__ S shared_V[kNumWarps * warp_queue];
curandState devState;
curand_init(seedc, threadIdx.x, 0, &devState);
T threadK[thread_queue]; // NOLINT
S threadV[thread_queue]; // NOLINT
T *warp_K;
S *warp_V;
T warp_K_top = init_K;
int k_minus_1 = k - 1;
int num_vals = 0;
int limit = (input_size / kWarpSize) * kWarpSize;
int i = threadIdx.x;
// init begin
_Pragma("unroll") for (int i = 0; i < thread_queue; ++i) {
threadK[i] = init_K;
threadV[i] = init_V;
}
int laneId = GetLaneId();
int warpId = threadIdx.x / kWarpSize; // 0,1,2 or 3
// warp shared memory start address
warp_K = shared_K + warpId * warp_queue;
warp_V = shared_V + warpId * warp_queue;
for (int i = laneId; i < warp_queue; i += kWarpSize) {
warp_K[i] = init_K;
warp_V[i] = init_V;
}
// sync till all threads init done
__syncwarp();
// insert begin
for (; i < limit; i += threads_per_block) {
T rand_num = input[i] ? __uint2float_rn(curand(&devState)) : init_K;
LEFT_INSERT_THREAD_QUEUE(rand_num, i);
// CHECK_AND_MERGE_THREAD_QUEUE() begin
bool needSort = (num_vals == thread_queue);
needSort = __any_sync(0xffffffff, needSort);
if (!needSort) continue;
MergeWarpQueue<T, S, warp_queue, thread_queue, is_descend>(threadK, threadV, warp_K, warp_V);
num_vals = 0;
_Pragma("unroll") for (int i = 0; i < thread_queue; ++i) {
threadK[i] = init_K;
threadV[i] = init_V;
}
warp_K_top = warp_K[k_minus_1];
__syncwarp();
}
if (i < input_size) {
T rand_num = input[i] ? __uint2float_rn(curand(&devState)) : init_K;
LEFT_INSERT_THREAD_QUEUE(rand_num, i);
}
// reduce begin
MergeWarpQueue<T, S, warp_queue, thread_queue, is_descend>(threadK, threadV, warp_K, warp_V);
__syncthreads();
SortBlockWide<kNumWarps, threads_per_block, T, S, warp_queue, is_descend>(shared_K, shared_V);
// ship data from shared memory to output buffer
for (int i = threadIdx.x; i < k; i += blockDim.x) {
output_mask[i] = shared_K[i] > static_cast<T>(-1.0) ? true : false;
output_index[i] = shared_V[i];
}
}
template <typename T, typename S, typename K>
void RCWMScaleK(int seedc, int input_size, K *input, int k, S *output_index, K *output_mask, cudaStream_t stream) {
if (k <= 32) {
// num-threads-of-block, warp-queue-size, thread-queue-size
L2_RCWM_HELPER(256, 32, 2, true);
} else if (k <= 64) {
L2_RCWM_HELPER(256, 64, 3, true);
} else if (k <= 128) {
L2_RCWM_HELPER(256, 128, 3, true);
} else if (k <= 256) {
L2_RCWM_HELPER(256, 256, 4, true);
} else if (k <= 512) {
L2_RCWM_HELPER(256, 512, 8, true);
} else if (k <= 1024) {
L2_RCWM_HELPER(128, 1024, 8, true);
} else if (k <= 2048) {
L2_RCWM_HELPER(64, 2048, 8, true);
}
}
template <typename T, typename S, typename K>
void CalRandomChoiceWithMaskSmall(int input_size, int seedc, int count, K *input, S *output_index, K *output_mask,
cudaStream_t stream) {
RCWMScaleK<T, S, K>(seedc, input_size, input, count, output_index, output_mask, stream);
}
template void CalRandomChoiceWithMaskSmall<float, int, bool>(int input_size, int seedc, int count, bool *input,
int *output_index, bool *output_mask, cudaStream_t stream);

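For intuition, the new L2Rcwm kernel above implements RandomChoiceWithMask for small counts as a top-k selection over random keys: each nonzero input element draws a random key (zero elements keep the init_K sentinel of -2.0), the count largest keys are kept via per-thread and per-warp queues, and the output mask flags which of the kept slots are valid. Below is a minimal host-side NumPy sketch of that idea, for reference only; the function name rcwm_reference, the use of np.random.default_rng, and the seed handling are assumptions and do not mirror the kernel's curand sequence.

import numpy as np

def rcwm_reference(mask, count, seed=0):
    """Illustrative host-side model of the small-count RandomChoiceWithMask path."""
    mask = np.asarray(mask, dtype=bool).ravel()
    rng = np.random.default_rng(seed)
    # Nonzero positions draw a random key; zero positions keep the kernel's
    # init_K sentinel (-2.0), so they can never win the selection.
    keys = np.where(mask, rng.random(mask.size), -2.0)
    picked = np.argsort(-keys)[:count]    # indices of the `count` largest keys
    valid = keys[picked] > -1.0           # same validity test as output_mask
    index = np.where(valid, picked, 0)    # invalid slots are padded with index 0
    return index.astype(np.int32), valid

index, mask_out = rcwm_reference([1, 0, 1, 0, 0, 0, 0, 1], count=4, seed=1)

As in the kernel, when fewer than count elements are nonzero the trailing indices are 0 and the corresponding mask entries are False.
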
@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,19 +14,14 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TOPK_IMPL_CUH_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TOPK_IMPL_CUH_
#include <cuda_runtime.h>
#include "runtime/device/gpu/cuda_common.h"
template <typename T, typename S>
void TopK(const size_t &outer, const size_t &inner, const T *input_addr, const S *k, T *output, S *indices,
T *data_buff, S *index_buff, cudaStream_t stream);
void FastTopK(const int outer, const int inner, const T *input_addr, const S *k, T *output, S *indices, const T initK,
cudaStream_t stream);
template <typename T, typename S>
void BitonicSortByKey(const size_t &outer, const size_t &inner, T *input, S *indices, T *data_buff, S *index_buff,
cudaStream_t stream);
size_t RoundUpPower2(size_t v);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_TOPK_IMPL_CUH_

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -39,17 +39,22 @@ class RandomChoiceWithMaskGpuKernel : public GpuKernel {
T *input = GetDeviceAddress<T>(inputs, 0);
S *output_index = GetDeviceAddress<S>(outputs, 0);
T *output_mask = GetDeviceAddress<T>(outputs, 1);
S *index_buff = GetDeviceAddress<S>(workspaces, 0);
S *mask_buff = GetDeviceAddress<S>(workspaces, 1);
S *rank_buff = GetDeviceAddress<S>(workspaces, 2);
S *Tnum_buff = GetDeviceAddress<S>(workspaces, 3);
S *tmp_buff = GetDeviceAddress<S>(workspaces, 4);
void *States = GetDeviceAddress<void *>(workspaces, 5);
curandState *devStates = reinterpret_cast<curandState *>(States);
CalRandomChoiceWithMask(input_size_, input_shape_size_, input_shape_5D_[0], input_shape_5D_[1], input_shape_5D_[2],
input_shape_5D_[3], input_shape_5D_[4], seedc_, count_, input, output_index, output_mask,
index_buff, mask_buff, rank_buff, Tnum_buff, tmp_buff, devStates,
reinterpret_cast<cudaStream_t>(stream_ptr));
if (count_ > kSmallK || input_shape_size_ > 1) {
S *index_buff = GetDeviceAddress<S>(workspaces, 0);
S *mask_buff = GetDeviceAddress<S>(workspaces, 1);
S *rank_buff = GetDeviceAddress<S>(workspaces, 2);
S *Tnum_buff = GetDeviceAddress<S>(workspaces, 3);
S *tmp_buff = GetDeviceAddress<S>(workspaces, 4);
void *States = GetDeviceAddress<void *>(workspaces, 5);
curandState *devStates = reinterpret_cast<curandState *>(States);
CalRandomChoiceWithMask(input_size_, input_shape_size_, input_shape_5D_[0], input_shape_5D_[1],
input_shape_5D_[2], input_shape_5D_[3], input_shape_5D_[4], seedc_, count_, input,
output_index, output_mask, index_buff, mask_buff, rank_buff, Tnum_buff, tmp_buff,
devStates, reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
CalRandomChoiceWithMaskSmall<float, S, T>(input_size_, seedc_, count_, input, output_index, output_mask,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
return true;
}
@@ -94,7 +99,9 @@ class RandomChoiceWithMaskGpuKernel : public GpuKernel {
}
count_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "count"));
// upper ceiling for input for ceil_power2
ceil_power2_ = RcwmRoundUpPower2(input_size_);
if (count_ > kSmallK || input_shape_size_ > 1) {
ceil_power2_ = RcwmRoundUpPower2(input_size_);
}
InitSizeLists();
return true;
}
@@ -104,16 +111,19 @@ class RandomChoiceWithMaskGpuKernel : public GpuKernel {
input_size_list_.push_back(input_size_ * sizeof(T));
output_size_list_.push_back(count_ * input_shape_size_ * sizeof(S));
output_size_list_.push_back(count_ * sizeof(T));
workspace_size_list_.push_back(input_size_ * input_shape_size_ * sizeof(S));
workspace_size_list_.push_back(ceil_power2_ * sizeof(S));
workspace_size_list_.push_back(ceil_power2_ * sizeof(S));
int blocknum = std::ceil(static_cast<float>(ceil_power2_) / BLOCKSIZE);
workspace_size_list_.push_back(blocknum * sizeof(S));
workspace_size_list_.push_back(ceil_power2_ * sizeof(S));
workspace_size_list_.push_back(ceil_power2_ * sizeof(curandState));
if (count_ > kSmallK || input_shape_size_ > 1) {
workspace_size_list_.push_back(input_size_ * input_shape_size_ * sizeof(S));
workspace_size_list_.push_back(ceil_power2_ * sizeof(S));
workspace_size_list_.push_back(ceil_power2_ * sizeof(S));
int blocknum = std::ceil(static_cast<float>(ceil_power2_) / BLOCKSIZE);
workspace_size_list_.push_back(blocknum * sizeof(S));
workspace_size_list_.push_back(ceil_power2_ * sizeof(S));
workspace_size_list_.push_back(ceil_power2_ * sizeof(curandState));
}
}
private:
const int kSmallK = 2048;
int input_shape_size_;
int seedc_;
int input_size_;

@@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2020-21 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@ import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
class RCWM_count_in(nn.Cell):
def __init__(self):
super(RCWM_count_in, self).__init__()
@@ -29,6 +30,7 @@ class RCWM_count_in(nn.Cell):
def construct(self, x):
return self.RCWM_count_in(x)
class RCWM_count_out(nn.Cell):
def __init__(self):
super(RCWM_count_out, self).__init__()
@@ -37,6 +39,7 @@ class RCWM_count_out(nn.Cell):
def construct(self, x):
return self.RCWM_count_out(x)
class RCWM_3D(nn.Cell):
def __init__(self):
super(RCWM_3D, self).__init__()
@@ -45,6 +48,16 @@ class RCWM_3D(nn.Cell):
def construct(self, x):
return self.RCWM_3D(x)
class RCWM_1D(nn.Cell):
def __init__(self):
super(RCWM_1D, self).__init__()
self.RCWM_1D = P.RandomChoiceWithMask(count=10, seed=9)
def construct(self, x):
return self.RCWM_1D(x)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@@ -58,12 +71,14 @@ def test_RCWM_3D():
assert output1.shape == expect1
assert output2.shape == expect2
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_RCWM_count_out():
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
input_tensor = Tensor(np.array([[1, 0, 1, 0], [0, 0, 0, 1], [1, 1, 1, 1], [0, 0, 0, 1]]).astype(np.bool))
input_tensor = Tensor(np.array([[1, 0, 1, 0], [0, 0, 0, 1], [1, 1, 1, 1],
[0, 0, 0, 1]]).astype(np.bool))
expect1 = (10, 2)
expect2 = (10,)
rcwm = RCWM_count_out()
@@ -71,15 +86,36 @@ def test_RCWM_count_out():
assert output1.shape == expect1
assert output2.shape == expect2
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_RCWM_count_in():
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
input_tensor = Tensor(np.array([[1, 0, 1, 0], [0, 0, 0, 1], [1, 1, 1, 1], [0, 0, 0, 1]]).astype(np.bool))
input_tensor = Tensor(np.array([[1, 0, 1, 0], [0, 0, 0, 1], [1, 1, 1, 1],
[0, 0, 0, 1]]).astype(np.bool))
expect1 = (4, 2)
expect2 = (4,)
rcwm = RCWM_count_in()
output1, output2 = rcwm(input_tensor)
assert output1.shape == expect1
assert output2.shape == expect2
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_RCWM_1D():
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
input_tensor = Tensor(
np.array([1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1]).astype(np.bool))
expect_index = np.array([[11], [9], [2], [15], [10], [7],
[8], [0], [0], [0]]).astype(np.int32)
expect_mask = np.array(
[True, True, True, True, True, True, True, True, False, False])
rcwm = RCWM_1D()
output1, output2 = rcwm(input_tensor)
print(output1.asnumpy())
print(output2)
assert np.array_equal(output1.asnumpy(), expect_index)
assert np.array_equal(output2.asnumpy(), expect_mask)

@@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2020-21 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ from mindspore.ops import operations as P
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_topk():
def test_topk_small_2d():
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
x_np = np.random.rand(3, 4).astype(np.float32)
@@ -36,7 +36,20 @@ def test_topk():
x_np = np.random.rand(3, 4).astype(np.float32)
k = 4
ms_output = P.TopK(False)(Tensor(x_np), k)
assert np.allclose(ms_output[0].asnumpy(), x_np)
np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k]
assert np.allclose(ms_output[0].asnumpy(), np_output)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_topk_3d():
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
x_np = np.random.rand(2, 256, 128).astype(np.float32)
k = 4
ms_output = P.TopK(True)(Tensor(x_np), k)
np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k]
assert np.allclose(ms_output[0].asnumpy(), np_output)
x_np = np.random.rand(2, 3, 4).astype(np.float32)
k = 2
@@ -44,6 +57,12 @@ def test_topk():
np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k]
assert np.allclose(ms_output[0].asnumpy(), np_output)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_topk_big_2d():
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
x_np = np.random.rand(512, 1024).astype(np.float32)
k = 512
ms_output = P.TopK(True)(Tensor(x_np), k)
@@ -51,32 +70,69 @@ def test_topk():
assert np.allclose(ms_output[0].asnumpy(), np_output)
# sorted elements num greater than max thread per block
x_np = np.random.rand(512, 2048).astype(np.float32)
x_np = np.random.rand(128, 2048).astype(np.float32)
k = 1
ms_output = P.TopK(True)(Tensor(x_np), k)
np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k]
assert np.allclose(ms_output[0].asnumpy(), np_output)
x_np = np.random.rand(512, 2048).astype(np.float32)
x_np = np.random.rand(32, 2048).astype(np.float32)
k = 2048
ms_output = P.TopK(True)(Tensor(x_np), k)
np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k]
assert np.allclose(ms_output[0].asnumpy(), np_output)
# sorted elements num greater than max share memory per block
x_np = np.random.rand(512, 40960).astype(np.float32)
x_np = np.random.rand(16, 40960).astype(np.float32)
k = 1
ms_output = P.TopK(True)(Tensor(x_np), k)
np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k]
assert np.allclose(ms_output[0].asnumpy(), np_output)
x_np = np.random.rand(512, 40960).astype(np.float32)
k = 40960
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_topk_big_k():
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
x_np = np.random.rand(8, 40960).astype(np.float32)
k = 4096
ms_output = P.TopK(True)(Tensor(x_np), k)
np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k]
assert np.allclose(ms_output[0].asnumpy(), np_output)
x_np = np.random.rand(512, 40960).astype(np.float32)
k = 40960
ms_output = P.TopK(False)(Tensor(x_np), k)
assert np.allclose(ms_output[0].asnumpy(), x_np)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_topk_1d():
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
x_np = np.random.rand(12).astype(np.float32)
k = 4
ms_output = P.TopK(True)(Tensor(x_np), k)
np_output = np.sort(x_np)[::-1][0:k]
assert np.allclose(ms_output[0].asnumpy(), np_output)
x_np = np.random.rand(1200).astype(np.float32)
k = 256
ms_output = P.TopK(True)(Tensor(x_np), k)
np_output = np.sort(x_np)[::-1][0:k]
assert np.allclose(ms_output[0].asnumpy(), np_output)
x_np = np.random.rand(250000).astype(np.float32)
k = 2000
ms_output = P.TopK(True)(Tensor(x_np), k)
np_output = np.sort(x_np)[::-1][0:k]
assert np.allclose(ms_output[0].asnumpy(), np_output)
x_np = np.random.rand(10240).astype(np.float32)
k = 4096
ms_output = P.TopK(True)(Tensor(x_np), k)
np_output = np.sort(x_np)[::-1][0:k]
assert np.allclose(ms_output[0].asnumpy(), np_output)
x_np = np.random.rand(720).astype(np.float32)
k = 720
ms_output = P.TopK(True)(Tensor(x_np), k)
np_output = np.sort(x_np)[::-1][0:k]
assert np.allclose(ms_output[0].asnumpy()[:k], np_output)
