!9140 [MS][GPU][CUDA][DynamicShape] - New GPU kernel -> UnsortedSegmentMin + DynamicShape support changes to API + inferImpl func (+SegMax ST correction)
From: @danishnxt (pull/9140/MERGE)
commit dc62360eed
@@ -0,0 +1,56 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "backend/kernel_compiler/gpu/arrays/unsorted_segment_min_gpu_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(
  UnsortedSegmentMin,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
  UnsortedSegmentMinGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  UnsortedSegmentMin,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
  UnsortedSegmentMinGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  UnsortedSegmentMin,
  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  UnsortedSegmentMinGpuKernel, int)
// Dynamic Mode
MS_REG_GPU_KERNEL_ONE(UnsortedSegmentMin,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt64)
                        .AddOutputAttr(kNumberTypeFloat32),
                      UnsortedSegmentMinGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(UnsortedSegmentMin,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat16)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt64)
                        .AddOutputAttr(kNumberTypeFloat16),
                      UnsortedSegmentMinGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(UnsortedSegmentMin,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt64)
                        .AddOutputAttr(kNumberTypeInt32),
                      UnsortedSegmentMinGpuKernel, int)
}  // namespace kernel
}  // namespace mindspore
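In the "Dynamic Mode" registrations above, num_segments additionally arrives as a third (int64) tensor input; as the header below shows, Launch still reads only the data and segment_ids buffers and takes num_segments from the inferred output shape. For reviewers unfamiliar with the op itself, here is a minimal CPU sketch of its semantics (illustration only, not part of the PR): each output row holds the element-wise minimum over input rows whose segment id matches, and segments with no matching rows keep the type's maximum value.

```cpp
// Minimal CPU reference for UnsortedSegmentMin semantics (illustration only).
// input is viewed as [outer_size, inner_size]; output as [num_segments, inner_size].
#include <algorithm>
#include <limits>
#include <vector>

template <typename T>
std::vector<T> UnsortedSegmentMinRef(const std::vector<T> &input, const std::vector<int> &segment_ids,
                                     int num_segments, size_t outer_size, size_t inner_size) {
  std::vector<T> output(num_segments * inner_size, std::numeric_limits<T>::max());
  for (size_t i = 0; i < outer_size; ++i) {
    const int sid = segment_ids[i];
    if (sid < 0 || sid >= num_segments) continue;  // ids outside [0, num_segments) contribute nothing
    for (size_t j = 0; j < inner_size; ++j) {
      output[sid * inner_size + j] = std::min(output[sid * inner_size + j], input[i * inner_size + j]);
    }
  }
  return output;
}
```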
@@ -0,0 +1,131 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_UNSORTED_SEGMENT_MIN_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_UNSORTED_SEGMENT_MIN_H_

#include <vector>
#include <limits>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/unsorted_segment_min.cuh"

namespace mindspore {
namespace kernel {
template <typename T>
class UnsortedSegmentMinGpuKernel : public GpuKernel {
 public:
  UnsortedSegmentMinGpuKernel() { ResetResource(); }
  ~UnsortedSegmentMinGpuKernel() override = default;

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    if (is_null_input_) {
      return true;
    }
    T *input_addr = GetDeviceAddress<T>(inputs, 0);
    int *indices_addr = GetDeviceAddress<int>(inputs, 1);
    T *output_addr = GetDeviceAddress<T>(outputs, 0);
    CalUnsortedSegmentMin(input_addr, indices_addr, num_segments_, outer_size_, inner_size_, output_addr,
                          reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shapes);
    if (is_null_input_) {
      MS_LOG(WARNING) << "UnsortedSegmentMin input is null";
      InitSizeLists();
      return true;
    }
    auto segment_ids_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
    auto output_shapes = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);

    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num == 3) {
      MS_LOG(INFO) << "UnsortedSegmentMin Kernel Input count is 3 - dynamic mode";
    } else {
      MS_LOG(INFO) << "UnsortedSegmentMin Kernel Input count is 2";
    }

    num_segments_ = output_shapes[0];
    input_size_ = 1;
    for (size_t i = 0; i < input_shapes.size(); i++) {
      input_size_ *= input_shapes[i];
    }

    segment_ids_size_ = 1;
    for (size_t i = 0; i < segment_ids_shapes.size(); i++) {
      segment_ids_size_ *= segment_ids_shapes[i];
    }

    output_size_ = 1;
    for (size_t i = 0; i < output_shapes.size(); i++) {
      output_size_ *= output_shapes[i];
    }

    outer_size_ = input_shapes[0];
    inner_size_ = 1;
    for (size_t i = 1; i < input_shapes.size(); i++) {
      inner_size_ *= input_shapes[i];
    }

    InitSizeLists();
    return true;
  }

  void ResetResource() noexcept override {
    num_segments_ = 1;
    inner_size_ = 1;
    outer_size_ = 1;
    input_size_ = 1;
    segment_ids_size_ = 1;
    output_size_ = 1;
    is_null_input_ = false;
    input_size_list_.clear();
    output_size_list_.clear();
    workspace_size_list_.clear();
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(input_size_ * sizeof(T));
    input_size_list_.push_back(segment_ids_size_ * sizeof(int));
    output_size_list_.push_back(output_size_ * sizeof(T));
  }

 private:
  int num_segments_;
  size_t inner_size_;
  size_t outer_size_;
  size_t input_size_;
  size_t segment_ids_size_;
  size_t output_size_;
  bool is_null_input_;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};
}  // namespace kernel
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_UNSORTED_SEGMENT_MIN_H_
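As a quick sanity check of the bookkeeping in Init() above, the same arithmetic rehearsed on a concrete case (standalone sketch; the shapes are made up for illustration):

```cpp
// Rehearses the size arithmetic of UnsortedSegmentMinGpuKernel::Init() for a
// float input of shape [8, 4, 5], segment_ids of shape [8], num_segments = 3.
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<size_t> input_shape = {8, 4, 5};
  std::vector<size_t> ids_shape = {8};
  size_t num_segments = 3;  // output shape is [3, 4, 5]

  size_t outer = input_shape[0];  // 8: rows indexed by segment_ids
  size_t inner = 1;
  for (size_t i = 1; i < input_shape.size(); ++i) inner *= input_shape[i];  // 4 * 5 = 20

  size_t input_bytes = outer * inner * sizeof(float);          // 8 * 20 * 4 = 640
  size_t ids_bytes = ids_shape[0] * sizeof(int);               // 8 * 4 = 32
  size_t output_bytes = num_segments * inner * sizeof(float);  // 3 * 20 * 4 = 240
  std::cout << input_bytes << " " << ids_bytes << " " << output_bytes << "\n";  // 640 32 240
}
```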
@@ -0,0 +1,79 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "backend/kernel_compiler/gpu/cuda_impl/unsorted_segment_min.cuh"
#include <limits>

template <typename T>
__device__ __forceinline__ void max_val_init(T *init_val) {
  *init_val = std::numeric_limits<T>::max();
}
// Handle fp16 separately: std::numeric_limits is not specialized for half,
// so assign the largest finite fp16 value directly.
template <>
__device__ __forceinline__ void max_val_init(half *init_val) {
  *init_val = __int2half_rd(65504);  // 65504 is the max finite value for half
}

template <typename T>
__global__ void UnsortedSegmentMin(const T *input, const int *segment_ids, const int num_segments, size_t outer_size,
                                   size_t inner_size, T init_K, T *output) {
  max_val_init(&init_K);
  for (int t_idx = blockIdx.x * blockDim.x + threadIdx.x; t_idx < KWARPSIZE * num_segments * inner_size;
       t_idx += blockDim.x * gridDim.x) {
    int segment_id = t_idx / KWARPSIZE / inner_size;
    int inner_id = t_idx / KWARPSIZE % inner_size;
    int lane_id = threadIdx.x % KWARPSIZE;
    T threadK = init_K;

    for (int i = lane_id; i < outer_size; i += KWARPSIZE) {
      if (segment_ids[i] != segment_id) continue;
      T other_K = input[i * inner_size + inner_id];
      if (threadK > other_K) {
        threadK = other_K;
      }
    }
    __syncwarp();
    for (int offset = KWARPSIZE / 2; offset > 0; offset /= 2) {
      T other_K = __shfl_down_sync(0xffffffff, threadK, offset);
      if (threadK > other_K) {
        threadK = other_K;
      }
    }
    __syncwarp();

    if (lane_id == 0) {
      output[segment_id * inner_size + inner_id] = threadK;
    }
    __syncthreads();
  }
}

template <typename T>
void CalUnsortedSegmentMin(const T *input, const int *segment_ids, const int num_segments, size_t outer_size,
                           size_t inner_size, T *output, cudaStream_t stream) {
  int size = (inner_size * KWARPSIZE * num_segments);
  T init_K = std::numeric_limits<T>::lowest();  // placeholder only - max_val_init overwrites it on device
  UnsortedSegmentMin<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, segment_ids, num_segments, outer_size,
                                                                   inner_size, init_K, output);
  return;
}

template void CalUnsortedSegmentMin<float>(const float *input, const int *segment_ids, const int num_segments,
                                           size_t outer_size, size_t inner_size, float *output, cudaStream_t stream);
template void CalUnsortedSegmentMin<half>(const half *input, const int *segment_ids, const int num_segments,
                                          size_t outer_size, size_t inner_size, half *output, cudaStream_t stream);
template void CalUnsortedSegmentMin<int>(const int *input, const int *segment_ids, const int num_segments,
                                         size_t outer_size, size_t inner_size, int *output, cudaStream_t stream);
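The reduction loop above is the standard shuffle-down warp reduction: one warp is assigned per (segment, inner) pair, each lane scans a strided slice of the rows, and the 32 lane-local minima are combined in log2(32) = 5 shuffle steps. Stripped of the segment logic, the pattern looks like this (standalone sketch for exposition, not PR code):

```cpp
// Standalone sketch of the warp-level min reduction used above (illustration only).
// Launch with exactly one warp: WarpMinDemo<<<1, 32>>>(d_in, d_out). Each of the
// 32 lanes starts with one value; after the shuffle steps, lane 0 holds the minimum.
__global__ void WarpMinDemo(const float *in, float *out) {
  float v = in[threadIdx.x];
  for (int offset = 16; offset > 0; offset /= 2) {
    float other = __shfl_down_sync(0xffffffff, v, offset);  // read v from lane (id + offset)
    if (other < v) v = other;
  }
  if (threadIdx.x == 0) *out = v;
}
```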
@@ -0,0 +1,28 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORTED_SEGMENT_MIN_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORTED_SEGMENT_MIN_H_

#include <cuda_runtime.h>
#include "runtime/device/gpu/cuda_common.h"

// Warp size used to sync data across threads
#define KWARPSIZE 32
template <typename T>
void CalUnsortedSegmentMin(const T *input, const int *segment_ids, const int num_segments, size_t outer_size,
                           size_t inner_size, T *output, cudaStream_t stream);
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNSORTED_SEGMENT_MIN_H_
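For completeness, a host-side call into this entry point would look roughly as follows (hypothetical driver code for illustration; in MindSpore the call is made from the kernel's Launch shown earlier):

```cpp
// Hypothetical host-side driver for CalUnsortedSegmentMin (illustration only).
// Assumes d_input ([outer_size, inner_size]), d_segment_ids ([outer_size]) and
// d_output ([num_segments, inner_size]) are already allocated on the device.
#include <cuda_runtime.h>

void RunUnsortedSegmentMin(const float *d_input, const int *d_segment_ids, float *d_output,
                           int num_segments, size_t outer_size, size_t inner_size) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  CalUnsortedSegmentMin(d_input, d_segment_ids, num_segments, outer_size, inner_size, d_output, stream);
  cudaStreamSynchronize(stream);  // wait for the kernel before reading d_output back
  cudaStreamDestroy(stream);
}
```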
File diff suppressed because it is too large