Fix CI Alarms

pull/9329/head
Jonathan Yan 4 years ago
parent 6b9e402790
commit d7bb5cd058

@@ -22,10 +22,12 @@ MS_REG_GPU_KERNEL_TWO(
GatherV2,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
GatherV2GpuFwdKernel, float, int)
MS_REG_GPU_KERNEL_TWO(
GatherV2,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
GatherV2GpuFwdKernel, half, int)
MS_REG_GPU_KERNEL_TWO(GatherV2,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
@@ -33,6 +35,7 @@ MS_REG_GPU_KERNEL_TWO(GatherV2,
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeFloat32),
GatherV2GpuFwdKernel, float, int)
MS_REG_GPU_KERNEL_TWO(GatherV2,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
@@ -40,14 +43,17 @@ MS_REG_GPU_KERNEL_TWO(GatherV2,
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeFloat16),
GatherV2GpuFwdKernel, half, int)
MS_REG_GPU_KERNEL_TWO(
SparseGatherV2,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
GatherV2GpuFwdKernel, float, int)
MS_REG_GPU_KERNEL_TWO(
SparseGatherV2,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
GatherV2GpuFwdKernel, half, int)
MS_REG_GPU_KERNEL_TWO(SparseGatherV2,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
@@ -55,6 +61,7 @@ MS_REG_GPU_KERNEL_TWO(SparseGatherV2,
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeFloat32),
GatherV2GpuFwdKernel, float, int)
MS_REG_GPU_KERNEL_TWO(SparseGatherV2,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)

@@ -18,7 +18,6 @@
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(
UnsortedSegmentSum,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),

@@ -85,10 +85,7 @@ bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) {
void DatasetIteratorKernel::InitSizeLists() { return; }
bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream) {
void *addr = nullptr;
size_t len = 0;
bool DatasetIteratorKernel::ReadDevice(void **addr, size_t *len) {
uint64_t start_time_stamp = 0;
uint32_t queue_size = 0;
@@ -98,7 +95,7 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
start_time_stamp = profiling_op_->GetTimeStamp();
queue_size = GpuBufferMgr::GetInstance().Size(handle_);
}
auto ret = GpuBufferMgr::GetInstance().Front(handle_, &addr, &len);
auto ret = GpuBufferMgr::GetInstance().Front(handle_, addr, len);
if (ret == device::SUCCESS) {
if (profiling_enable_) {
uint64_t end_time_stamp = profiling_op_->GetTimeStamp();
@@ -129,7 +126,16 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
MS_LOG(ERROR) << "Get data failed, errcode " << ret;
return false;
}
return true;
}
bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream) {
void *addr = nullptr;
size_t len = 0;
if (!ReadDevice(&addr, &len)) {
return false;
}
if (total_bytes_ != len) {
MS_LOG(ERROR) << "Dataset front error. read: " << len << ", expect: " << total_bytes_ << ", ";
return false;

@@ -43,6 +43,7 @@ class DatasetIteratorKernel : public GpuKernel {
void InitSizeLists() override;
private:
bool ReadDevice(void **addr, size_t *len);
std::string queue_name_;
unsigned int handle_;
size_t total_bytes_;
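The two hunks above pull the blocking queue read out of Launch into a bool ReadDevice(void **addr, size_t *len) helper, the usual fix when a CI linter flags a function for length or complexity. A minimal, self-contained sketch of the same out-parameter idiom follows; the names and the fake buffer are illustrative stand-ins, not MindSpore APIs.

#include <cstddef>
#include <cstdio>

static char g_buffer[16] = "payload";  // stands in for the GPU data queue

// Mirrors ReadDevice: success/failure via the return value, data via out-params.
bool ReadDevice(void **addr, std::size_t *len) {
  if (addr == nullptr || len == nullptr) return false;
  *addr = g_buffer;
  *len = sizeof(g_buffer);
  return true;
}

bool Launch() {
  void *addr = nullptr;
  std::size_t len = 0;
  if (!ReadDevice(&addr, &len)) return false;  // early exit keeps Launch short
  std::printf("read %zu bytes\n", len);
  return true;
}

int main() { return Launch() ? 0 : 1; }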

@@ -107,10 +107,23 @@ bool GpuKernelFactory::ReducePrecision(
return GpuKernelFactory::SearchRegistered(kernel_name, builder->Build());
}
void GpuKernelFactory::CheckSM(const KernelBuildInfo *kernel_info, const size_t &input_index) {
const int major_sm = GET_MAJOR_SM;
const bool check_sm = mindspore::device::gpu::CudaCommon::GetInstance().check_sm();
if (check_sm && major_sm < RECOMMEND_SM && kernel_info->GetInputDeviceType(input_index) == kNumberTypeFloat16) {
if (major_sm < MINIUM_SM) {
MS_LOG(EXCEPTION) << "Half precision ops can be used on Devices which computing capacity is >= " << MINIUM_SM
<< ", but the current device's computing capacity is " << major_sm;
}
MS_LOG(WARNING) << "It is recommended to use devices with a computing capacity >= " << RECOMMEND_SM
<< ", but the current device's computing capacity is " << major_sm;
mindspore::device::gpu::CudaCommon::GetInstance().set_check_sm(false);
}
}
std::pair<bool, size_t> GpuKernelFactory::GpuKernelAttrCheck(const std::string &kernel_name,
const KernelBuildInfo *kernel_info) {
auto iter = map_kernel_name_to_creater_.find(kernel_name);
const int marjor_sm = GET_MAJOR_SM;
if (map_kernel_name_to_creater_.end() == iter) {
MS_LOG(INFO) << "Not registered GPU kernel: op[" << kernel_name << "]!";
return std::make_pair(false, 0);
@@ -127,16 +140,7 @@ std::pair<bool, size_t> GpuKernelFactory::GpuKernelAttrCheck(const std::string &
auto attr_size = (&(iter->second))->at(attr_index).first.GetInputSize();
// data type matching check of all input parameters of kernel
for (size_t input_index = 0; input_index < kernel_info->GetInputNum(); input_index++) {
const bool check_sm = mindspore::device::gpu::CudaCommon::GetInstance().check_sm();
if (check_sm && marjor_sm < RECOMMEND_SM && kernel_info->GetInputDeviceType(input_index) == kNumberTypeFloat16) {
if (marjor_sm < MINIUM_SM) {
MS_LOG(EXCEPTION) << "Half precision ops can be used on Devices which computing capacity is >= " << MINIUM_SM
<< ", but the current device's computing capacity is " << marjor_sm;
}
MS_LOG(WARNING) << "It is recommended to use devices with a computing capacity >= " << RECOMMEND_SM
<< ", but the current device's computing capacity is " << marjor_sm;
mindspore::device::gpu::CudaCommon::GetInstance().set_check_sm(false);
}
GpuKernelFactory::CheckSM(kernel_info, input_index);
if (kernel_info->GetInputDeviceType(input_index) !=
(iter->second)[attr_index].first.GetInputAttr(input_index % attr_size).first) {
flag = false;

@@ -57,6 +57,7 @@ class GpuKernelFactory {
GpuKernelFactory &operator=(const GpuKernelFactory &);
std::pair<bool, size_t> GpuKernelAttrCheck(const std::string &kernel_name, const KernelBuildInfo *kernel_info);
void CheckSM(const KernelBuildInfo *kernel_info, const size_t &input_index);
bool CheckIOParam(const std::string &kernel_name, const KernelBuildInfo *kernel_info,
std::vector<std::pair<KernelAttr, GpuKernelCreater>> *iter_second, size_t attr_index);
// map to maintain kernel and creater, KernelAttr object and creater must be registered as a pair.
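These hunks deduplicate the per-input compute-capability check by extracting CheckSM (and fix the old marjor_sm spelling along the way). A rough stand-alone sketch of the same warn-once guard is below; the constants and the static flag are assumptions standing in for MINIUM_SM, RECOMMEND_SM, and CudaCommon's check_sm state.

#include <iostream>
#include <stdexcept>

constexpr int kMinimumSm = 6;    // illustrative floor, like MINIUM_SM
constexpr int kRecommendSm = 7;  // illustrative target, like RECOMMEND_SM

// Throw below the hard floor, warn once below the recommendation, then
// silence further warnings (like set_check_sm(false)).
void CheckSM(int major_sm, bool fp16_input) {
  static bool check_enabled = true;
  if (!check_enabled || !fp16_input || major_sm >= kRecommendSm) return;
  if (major_sm < kMinimumSm) {
    throw std::runtime_error("half precision needs a newer compute capability");
  }
  std::cerr << "recommended compute capability >= " << kRecommendSm
            << ", current device is " << major_sm << std::endl;
  check_enabled = false;  // warn only once per process
}

int main() {
  CheckSM(6, true);  // warns
  CheckSM(6, true);  // silent on the second call
}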

@@ -159,8 +159,8 @@ class UpdateThorGradientGpuKernel : public GpuKernel {
size_t output_size = gradient_size.ori_h * gradient_size.ori_w * unit_size;
output_size_list_.push_back(output_size);
size_t workspace_size_ = 0;
workspace_size_ = gradient_size.w * gradient_size.h * gradient_size.batch_w * gradient_size.batch_h * unit_size;
size_t workspace_size_ =
gradient_size.w * gradient_size.h * gradient_size.batch_w * gradient_size.batch_h * unit_size;
workspace_size_list_.push_back(workspace_size_);
if (gradient_size.need_convert) {

@@ -56,51 +56,24 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);
switch (nccl_kernel_type_) {
case NCCL_ALL_REDUCE: {
auto all_reduce_funcptr =
reinterpret_cast<AllReduce>(dlsym(const_cast<void *>(collective_handle_), "AllReduce"));
MS_EXCEPTION_IF_NULL(all_reduce_funcptr);
CHECK_NCCL_RET_WITH_EXCEPT((*all_reduce_funcptr)(input_addr, output_addr, output_size_ / sizeof(T),
nccl_data_type_, nccl_reduce_type_, stream, group_name_),
"ncclAllReduce failed");
LaunchAllReduce(inputs, outputs, stream_ptr);
break;
}
case NCCL_ALL_GATHER: {
auto all_gather_funcptr =
reinterpret_cast<AllGather>(dlsym(const_cast<void *>(collective_handle_), "AllGather"));
MS_EXCEPTION_IF_NULL(all_gather_funcptr);
CHECK_NCCL_RET_WITH_EXCEPT(
(*all_gather_funcptr)(input_addr, output_addr, input_size_ / sizeof(T), nccl_data_type_, stream, group_name_),
"ncclAllGather failed");
LaunchAllGather(inputs, outputs, stream_ptr);
break;
}
case NCCL_REDUCE_SCATTER: {
auto reduce_scatter_funcptr =
reinterpret_cast<ReduceScatter>(dlsym(const_cast<void *>(collective_handle_), "ReduceScatter"));
MS_EXCEPTION_IF_NULL(reduce_scatter_funcptr);
CHECK_NCCL_RET_WITH_EXCEPT((*reduce_scatter_funcptr)(input_addr, output_addr, output_size_ / sizeof(T),
nccl_data_type_, nccl_reduce_type_, stream, group_name_),
"ncclReduceScatter failed");
LaunchReduceScatter(inputs, outputs, stream_ptr);
break;
}
case NCCL_BROADCAST: {
auto broadcast_funcptr =
reinterpret_cast<Broadcast>(dlsym(const_cast<void *>(collective_handle_), "Broadcast"));
MS_EXCEPTION_IF_NULL(broadcast_funcptr);
for (int i = 0; i < SizeToInt(input_size_list_.size()); ++i) {
input_addr = GetDeviceAddress<T>(inputs, i);
output_addr = GetDeviceAddress<T>(outputs, i);
CHECK_NCCL_RET_WITH_EXCEPT((*broadcast_funcptr)(input_addr, output_addr, output_size_list_[i] / sizeof(T),
nccl_data_type_, root_, stream, group_name_),
"ncclBroadcast failed");
}
LaunchBroadcast(inputs, outputs, stream_ptr);
break;
}
default: {
@@ -153,6 +126,59 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
void InitSizeLists() override { return; }
private:
void LaunchAllReduce(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs,
void *stream_ptr) {
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);
auto all_reduce_funcptr = reinterpret_cast<AllReduce>(dlsym(const_cast<void *>(collective_handle_), "AllReduce"));
MS_EXCEPTION_IF_NULL(all_reduce_funcptr);
CHECK_NCCL_RET_WITH_EXCEPT((*all_reduce_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), nccl_data_type_,
nccl_reduce_type_, stream, group_name_),
"ncclAllReduce failed");
}
void LaunchAllGather(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs,
void *stream_ptr) {
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);
auto all_gather_funcptr = reinterpret_cast<AllGather>(dlsym(const_cast<void *>(collective_handle_), "AllGather"));
MS_EXCEPTION_IF_NULL(all_gather_funcptr);
CHECK_NCCL_RET_WITH_EXCEPT(
(*all_gather_funcptr)(input_addr, output_addr, input_size_ / sizeof(T), nccl_data_type_, stream, group_name_),
"ncclAllGather failed");
}
void LaunchReduceScatter(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs,
void *stream_ptr) {
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);
auto reduce_scatter_funcptr =
reinterpret_cast<ReduceScatter>(dlsym(const_cast<void *>(collective_handle_), "ReduceScatter"));
MS_EXCEPTION_IF_NULL(reduce_scatter_funcptr);
CHECK_NCCL_RET_WITH_EXCEPT((*reduce_scatter_funcptr)(input_addr, output_addr, output_size_ / sizeof(T),
nccl_data_type_, nccl_reduce_type_, stream, group_name_),
"ncclReduceScatter failed");
}
void LaunchBroadcast(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs,
void *stream_ptr) {
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);
auto broadcast_funcptr = reinterpret_cast<Broadcast>(dlsym(const_cast<void *>(collective_handle_), "Broadcast"));
MS_EXCEPTION_IF_NULL(broadcast_funcptr);
for (int i = 0; i < SizeToInt(input_size_list_.size()); ++i) {
input_addr = GetDeviceAddress<T>(inputs, i);
output_addr = GetDeviceAddress<T>(outputs, i);
CHECK_NCCL_RET_WITH_EXCEPT((*broadcast_funcptr)(input_addr, output_addr, output_size_list_[i] / sizeof(T),
nccl_data_type_, root_, stream, group_name_),
"ncclBroadcast failed");
}
}
void InferCommType(const CNodePtr &kernel_node) {
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
auto iter = kNcclTypeMap.find(kernel_name);
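After this change the Launch switch only dispatches; each collective moved into its own LaunchXxx helper that resolves the NCCL symbol with dlsym and invokes it. A compact sketch of the dispatcher-plus-helpers shape, with a toy enum in place of the real NCCL plumbing:

#include <cstdio>

enum CommType { kAllReduce, kAllGather, kReduceScatter, kBroadcast };

class Collective {
 public:
  bool Launch(CommType type) {
    switch (type) {  // Launch stays a thin, lint-friendly dispatcher
      case kAllReduce:     LaunchAllReduce();     break;
      case kAllGather:     LaunchAllGather();     break;
      case kReduceScatter: LaunchReduceScatter(); break;
      case kBroadcast:     LaunchBroadcast();     break;
      default: return false;
    }
    return true;
  }

 private:
  // Each real helper would look up its NCCL entry point (dlsym) and call it.
  void LaunchAllReduce()     { std::puts("all-reduce"); }
  void LaunchAllGather()     { std::puts("all-gather"); }
  void LaunchReduceScatter() { std::puts("reduce-scatter"); }
  void LaunchBroadcast()     { std::puts("broadcast"); }
};

int main() { Collective c; return c.Launch(kAllReduce) ? 0 : 1; }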

@@ -44,25 +44,76 @@ class CtcLossGpuKernel : public GpuKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
LaunchInit(inputs, workspace, outputs);
LaunchFirstHalf(inputs, workspace, outputs, stream_ptr);
LaunchSecondHalf(inputs, workspace, outputs, stream_ptr);
return true;
}
bool Init(const CNodePtr &kernel_node) override {
InitResource();
auto probs_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
if (probs_shape.size() != 3) {
MS_LOG(EXCEPTION) << "probs dims: " << probs_shape.size() << " not support.";
}
probs_dims_[0] = probs_shape[0];
probs_dims_[1] = probs_shape[1];
probs_dims_[2] = probs_shape[2];
auto indice_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
auto labels_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
if (labels_dims.size() != 1) {
MS_LOG(EXCEPTION) << "labels dims: " << labels_dims.size() << " not support.";
}
if (indice_dims.size() != 2) {
MS_LOG(EXCEPTION) << "labels indice dims: " << indice_dims.size() << " not support.";
}
label_size_ = sizeof(int);
for (auto i : labels_dims) {
label_size_ *= i;
}
label_indice_size_ = sizeof(int64_t);
for (auto i : indice_dims) {
label_indice_size_ *= i;
}
auto squence_length_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
squence_lengths_size_ = squence_length_dims[0] * sizeof(int);
preprocess_collapse_repeated_ = GetAttr<bool>(kernel_node, "preprocess_collapse_repeated");
ctc_merge_repeated_ = GetAttr<bool>(kernel_node, "ctc_merge_repeated");
ignore_longer_outputs_than_inputs_ = GetAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs");
InitSizeLists();
return true;
}
protected:
void LaunchInit(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) {
probs = GetDeviceAddress<T>(inputs, 0);
label_indices = GetDeviceAddress<int64_t>(inputs, 1);
label_values = GetDeviceAddress<int>(inputs, 2);
sequence_length = GetDeviceAddress<int>(inputs, 3);
costs = GetDeviceAddress<T>(outputs, 0);
grads = GetDeviceAddress<T>(outputs, 1);
softmax_probs = GetDeviceAddress<T>(workspace, 0);
cum_labels_length = GetDeviceAddress<int>(workspace, 1);
label_squence_length = GetDeviceAddress<int>(workspace, 2);
label_value_sp = GetDeviceAddress<int>(workspace, 3);
label_value_pcr = GetDeviceAddress<int>(workspace, 4);
prob_num = GetDeviceAddress<T>(workspace, 5);
precum_labels_length = GetDeviceAddress<int>(workspace, 6);
max_labels_length = GetDeviceAddress<int>(workspace, 7);
numclass = SizeToInt(probs_dims_[2]);
batch = SizeToInt(probs_dims_[1]);
max_time = SizeToInt(probs_dims_[0]);
max_sequence = 0;
max_labels_length_host = 0;
batch_label = 0;
label_value_with_blank = nullptr;
log_alpha_b = nullptr;
log_beta_b = nullptr;
}
void LaunchFirstHalf(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_ptr);
const T *probs = GetDeviceAddress<T>(inputs, 0);
const int64_t *label_indices = GetDeviceAddress<int64_t>(inputs, 1);
const int *label_values = GetDeviceAddress<int>(inputs, 2);
const int *sequence_length = GetDeviceAddress<int>(inputs, 3);
T *costs = GetDeviceAddress<T>(outputs, 0);
T *grads = GetDeviceAddress<T>(outputs, 1);
T *softmax_probs = GetDeviceAddress<T>(workspace, 0);
int *cum_labels_length = GetDeviceAddress<int>(workspace, 1);
int *label_squence_length = GetDeviceAddress<int>(workspace, 2);
int *label_value_sp = GetDeviceAddress<int>(workspace, 3);
int *label_value_pcr = GetDeviceAddress<int>(workspace, 4);
T *prob_num = GetDeviceAddress<T>(workspace, 5);
int *precum_labels_length = GetDeviceAddress<int>(workspace, 6);
int *max_labels_length = GetDeviceAddress<int>(workspace, 7);
int numclass = SizeToInt(probs_dims_[2]);
int batch = SizeToInt(probs_dims_[1]);
int max_time = SizeToInt(probs_dims_[0]);
int max_sequence = 0;
CalculateMaxSequence(sequence_length, max_labels_length, batch, stream);
CHECK_CUDA_RET_WITH_EXCEPT(
cudaMemcpyAsync(&max_sequence, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream),
@@ -73,11 +124,7 @@ class CtcLossGpuKernel : public GpuKernel {
}
InnerSoftMax(probs, softmax_probs, sequence_length, max_time, batch, numclass, stream);
MemsetForWS(label_value_pcr, cum_labels_length, label_squence_length, costs, grads, stream);
int max_labels_length_host = 0;
int batch_label = 0;
int *label_value_with_blank = nullptr;
T *log_alpha_b = nullptr;
T *log_beta_b = nullptr;
CalculatePreLength(label_squence_length, precum_labels_length, cum_labels_length, max_labels_length, label_indices,
batch, label_size_ / sizeof(int), stream);
CHECK_CUDA_RET_WITH_EXCEPT(
@@ -97,8 +144,14 @@ class CtcLossGpuKernel : public GpuKernel {
cudaMemcpyAsync(&max_labels_length_host, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream),
"cudaMemcpyAsync failed.");
CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
}
void LaunchSecondHalf(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_ptr);
int SOffSet = 2 * max_labels_length_host + 1;
int log_prob_size = batch * SOffSet * max_time;
if (!ignore_longer_outputs_than_inputs_ && max_labels_length_host > max_time) {
MS_LOG(EXCEPTION) << "output size is greater than input size.";
}
@@ -124,43 +177,8 @@ class CtcLossGpuKernel : public GpuKernel {
ignore_longer_outputs_than_inputs_, stream);
CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
FreeMem(label_value_with_blank, log_alpha_b, log_beta_b);
return true;
}
bool Init(const CNodePtr &kernel_node) override {
InitResource();
auto probs_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
if (probs_shape.size() != 3) {
MS_LOG(EXCEPTION) << "probs dims: " << probs_shape.size() << " not support.";
}
probs_dims_[0] = probs_shape[0];
probs_dims_[1] = probs_shape[1];
probs_dims_[2] = probs_shape[2];
auto indice_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
auto labels_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
if (labels_dims.size() != 1) {
MS_LOG(EXCEPTION) << "labels dims: " << labels_dims.size() << " not support.";
}
if (indice_dims.size() != 2) {
MS_LOG(EXCEPTION) << "labels indice dims: " << indice_dims.size() << " not support.";
}
label_size_ = sizeof(int);
for (auto i : labels_dims) {
label_size_ *= i;
}
label_indice_size_ = sizeof(int64_t);
for (auto i : indice_dims) {
label_indice_size_ *= i;
}
auto squence_length_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
squence_lengths_size_ = squence_length_dims[0] * sizeof(int);
preprocess_collapse_repeated_ = GetAttr<bool>(kernel_node, "preprocess_collapse_repeated");
ctc_merge_repeated_ = GetAttr<bool>(kernel_node, "ctc_merge_repeated");
ignore_longer_outputs_than_inputs_ = GetAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs");
InitSizeLists();
return true;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(probs_dims_[0] * probs_dims_[1] * probs_dims_[2] * sizeof(T));
input_size_list_.push_back(label_indice_size_);
@@ -226,6 +244,31 @@ class CtcLossGpuKernel : public GpuKernel {
bool ctc_merge_repeated_;
bool ignore_longer_outputs_than_inputs_;
T kLogZero_ = -std::numeric_limits<T>::infinity();
// Heap parameter
T *probs;
int64_t *label_indices;
int *label_values;
int *sequence_length;
T *costs;
T *grads;
T *softmax_probs;
int *cum_labels_length;
int *label_squence_length;
int *label_value_sp;
int *label_value_pcr;
T *prob_num;
int *precum_labels_length;
int *max_labels_length;
int numclass;
int batch;
int max_time;
int max_sequence;
int max_labels_length_host;
int batch_label;
int *label_value_with_blank;
T *log_alpha_b;
T *log_beta_b;
}; // class CtcLossGpuKernel
} // namespace kernel
} // namespace mindspore
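Splitting CtcLossGpuKernel::Launch into LaunchInit, LaunchFirstHalf, and LaunchSecondHalf only works because the locals shared across the halves (probs, max_labels_length_host, log_alpha_b, and friends) were promoted to class members, as the new block under "Heap parameter" shows. A minimal sketch of that promote-then-split refactor, with invented names; the trade-off is that scratch state now lives for the kernel's lifetime.

class Worker {
 public:
  bool Launch() {
    LaunchInit();       // was the top of one long function
    LaunchFirstHalf();  // each half reads the promoted members
    LaunchSecondHalf();
    return true;
  }

 private:
  void LaunchInit() { scale_ = 2; partial_ = 0; }
  void LaunchFirstHalf() { partial_ = scale_ * 21; }
  void LaunchSecondHalf() { result_ = partial_; }

  // Formerly locals in Launch; now members so the halves can share them.
  int scale_ = 0;
  int partial_ = 0;
  int result_ = 0;
};

int main() { Worker w; return w.Launch() ? 0 : 1; }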

@@ -82,17 +82,11 @@ class L2NormalizeGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
InitResource();
data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but l2normalize op needs 1 inputs.";
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(ERROR) << "Output number is " << output_num << ", but l2normalize op needs 1 output.";
if (!CheckIONumber(kernel_node)) {
return false;
}
int input_dim_length = SizeToInt(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0).size());
@@ -176,6 +170,19 @@
}
private:
bool CheckIONumber(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but l2normalize op needs 1 inputs.";
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(ERROR) << "Output number is " << output_num << ", but l2normalize op needs 1 output.";
return false;
}
return true;
}
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
"cudnnDestroyReduceTensorDescriptor failed.");

@@ -104,20 +104,31 @@ class L2NormalizeGradGpuKernel : public GpuKernel {
return true;
}
bool CheckInputShape(const std::vector<size_t> &output_shape) {
for (auto &shape : input_shape_list_) {
if (output_shape != shape) {
MS_LOG(EXCEPTION) << "Input shape and output shape should be same!";
}
}
is_null_input_ = CHECK_NULL_INPUT(input_shape_list_[0]);
if (is_null_input_) {
MS_LOG(WARNING) << "L2NormalizeGPUKernel input is null";
InitSizeLists();
return false;
}
if (input_shape_list_[0].size() > MAX_DIMS) {
MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 7";
}
return true;
}
bool Init(const CNodePtr &kernel_node) override {
InitResource();
data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != INPUT_SIZE) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but l2normalize op needs 3 inputs.";
if (!CheckIONumber(kernel_node)) {
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(ERROR) << "Output number is " << output_num << ", but l2normalize op needs 1 output.";
return false;
}
int input_dim_length = SizeToInt(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0).size());
int axis = static_cast<int>(GetAttr<int64_t>(kernel_node, "axis"));
axis_ = axis < 0 ? (axis + input_dim_length) : axis;
@@ -128,10 +139,8 @@ class L2NormalizeGradGpuKernel : public GpuKernel {
input_shape_list_.emplace_back(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i));
}
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
for (auto &shape : input_shape_list_) {
if (output_shape != shape) {
MS_LOG(EXCEPTION) << "Input shape and output shape should be same!";
}
if (!CheckInputShape(output_shape)) {
return true;
}
output_size_ = sizeof(T);
@@ -139,16 +148,6 @@ class L2NormalizeGradGpuKernel : public GpuKernel {
output_size_ *= dim;
}
is_null_input_ = CHECK_NULL_INPUT(input_shape_list_[0]);
if (is_null_input_) {
MS_LOG(WARNING) << "L2NormalizeGPUKernel input is null";
InitSizeLists();
return true;
}
if (input_shape_list_[0].size() > MAX_DIMS) {
MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 7";
}
std::vector<size_t> output_reduce_shape = output_shape;
output_reduce_shape[axis_] = 1;
@@ -173,6 +172,19 @@
}
protected:
bool CheckIONumber(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != INPUT_SIZE) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but l2normalize op needs 3 inputs.";
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(ERROR) << "Output number is " << output_num << ", but l2normalize op needs 1 output.";
return false;
}
return true;
}
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_),

@@ -53,14 +53,7 @@ class PadGpuFwdKernel : public GpuKernel {
}
bool Init(const CNodePtr &kernel_node) override {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but Pad needs 1 input.";
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(ERROR) << "Output number is " << output_num << ", but Pad needs 1 output.";
if (!CheckIONumber(kernel_node)) {
return false;
}
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
@@ -114,6 +107,20 @@
}
private:
bool CheckIONumber(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but Pad needs 1 input.";
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(ERROR) << "Output number is " << output_num << ", but Pad needs 1 output.";
return false;
}
return true;
}
size_t shape_size_;
size_t temp;
std::vector<std::vector<int>> paddings; // list of paddings (tuple of tuple in python)

@@ -76,11 +76,9 @@ class PoolingGradGpuKernel : public GpuKernel {
"cudnnPoolingBackward failed");
return true;
}
bool Init(const CNodePtr &kernel_node) override {
InitResource();
if (!CheckParam(kernel_node)) {
return false;
}
bool InitShape(const CNodePtr &kernel_node, int *dimA, int *strideAin, int *dimAy, int *strideAiny, int *dimAdy,
int *strideAdy, int *dimAout, int *strideAout) {
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
auto input_mask = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
auto dout_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
@@ -95,9 +93,25 @@
if (is_null_input_) {
MS_LOG(WARNING) << "PoolingGradGpuKernel input is null.";
InitSizeLists();
return true;
return false;
}
SetNCHW(input_shape, &n_, &c_, &old_height_, &old_width_, data_format);
SetDimA(input_shape, dimA, 4, data_format);
SetStrideA(input_shape, strideAin, 4, data_format);
SetDimA(input_mask, dimAy, 4, data_format);
SetStrideA(input_mask, strideAiny, 4, data_format);
SetDimA(dout_shape, dimAdy, 4, data_format);
SetStrideA(dout_shape, strideAdy, 4, data_format);
SetDimA(output_shape, dimAout, 4, data_format);
SetStrideA(output_shape, strideAout, 4, data_format);
return true;
}
bool Init(const CNodePtr &kernel_node) override {
InitResource();
if (!CheckParam(kernel_node)) {
return false;
}
const int nbDims = 4;
int dimA[4];
int strideAin[4];
@@ -107,14 +121,9 @@
int strideAdy[4];
int dimAout[4];
int strideAout[4];
SetDimA(input_shape, dimA, 4, data_format);
SetStrideA(input_shape, strideAin, 4, data_format);
SetDimA(input_mask, dimAy, 4, data_format);
SetStrideA(input_mask, strideAiny, 4, data_format);
SetDimA(dout_shape, dimAdy, 4, data_format);
SetStrideA(dout_shape, strideAdy, 4, data_format);
SetDimA(output_shape, dimAout, 4, data_format);
SetStrideA(output_shape, strideAout, 4, data_format);
if (!InitShape(kernel_node, dimA, strideAin, dimAy, strideAiny, dimAdy, strideAdy, dimAout, strideAout)) {
return true;
}
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(y_descriptor_, cudnn_data_type_, nbDims, dimAy, strideAiny),
"cudnnSetTensor4dDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(dy_descriptor_, cudnn_data_type_, nbDims, dimAdy, strideAdy),
@@ -129,6 +138,7 @@
InitSizeLists();
return true;
}
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_),
"cudnnDestroyPoolingDescriptor failed");
