!9491 GPU add trace for error/exception

From: @VectorSL
Reviewed-by: @cristoval, @jjfeing
Signed-off-by: @jjfeing
pull/9491/MERGE
mindspore-ci-bot authored 4 years ago, committed by Gitee
commit ab0f23a90e

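Every hunk in this diff makes the same two changes: Init() caches the constructing node (kernel_node_ = kernel_node;), and that node is threaded through as a new first argument of the CHECK_CUDA/CUDNN/CUBLAS/CUSOLVER_RET_WITH_* macros, so a failed runtime call can be traced back to the graph node that issued it. A minimal sketch of the two-argument EXCEPT flavor, assuming MindSpore's MS_LOG and a trace helper along the lines of trace::DumpSourceLines (illustrative only, not the verbatim macro):

// Sketch only: on failure, log the CUDA error and append the graph-node
// trace so the exception points at the offending operator.
#define CHECK_CUDA_RET_WITH_EXCEPT(node, expression, message)                  \
  {                                                                            \
    cudaError_t status = (expression);                                         \
    if (status != cudaSuccess) {                                               \
      MS_LOG(EXCEPTION) << "CUDA Error: " << message << " | " << status << " " \
                        << cudaGetErrorString(status)                          \
                        << trace::DumpSourceLines(node);                       \
    }                                                                          \
  }

The re-wrapped call sites below (kernel_node_, on its own line) are just the formatter accommodating the extra argument.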
@@ -56,11 +56,13 @@ class ArrayReduceGpuKernel : public GpuKernel {
if (all_match_) {
MS_LOG(DEBUG)
<< "The corresponding dimensions of the input and output tensors all match. No need to call cuDNN kernel.";
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(output_addr, input_addr, inputs[0]->size, cudaMemcpyDeviceToDevice,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(output_addr, input_addr, inputs[0]->size, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in ArrayReduceGpuKernel::Launch.");
} else {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, workspace_addr, workspace_size_, &alpha,
inputA_descriptor_, input_addr, &beta, outputC_descriptor_, output_addr),
"cudnnReduceTensor failed.");
@@ -68,6 +70,7 @@ class ArrayReduceGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
@@ -140,34 +143,35 @@ class ArrayReduceGpuKernel : public GpuKernel {
}
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
"cudnnDestroyReduceTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(inputA_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(outputC_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
}
protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_),
"cudnnCreateReduceTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&inputA_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&inputA_descriptor_),
"cudnnCreateTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&outputC_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&outputC_descriptor_),
"cudnnCreateTensorDescriptor failed.");
}
void InitSizeLists() override {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(inputA_descriptor_, &input_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(inputA_descriptor_, &input_size_),
"cudnnGetTensorSizeInBytes failed.");
input_size_list_.push_back(input_size_);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(outputC_descriptor_, &output_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(outputC_descriptor_, &output_size_),
"cudnnGetTensorSizeInBytes failed.");
output_size_list_.push_back(output_size_);
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetReductionWorkspaceSize(cudnn_handle_, reduce_tensor_descriptor_, inputA_descriptor_, outputC_descriptor_,
&workspace_size_),
"cudnnGetReductionWorkspaceSize failed.");
@@ -186,6 +190,7 @@ class ArrayReduceGpuKernel : public GpuKernel {
}
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetReduceTensorDescriptor(reduce_tensor_descriptor_, reduce_tensor_op_, CUDNN_DATA_FLOAT, nan_prop_,
reduce_indices_, CUDNN_32BIT_INDICES),
"cudnnSetReduceTensorDescriptor failed");
@@ -199,11 +204,12 @@ class ArrayReduceGpuKernel : public GpuKernel {
if (input_shape.size() <= split_dim) {
ShapeNdTo4d(input_shape, &inputA);
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_, SizeToInt(inputA[0]),
SizeToInt(inputA[1]), SizeToInt(inputA[2]), SizeToInt(inputA[3])),
"cudnnSetTensor4dDescriptor failed");
} else {
CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_);
CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_, kernel_node_);
for (auto dim : input_shape) {
inputA.emplace_back(SizeToInt(dim));
}
@@ -213,10 +219,10 @@ class ArrayReduceGpuKernel : public GpuKernel {
outputC_shape.resize(input_shape.size(), 1);
if (outputC_shape.size() <= split_dim) {
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, 1, 1, 1, 1),
kernel_node_, cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, 1, 1, 1, 1),
"cudnnSetTensor4dDescriptor failed");
} else {
CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_);
CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_, kernel_node_);
}
for (auto dim : inputA) {
@@ -239,11 +245,12 @@ class ArrayReduceGpuKernel : public GpuKernel {
if (outputC_shape.size() <= split_dim) {
ShapeNdTo4d(outputC_shape, &outputC);
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, SizeToInt(outputC[0]),
SizeToInt(outputC[1]), SizeToInt(outputC[2]), SizeToInt(outputC[3])),
"cudnnSetTensor4dDescriptor failed");
} else {
CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_);
CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_, kernel_node_);
for (auto dim : outputC_shape) {
outputC.emplace_back(SizeToInt(dim));
}

@@ -49,10 +49,12 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
for (size_t i = 0; i < inputs.size(); i++) {
inputs_host_[i] = GetDeviceAddress<T>(inputs, i);
}
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(inputs_device, inputs_host_.get(), sizeof(T *) * input_num_,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(inputs_device, inputs_host_.get(), sizeof(T *) * input_num_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"ConcatV2 opt cudaMemcpyAsync inputs failed");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(len_axis_device, len_axis_.get(), sizeof(int) * input_num_,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(len_axis_device, len_axis_.get(), sizeof(int) * input_num_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"ConcatV2 opt cudaMemcpyAsync length on axis failed");
ConcatKernel(output_size_, input_num_, all_size_before_axis_, all_size_axis_, len_axis_device, inputs_device,
@@ -60,6 +62,7 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
if (!CheckParam(kernel_node)) {
return false;
}

@@ -41,6 +41,7 @@ class DynamicShapeGpuKernel : public GpuKernel {
S *output_device_address = GetDeviceAddress<S>(outputs, 0);
size_t prev_node_output_shape_size = prev_node_output_shape_.size() * sizeof(S);
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_,
cudaMemcpyAsync(output_device_address, prev_node_output_shape_.data(), prev_node_output_shape_size,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync prev_node_output_shape failed");
@@ -49,6 +50,7 @@ class DynamicShapeGpuKernel : public GpuKernel {
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
size_t input_count = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_count != 1) {
MS_LOG(EXCEPTION) << input_count << " arguments were provided, but DynamicShapeGpuKernel expects 1.";

@@ -51,10 +51,12 @@ class GatherNdGpuFwdKernel : public GpuKernel {
if (!memcpy_flag_) {
const size_t strides_len = sizeof(S) * batch_strides_.size();
const size_t indices_len = sizeof(S) * batch_indices_.size();
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(dev_batch_strides_, &batch_strides_[0], strides_len,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(dev_batch_strides_, &batch_strides_[0], strides_len,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in GatherNdGpuFwdKernel::Launch.");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(dev_batch_indices_, &batch_indices_[0], indices_len,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(dev_batch_indices_, &batch_indices_[0], indices_len,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in GatherNdGpuFwdKernel::Launch.");
memcpy_flag_ = true;
@@ -65,6 +67,7 @@ class GatherNdGpuFwdKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
memcpy_flag_ = false;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);

@@ -43,10 +43,12 @@ class GatherV2GpuFwdKernel : public GpuKernel {
T *output_addr = GetDeviceAddress<T>(outputs, 0);
if (is_dynamic_shape_) {
int64_t *axis_device_address = GetDeviceAddress<int64_t>(inputs, 2); // only get this if in dynamic mode
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&axis_, axis_device_address, sizeof(int64_t), cudaMemcpyDeviceToHost,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(&axis_, axis_device_address, sizeof(int64_t), cudaMemcpyDeviceToHost,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync axis_ failed");
CHECK_CUDA_RET_WITH_EXCEPT(cudaDeviceSynchronize(), "cudaDeviceSyncFailed - GatherV2 - in dynamic mode");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaDeviceSynchronize(),
"cudaDeviceSyncFailed - GatherV2 - in dynamic mode");
Reshape();
}
auto input_dim1 = input_shapes_[IntToSize(axis_)];
@@ -55,6 +57,7 @@ class GatherV2GpuFwdKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num == 3) {

@@ -41,7 +41,8 @@ class PackGpuFwdKernel : public GpuKernel {
for (size_t i = 0; i < inputs.size(); i++) {
inputs_host_[i] = GetDeviceAddress<T>(inputs, i);
}
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(inputs_array, // NOLINT
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(inputs_array, // NOLINT
inputs_host_.get(), sizeof(T *) * input_num_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Pack opt cudaMemcpyAsync inputs failed");
@@ -50,6 +51,7 @@ class PackGpuFwdKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
if (!CheckParam(kernel_node)) {
return false;
}

@@ -41,13 +41,15 @@ class ScatterAddKernel : public GpuKernel {
T *updates = GetDeviceAddress<T>(inputs, 2);
T *output = GetDeviceAddress<T>(outputs, 0);
CalScatterAdd(inner_size_, indices_size_, indices, updates, input, reinterpret_cast<cudaStream_t>(stream_ptr));
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync output failed");
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but ScatterAdd needs 3 inputs.";

@@ -61,16 +61,19 @@ class ScatterNdGpuFwdKernel : public GpuKernel {
if (!memcpy_flag_) {
const size_t indices_len = sizeof(S) * vec_indices_stride_.size();
const size_t vec_work_len = sizeof(S) * vec_work_shape_.size();
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(indices_stride_, &vec_indices_stride_[0], indices_len,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(indices_stride_, &vec_indices_stride_[0], indices_len,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpy failed in ScatterNdGpuFwdKernel::Launch.");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(work_shape_, &vec_work_shape_[0], vec_work_len, cudaMemcpyHostToDevice,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(work_shape_, &vec_work_shape_[0], vec_work_len, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpy failed in ScatterNdGpuFwdKernel::Launch.");
memcpy_flag_ = true;
}
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_,
cudaMemsetAsync(output, static_cast<T>(0.0), output_size_, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemSet failed in ScatterNdGpuFwdKernel::Launch.");
@@ -83,6 +86,7 @@ class ScatterNdGpuFwdKernel : public GpuKernel {
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
memcpy_flag_ = false;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) {

@@ -41,13 +41,15 @@ class ScatterUpdateKernel : public GpuKernel {
T *updates = GetDeviceAddress<T>(inputs, 2);
T *output = GetDeviceAddress<T>(outputs, 0);
CalScatterUpdate(inner_size_, indices_size_, indices, updates, input, reinterpret_cast<cudaStream_t>(stream_ptr));
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync output failed");
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but ScatterUpdate needs 3 inputs.";

@@ -41,7 +41,8 @@ class SplitGpuFwdKernel : public GpuKernel {
for (size_t i = 0; i < outputs.size(); i++) {
outputs_host_[i] = GetDeviceAddress<T>(outputs, i);
}
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs_device, outputs_host_.get(), sizeof(T *) * output_num_,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(outputs_device, outputs_host_.get(), sizeof(T *) * output_num_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"Split opt cudaMemcpyAsync outputs failed");
SplitKernel(input_size_, axis_step_, all_size_before_axis_, all_size_axis_, input, outputs_device,
@@ -50,6 +51,7 @@ class SplitGpuFwdKernel : public GpuKernel {
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
axis_ = static_cast<int64_t>(GetAttr<int64_t>(kernel_node, "axis"));
if (axis_ < 0) {
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);

@@ -40,10 +40,12 @@ class TransposeGpuFwdKernel : public GpuKernel {
T *output = GetDeviceAddress<T>(outputs, 0);
size_t *input_shape = GetDeviceAddress<size_t>(workspace, 0);
size_t *input_axis = GetDeviceAddress<size_t>(workspace, 1);
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync input_shape failed");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(input_axis, &input_axis_[0], workspace_size_, cudaMemcpyHostToDevice,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(input_axis, &input_axis_[0], workspace_size_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync input_axis failed");
size_t size = input_size_ / sizeof(T);
@@ -52,6 +54,7 @@ class TransposeGpuFwdKernel : public GpuKernel {
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but transpose needs 1 input.";

@@ -60,7 +60,7 @@ class UniqueGpuKernel : public GpuKernel {
}
void PostExecute() override {
CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr_)),
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr_)),
"cudaStreamSynchronized failed");
std::vector<TypeId> type_ids;
std::vector<std::vector<size_t>> shapes;

@@ -41,7 +41,8 @@ class UnpackGpuFwdKernel : public GpuKernel {
for (size_t i = 0; i < outputs.size(); i++) {
outputs_host_[i] = GetDeviceAddress<T>(outputs, i);
}
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs_array, // NOLINT
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(outputs_array, // NOLINT
outputs_host_.get(), sizeof(T *) * output_num_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Unpack opt cudaMemcpyAsync outputs failed");
@@ -50,6 +51,7 @@ class UnpackGpuFwdKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
if (!CheckParam(kernel_node)) {
return false;
}

@@ -44,7 +44,8 @@ class UnsortedSegmentMaxGpuKernel : public GpuKernel {
int *indices_addr = GetDeviceAddress<int>(inputs, 1);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(output_addr, std::numeric_limits<T>::min(), outputs[0]->size,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemsetAsync(output_addr, std::numeric_limits<T>::min(), outputs[0]->size,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemSet Failed");
CalUnsortedSegmentMax(input_addr, indices_addr, num_segments_, outer_size_, inner_size_, output_addr,
@@ -53,6 +54,7 @@ class UnsortedSegmentMaxGpuKernel : public GpuKernel {
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
is_null_input_ = CHECK_NULL_INPUT(input_shapes);
if (is_null_input_) {

@@ -44,7 +44,7 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
T *output_addr = GetDeviceAddress<T>(outputs, 0);
CHECK_CUDA_RET_WITH_EXCEPT(
cudaMemsetAsync(output_addr, 0, outputs[0]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
kernel_node_, cudaMemsetAsync(output_addr, 0, outputs[0]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemSet Failed");
UnsortedSegmentSum(input_dim0_, input_dim1_, output_dim0_, output_dim1_, input_addr, indices_addr, output_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
@@ -52,6 +52,7 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
is_null_input_ = CHECK_NULL_INPUT(input_shapes);
if (is_null_input_) {

@@ -34,10 +34,12 @@ class RecvGpuKernel : public GpuKernel {
bool Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
void *) override {
CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamWaitEvent(wait_stream_, wait_event_, 0), "Waiting cuda event failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamWaitEvent(wait_stream_, wait_event_, 0),
"Waiting cuda event failed.");
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
wait_stream_ = reinterpret_cast<cudaStream_t>(GetAttr<uintptr_t>(kernel_node, "wait_event_stream"));
wait_event_ = reinterpret_cast<cudaEvent_t>(GetAttr<uintptr_t>(kernel_node, "wait_event"));
InitSizeLists();

@@ -34,10 +34,12 @@ class SendGpuKernel : public GpuKernel {
bool Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
void *) override {
CHECK_CUDA_RET_WITH_EXCEPT(cudaEventRecord(record_event_, record_stream_), "Recording cuda event failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaEventRecord(record_event_, record_stream_),
"Recording cuda event failed.");
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
record_stream_ = reinterpret_cast<cudaStream_t>(GetAttr<uintptr_t>(kernel_node, "record_event_stream"));
record_event_ = reinterpret_cast<cudaEvent_t>(GetAttr<uintptr_t>(kernel_node, "record_event"));
InitSizeLists();

@@ -44,6 +44,7 @@ const std::vector<size_t> &DatasetIteratorKernel::GetOutputSizeList() const { re
const std::vector<size_t> &DatasetIteratorKernel::GetWorkspaceSizeList() const { return workspace_size_list_; }
bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) {
kernel_node_ = kernel_node;
queue_name_ = GetAttr<std::string>(kernel_node, "shared_name");
std::vector<std::vector<int>> shapes;
std::vector<std::vector<int64_t>> shapes_me = GetAttr<const std::vector<std::vector<int64_t>>>(kernel_node, "shapes");
@@ -143,13 +144,14 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
for (size_t i = 0; i < output_size_list_.size(); i++) {
void *output_addr = GetDeviceAddress<void>(outputs, i);
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(output_addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(output_addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream)),
"Cuda Memcpy Failed");
addr = reinterpret_cast<unsigned char *>(addr) + output_size_list_[i];
}
CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)),
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)),
"cudaStreamSynchronize failed");
(void)GpuBufferMgr::GetInstance().Pop(handle_);
return true;

@@ -73,6 +73,7 @@ class GpuKernel : public KernelMod {
protected:
virtual void InitResource() {}
virtual void InitSizeLists() = 0;
CNodePtr kernel_node_;
template <typename T>
inline T *GetDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {
@@ -201,7 +202,7 @@ class GpuKernel : public KernelMod {
// set the tensor descriptor for cudnn/cublas
void CudnnSetTensorNdDescriptor(const std::vector<size_t> &shape, cudnnTensorDescriptor_t descriptor,
cudnnDataType_t data_type) {
cudnnDataType_t data_type, const CNodePtr &node) {
if (shape.size() < 3) {
MS_EXCEPTION(ValueError) << "cudnnSetTensorNdDescriptor don't support" << shape.size() << "D.";
}
@@ -224,7 +225,7 @@ class GpuKernel : public KernelMod {
stride[i] = stride[i + 1] * SizeToInt(shape[i + 1]);
}
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(descriptor, data_type, nbDims, dim, stride),
CHECK_CUDNN_RET_WITH_EXCEPT(node, cudnnSetTensorNdDescriptor(descriptor, data_type, nbDims, dim, stride),
"cudnnSetTensorNdDescriptor failed");
delete[] dim;
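
The gpu_kernel.h hunks above are the enabling piece: the GpuKernel base class now owns a CNodePtr kernel_node_ that each derived Init() assigns, and shared helpers such as CudnnSetTensorNdDescriptor take the node as an explicit parameter. The hunks also show two macro flavors: *_RET_WITH_EXCEPT throws, while *_RET_WITH_ERROR only logs, which is why the noexcept DestroyResource() overrides use the ERROR flavor. A matching sketch of the ERROR flavor, under the same assumptions as the EXCEPT sketch above (illustrative, not the verbatim macro):

// Sketch only: logs and continues rather than throwing, so it stays safe
// on noexcept paths such as DestroyResource().
#define CHECK_CUDNN_RET_WITH_ERROR(node, expression, message)              \
  {                                                                        \
    cudnnStatus_t status = (expression);                                   \
    if (status != CUDNN_STATUS_SUCCESS) {                                  \
      MS_LOG(ERROR) << "cuDNN Error: " << message << " | " << status       \
                    << " " << cudnnGetErrorString(status)                  \
                    << trace::DumpSourceLines(node);                       \
    }                                                                      \
  }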

@@ -69,19 +69,22 @@ class AddNGpuFwdKernel : public GpuKernel {
ElewiseArith(outputs[0]->size / sizeof(T), BROADCAST_TYPE_ADD, input_addr, work_addr, work_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnAddTensor(cudnn_handle_, &alpha, input_descriptor_, input_addr,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnAddTensor(cudnn_handle_, &alpha, input_descriptor_, input_addr,
&(i > 0 ? alpha : beta), input_descriptor_, work_addr),
"cudnnAddTensor failed");
}
}
if (work_addr != output_addr) {
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(output_addr, work_addr, outputs[0]->size, cudaMemcpyDeviceToDevice,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(output_addr, work_addr, outputs[0]->size, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Addn cudaMemcpyAsync outputs failed");
}
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
@@ -111,11 +114,13 @@ class AddNGpuFwdKernel : public GpuKernel {
}
auto input_format = AnfAlgo::GetInputFormat(kernel_node, 0);
if (input_format == kOpFormat_NHWC) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_,
SizeToInt(input_shape.size()), dimA),
"cudnnSetTensorNdDescriptor failed");
} else {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
SizeToInt(input_shape.size()), dimA),
"cudnnSetTensorNdDescriptor failed");
}
@@ -124,17 +129,19 @@ class AddNGpuFwdKernel : public GpuKernel {
}
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(input_descriptor_),
"cudnnDestroyTensorDescriptor failed");
}
protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&input_descriptor_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&input_descriptor_),
"cudnnCreateTensorDescriptor failed");
}
void InitSizeLists() override {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(input_descriptor_, &input_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(input_descriptor_, &input_size_),
"cudnnGetTensorSizeInBytes failed");
}
for (size_t i = 0; i < num_input_; i++) {

@@ -57,7 +57,8 @@ class BiasAddGpuKernel : public GpuKernel {
try {
const float alpha = 1;
const float beta = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnOpTensor(cudnn_handle_, op_desc_, &alpha, x_desc_, x_addr, &alpha, b_desc_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnOpTensor(cudnn_handle_, op_desc_, &alpha, x_desc_, x_addr, &alpha, b_desc_,
b_addr, &beta, x_desc_, output_addr),
"cudnnOpTensor failed");
} catch (const std::exception &e) {
@@ -66,6 +67,7 @@ class BiasAddGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
auto x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
@@ -99,12 +101,15 @@ class BiasAddGpuKernel : public GpuKernel {
auto input_device_format = AnfAlgo::GetInputFormat(kernel_node, 0);
auto cudnn_cal_format = (input_device_format == kOpFormat_NHWC) ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensorNdDescriptorEx(x_desc_, cudnn_cal_format, cudnn_data_type_, SizeToInt(cudnn_dims), x_dims.get()),
"cudnnSetTensorNdDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensorNdDescriptorEx(b_desc_, cudnn_cal_format, cudnn_data_type_, SizeToInt(cudnn_dims), b_dims.get()),
"cudnnSetTensorNdDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetOpTensorDescriptor(op_desc_, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN),
"cudnnSetOpTensorDescriptor failed");
@@ -113,22 +118,30 @@ class BiasAddGpuKernel : public GpuKernel {
}
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyOpTensorDescriptor(op_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(b_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyOpTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyOpTensorDescriptor(op_desc_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(b_desc_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_),
"cudnnDestroyOpTensorDescriptor failed");
}
protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&b_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateOpTensorDescriptor(&op_desc_), "cudnnCreateOpTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&b_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateOpTensorDescriptor(&op_desc_),
"cudnnCreateOpTensorDescriptor failed");
}
void InitSizeLists() override {
size_t x_size, b_size;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, &x_size), "cudnnGetTensorSizeInBytes failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(b_desc_, &b_size), "cudnnGetTensorSizeInBytes failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &x_size),
"cudnnGetTensorSizeInBytes failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(b_desc_, &b_size),
"cudnnGetTensorSizeInBytes failed.");
input_size_list_.push_back(x_size);
input_size_list_.push_back(b_size);
output_size_list_.push_back(x_size);

@@ -45,9 +45,11 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
T *dx1 = GetDeviceAddress<T>(outputs, 0);
T *dx2 = GetDeviceAddress<T>(outputs, 1);
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(dx1, 0, outputs[0]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemsetAsync(dx1, 0, outputs[0]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemSet Failed");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(dx2, 0, outputs[1]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemsetAsync(dx2, 0, outputs[1]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemSet Failed");
if (need_broadcast_) {
BroadcastGrad(x1_shape_[0], x1_shape_[1], x1_shape_[2], x1_shape_[3], x2_shape_[0], x2_shape_[1], x2_shape_[2],
@@ -61,6 +63,7 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
GetOpType(kernel_node);
auto shape1 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto shape2 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);

@@ -48,19 +48,22 @@ class CastAllGpuFwdKernel : public GpuKernel {
S **outputs_dev = GetDeviceAddress<S *>(workspace, 1);
size_t *size_dev = GetDeviceAddress<size_t>(workspace, 2);
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_,
cudaMemcpyAsync(inputs_dev, in_addr.get(), sizeof(T *) * num_input_, cudaMemcpyHostToDevice, stream),
"cudaMemCPY failed")
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_,
cudaMemcpyAsync(outputs_dev, out_addr.get(), sizeof(S *) * num_input_, cudaMemcpyHostToDevice, stream),
"cudaMemCPY failed")
CHECK_CUDA_RET_WITH_EXCEPT(
cudaMemcpyAsync(size_dev, size_.get(), sizeof(size_t) * num_input_, cudaMemcpyHostToDevice, stream),
kernel_node_, cudaMemcpyAsync(size_dev, size_.get(), sizeof(size_t) * num_input_, cudaMemcpyHostToDevice, stream),
"cudaMemCPY failed")
CastAllKernel(inputs_dev, outputs_dev, max_, num_input_, size_dev, stream);
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
num_input_ = GetAttr<size_t>(kernel_node, "n");
size_ = std::make_unique<size_t[]>(num_input_);
for (size_t i = 0; i < num_input_; i++) {

@@ -54,18 +54,21 @@ class CholeskyGpuKernel : public GpuKernel {
h_array[i] = input1_addr + i * lda_ * m_;
h_identity[i] = output_addr + i * ldb_ * m_;
CHECK_CUDA_RET_WITH_ERROR(
kernel_node_,
cudaMemcpyAsync(output_addr + i * ldb_ * m_, h_identity_data.data(), sizeof(T) * ldb_ * m_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cuda memcopy Fail");
}
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cuda memcopy Fail");
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cuda memcopy Fail");
CHECK_CUSOLVER_RET_WITH_EXCEPT(
cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
kernel_node_, cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
"cusolver cholesky batched Fail");
TriangleMatrixCopy(input1_addr, output_addr, uplo, outputs[0]->size / sizeof(T), ldb_, m_,
reinterpret_cast<cudaStream_t>(stream_ptr));
@@ -79,14 +82,16 @@ class CholeskyGpuKernel : public GpuKernel {
Identity(batch_ * split_dim * split_dim, split_dim, output_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
MatrixSplit(batch_ * split_dim * split_dim, split_dim, width, input1_addr, d_batch_input_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cuda memcopy Fail");
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cuda memcopy Fail");
CHECK_CUSOLVER_RET_WITH_EXCEPT(
cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
kernel_node_, cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
"cusolver cholesky batched Fail");
TriangleMatrixCopy(d_batch_input_addr, output_addr, uplo, outputs[0]->size / sizeof(T), ldb_, m_,
reinterpret_cast<cudaStream_t>(stream_ptr));
@@ -95,6 +100,7 @@ class CholeskyGpuKernel : public GpuKernel {
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
blas_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);

@@ -51,6 +51,7 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
blas_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
@@ -113,21 +114,25 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
h_array[i] = input1_addr + i * lda_ * m_;
h_identity[i] = output_addr + i * ldb_ * m_;
CHECK_CUDA_RET_WITH_ERROR(
kernel_node_,
cudaMemcpyAsync(output_addr + i * ldb_ * m_, h_identity_data.data(), sizeof(T) * ldb_ * m_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cuda memcopy Fail");
}
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cuda memcopy Fail");
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cuda memcopy Fail");
CHECK_CUSOLVER_RET_WITH_EXCEPT(
cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
kernel_node_, cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
"cusolver cholesky batched Fail");
float alpha = 1;
CHECK_CUBLAS_RET_WITH_EXCEPT(
kernel_node_,
cublasStrsmBatched(blas_handle_, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, m_, m_, &alpha,
d_array_addr, lda_, d_identity_addr, ldb_, batch_),
"cublas trsm batched Fail");
@@ -147,17 +152,20 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
Identity(batch_ * split_dim * split_dim, split_dim, output_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
MatrixSplit(batch_ * split_dim * split_dim, split_dim, width, input1_addr, d_batch_input_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cuda memcopy Fail");
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cuda memcopy Fail");
CHECK_CUSOLVER_RET_WITH_EXCEPT(
cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
kernel_node_, cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
"cusolver cholesky batched Fail");
float alpha = 1;
CHECK_CUBLAS_RET_WITH_EXCEPT(
kernel_node_,
cublasStrsmBatched(blas_handle_, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, m_, m_, &alpha,
d_array_addr, lda_, d_identity_addr, ldb_, batch_),
"cublas trsm batched Fail");

Some files were not shown because too many files have changed in this diff.
