From d9bd7fe04e33250d0af9978a93e3fe99f94d5f6b Mon Sep 17 00:00:00 2001 From: VectorSL Date: Tue, 17 Nov 2020 17:47:14 +0800 Subject: [PATCH] gpu fix padding and dataset_heleper --- .../gpu/nn/conv2d_gpu_kernel.h | 25 ++++++------------- .../gpu/nn/conv2d_grad_filter_gpu_kernel.h | 19 ++++++-------- .../gpu/nn/conv2d_grad_input_gpu_kernel.h | 19 ++++++-------- mindspore/ccsrc/pipeline/jit/pipeline.cc | 7 +++--- mindspore/ccsrc/pipeline/jit/resource.h | 7 +++++- mindspore/train/dataset_helper.py | 2 ++ .../scripts/run_standalone_train_gpu.sh | 7 +++--- model_zoo/official/cv/resnet/README.md | 10 ++++---- .../cv/resnet/gpu_resnet_benchmark.py | 2 +- 9 files changed, 42 insertions(+), 56 deletions(-) diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h index c5b88da02d..173a84aecb 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h @@ -76,7 +76,7 @@ class Conv2dGpuFwdKernel : public GpuKernel { const float alpha = 1; const float beta = 0; - if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_) { + if (use_pad_) { T *padded_addr = GetDeviceAddress(workspace, 1); if (data_format_ == kOpFormat_NHWC) { CalPadNHWC(padded_size_ / sizeof(T), input_addr, n_, old_height_, old_width_, c_, old_height_ + pad_height_, @@ -133,23 +133,18 @@ class Conv2dGpuFwdKernel : public GpuKernel { [](const int64_t &value) { return static_cast(value); }); pad_height_ = pad_list[0]; pad_width_ = pad_list[2]; - auto symmetry_pad = (pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]); + use_pad_ = !((pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3])); pad_mode_ = GetAttr(kernel_node, "pad_mode"); SetStrideAndDilation(kernel_node); cudnnTensorDescriptor_t input_descriptor_real = nullptr; int padA[2]; int strideA[2] = {stride_[2], stride_[3]}; int dilaA[2] = {dilation_[2], dilation_[3]}; - if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase || !symmetry_pad) { + if (use_pad_) { pad_height_ = pad_list[0] + pad_list[1]; pad_width_ = pad_list[2] + pad_list[3]; pad_top_ = pad_list[0]; pad_left_ = pad_list[2]; - - // if use_pad_ == true, using zero padding in advance, else using the default cudnn pad. - if (pad_height_ % 2 == 0 && pad_width_ % 2 == 0) { - use_pad_ = false; - } int dimA[4]; int strideApadded[4]; if (data_format_ == kOpFormat_NCHW || data_format_ == kOpFormat_DEFAULT) { @@ -165,18 +160,12 @@ class Conv2dGpuFwdKernel : public GpuKernel { } CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(padded_desc_, cudnn_data_type_, 4, dimA, strideApadded), "cudnnSetTensor4dDescriptor failed"); - - if (use_pad_) { - padA[0] = 0; - padA[1] = 0; - } else { - padA[0] = pad_top_; - padA[1] = pad_left_; - } + padA[0] = 0; + padA[1] = 0; CHECK_CUDNN_RET_WITH_EXCEPT( cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT), "cudnnSetConvolutionNdDescriptor failed"); - input_descriptor_real = use_pad_ ? padded_desc_ : input_desc_; + input_descriptor_real = padded_desc_; } else { if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) { pad_height_ = 0; @@ -232,7 +221,7 @@ class Conv2dGpuFwdKernel : public GpuKernel { input_size_list_.push_back(input_size_); input_size_list_.push_back(filter_size_); output_size_list_.push_back(output_size_); - if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_ && !is_null_input_) { + if (use_pad_ && !is_null_input_) { CHECK_CUDNN_RET_WITH_EXCEPT( cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle_, padded_desc_, filter_desc_, conv_desc_, output_desc_, conv_algorithm_, &workspace_size_), diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h index 49f1124411..f9b916dd2f 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h @@ -78,7 +78,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { const float alpha = 1; const float beta = 0; - if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_) { + if (use_pad_) { T *padded = GetDeviceAddress(workspace, 1); if (data_format_ == kOpFormat_NHWC) { CalPadNHWC(padded_size_ / sizeof(T), x, n_, old_height_, old_width_, c_, old_height_ + pad_height_, @@ -136,14 +136,14 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { [](const int64_t &value) { return static_cast(value); }); pad_height_ = pad_list[0]; pad_width_ = pad_list[2]; - auto symmetry_pad = (pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]); + use_pad_ = !((pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3])); pad_mode_ = GetAttr(kernel_node, "pad_mode"); SetStrideAndDilation(kernel_node); cudnnTensorDescriptor_t x_desc_real = nullptr; int padA[2]; int strideA[2] = {stride_[0], stride_[1]}; int dilaA[2] = {dilation_[2], dilation_[3]}; - if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase || !symmetry_pad) { + if (use_pad_) { pad_height_ = pad_list[0] + pad_list[1]; pad_width_ = pad_list[2] + pad_list[3]; pad_top_ = pad_list[0]; @@ -167,17 +167,12 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { CHECK_CUDNN_RET_WITH_EXCEPT( cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded), "cudnnSetTensor4dDescriptor failed"); - if (use_pad_) { - padA[0] = 0; - padA[1] = 0; - } else { - padA[0] = pad_top_; - padA[1] = pad_left_; - } + padA[0] = 0; + padA[1] = 0; CHECK_CUDNN_RET_WITH_EXCEPT( cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT), "cudnnSetConvolutionNdDescriptor failed"); - x_desc_real = use_pad_ ? padded_descriptor_ : x_desc_; + x_desc_real = padded_descriptor_; } else { if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) { pad_height_ = 0; @@ -231,7 +226,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { input_size_list_.push_back(input_size_); output_size_list_.push_back(output_size_); - if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_ && !is_null_input_) { + if (use_pad_ && !is_null_input_) { CHECK_CUDNN_RET_WITH_EXCEPT( cudnnGetTensorSizeInBytes(padded_descriptor_, reinterpret_cast(&padded_size_)), "cudnnGetTensorSizeInBytes failed"); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h index 9608204c4e..1504312b92 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h @@ -77,7 +77,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { } const float alpha = 1; - if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_) { + if (use_pad_) { T *padded = GetDeviceAddress(workspace, 1); CHECK_CUDNN_RET_WITH_EXCEPT( @@ -139,14 +139,14 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { [](const int64_t &value) { return static_cast(value); }); pad_height_ = pad_list[0]; pad_width_ = pad_list[2]; - auto symmetry_pad = (pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]); + use_pad_ = !((pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3])); pad_mode_ = GetAttr(kernel_node, "pad_mode"); SetStrideAndDilation(kernel_node); cudnnTensorDescriptor_t dx_desc_real = nullptr; int padA[2]; int strideA[2] = {stride_[0], stride_[1]}; int dilaA[2] = {dilation_[2], dilation_[3]}; - if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase || !symmetry_pad) { + if (use_pad_) { pad_height_ = pad_list[0] + pad_list[1]; pad_width_ = pad_list[2] + pad_list[3]; pad_top_ = pad_list[0]; @@ -170,17 +170,12 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { CHECK_CUDNN_RET_WITH_EXCEPT( cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded), "cudnnSetTensor4dDescriptor failed"); - if (use_pad_) { - padA[0] = 0; - padA[1] = 0; - } else { - padA[0] = pad_top_; - padA[1] = pad_left_; - } + padA[0] = 0; + padA[1] = 0; CHECK_CUDNN_RET_WITH_EXCEPT( cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT), "cudnnSetConvolutionNdDescriptor failed"); - dx_desc_real = use_pad_ ? padded_descriptor_ : dx_desc_; + dx_desc_real = padded_descriptor_; } else { if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) { pad_height_ = 0; @@ -233,7 +228,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { input_size_list_.push_back(w_size_); output_size_list_.push_back(output_size_); - if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_ && !is_null_input_) { + if (use_pad_ && !is_null_input_) { CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(padded_descriptor_, &padded_size_), "cudnnGetTensorSizeInBytes failed"); diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.cc b/mindspore/ccsrc/pipeline/jit/pipeline.cc index cbb2521fab..1793d7da78 100644 --- a/mindspore/ccsrc/pipeline/jit/pipeline.cc +++ b/mindspore/ccsrc/pipeline/jit/pipeline.cc @@ -662,8 +662,9 @@ void Pipeline::Run() { auto manager = func_graph->manager(); size_t graph_nums = manager->func_graphs().size(); if (graph_nums == 1) { - resource_->set_gpu_loopsink_flag(true); - MS_LOG(INFO) << "Change gpu_loopsink_flag_ to true."; + int64_t sinksize = ConfigManager::GetInstance().iter_num(); + resource_->set_gpu_loopsink(true, sinksize); + MS_LOG(INFO) << "Change gpu_loopsink_flag_ to true,set loopsink size to " << sinksize; } } } @@ -834,7 +835,7 @@ py::object ExecutorPy::Run(const py::tuple &args, const py::object &phase) { } // Set loopsink size for each phase. bool is_loopsink = info_[phase_s]->resource->gpu_loopsink_flag(); - int64_t sinksize = ConfigManager::GetInstance().iter_num(); + int64_t sinksize = info_[phase_s]->resource->gpu_loopsink_size(); ConfigManager::GetInstance().set_gpu_loopsink_size(is_loopsink ? sinksize : 1); // If target is not gpu or is loopsink, keep vmloop 1. bool g = (MsContext::GetInstance()->get_param(MS_CTX_DEVICE_TARGET) == kGPUDevice); diff --git a/mindspore/ccsrc/pipeline/jit/resource.h b/mindspore/ccsrc/pipeline/jit/resource.h index c5c879d46c..2389ecd1bc 100644 --- a/mindspore/ccsrc/pipeline/jit/resource.h +++ b/mindspore/ccsrc/pipeline/jit/resource.h @@ -74,8 +74,12 @@ class Resource : public ResourceBase { const abstract::AbstractBasePtrList &args_spec() const { return args_spec_; } void set_args_spec(const abstract::AbstractBasePtrList &args_spec) { args_spec_ = args_spec; } - void set_gpu_loopsink_flag(const bool &flag) { gpu_loopsink_flag_ = flag; } + void set_gpu_loopsink(const bool &flag, const int64_t size) { + gpu_loopsink_flag_ = flag; + gpu_loopsink_size_ = size; + } bool gpu_loopsink_flag() { return gpu_loopsink_flag_; } + int64_t gpu_loopsink_size() { return gpu_loopsink_size_; } // Reclaim resource and clear the cache. // ExecutorPy::Compile() can be called multiple times, so cache @@ -89,6 +93,7 @@ class Resource : public ResourceBase { py::object input_; bool is_cleaned_; bool gpu_loopsink_flag_{false}; + int64_t gpu_loopsink_size_{1}; }; using ResourcePtr = std::shared_ptr; diff --git a/mindspore/train/dataset_helper.py b/mindspore/train/dataset_helper.py index 8e0e00b82a..47483020d2 100644 --- a/mindspore/train/dataset_helper.py +++ b/mindspore/train/dataset_helper.py @@ -129,6 +129,8 @@ class DatasetHelper: Validator.check_is_int(sink_size) if sink_size < -1 or sink_size == 0: raise ValueError("The sink_size must be -1 or positive, but got sink_size {}.".format(sink_size)) + if sink_size == -1: + sink_size = dataset.get_dataset_size() if dataset_sink_mode: if context.get_context("enable_ge"): diff --git a/model_zoo/official/cv/alexnet/scripts/run_standalone_train_gpu.sh b/model_zoo/official/cv/alexnet/scripts/run_standalone_train_gpu.sh index 4f9d581328..e50d18c1e9 100644 --- a/model_zoo/official/cv/alexnet/scripts/run_standalone_train_gpu.sh +++ b/model_zoo/official/cv/alexnet/scripts/run_standalone_train_gpu.sh @@ -14,15 +14,14 @@ # limitations under the License. # ============================================================================ # an simple tutorial as follows, more parameters can be setting -if [ $# != 3 ] +if [ $# != 2 ] then - echo "Usage: sh run_standalone_train_gpu.sh [cifar10|imagenet] [DATA_PATH] [DEVICE_ID]" + echo "Usage: sh run_standalone_train_gpu.sh [cifar10|imagenet] [DATA_PATH]" exit 1 fi export DATASET_NAME=$1 export DATA_PATH=$2 -export DEVICE_ID=$3 python train.py --dataset_name=$DATASET_NAME --data_path=$DATA_PATH \ - --device_id=$DEVICE_ID --device_target="GPU" > log 2>&1 & + --device_target="GPU" > log 2>&1 & diff --git a/model_zoo/official/cv/resnet/README.md b/model_zoo/official/cv/resnet/README.md index eee3619de3..4da5058eaa 100644 --- a/model_zoo/official/cv/resnet/README.md +++ b/model_zoo/official/cv/resnet/README.md @@ -345,11 +345,11 @@ epoch: 5 step: 5004, loss is 3.3501816 ``` # ========START RESNET50 GPU BENCHMARK======== -step time: 12416.098 ms, fps: 412 img/sec. epoch: 1 step: 20, loss is 6.940182 -step time: 3472.037 ms, fps: 1474 img/sec. epoch: 2 step: 20, loss is 7.078993 -step time: 3469.523 ms, fps: 1475 img/sec. epoch: 3 step: 20, loss is 7.559594 -step time: 3460.311 ms, fps: 1479 img/sec. epoch: 4 step: 20, loss is 6.920937 -step time: 3460.543 ms, fps: 1479 img/sec. epoch: 5 step: 20, loss is 6.814013 +Epoch time: 12416.098 ms, fps: 412 img/sec. epoch: 1 step: 20, loss is 6.940182 +Epoch time: 3472.037 ms, fps: 1474 img/sec. epoch: 2 step: 20, loss is 7.078993 +Epoch time: 3469.523 ms, fps: 1475 img/sec. epoch: 3 step: 20, loss is 7.559594 +Epoch time: 3460.311 ms, fps: 1479 img/sec. epoch: 4 step: 20, loss is 6.920937 +Epoch time: 3460.543 ms, fps: 1479 img/sec. epoch: 5 step: 20, loss is 6.814013 ... ``` ## [Evaluation Process](#contents) diff --git a/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py b/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py index 2c6127cb36..7090a3f14b 100644 --- a/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py +++ b/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py @@ -53,7 +53,7 @@ class MyTimeMonitor(Callback): def step_end(self, run_context): step_mseconds = (time.time() - self.step_time) * 1000 fps = self.batch_size / step_mseconds *1000 * self.size - print("step time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True, end=" ") + print("Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True, end=" ") def pad(image): zeros = np.zeros([224, 224, 1], dtype=np.uint8)