From ff0cdd0bd3e54a6d64e3bf95ec9dfc3d255fc9e7 Mon Sep 17 00:00:00 2001 From: VectorSL Date: Sat, 21 Nov 2020 18:12:25 +0800 Subject: [PATCH] gpu benchmark add fp32 --- mindspore/ops/_op_impl/akg/gpu/equal.py | 1 + .../ops/_op_impl/akg/gpu/greater_equal.py | 1 + mindspore/ops/_op_impl/akg/gpu/lessequal.py | 1 + mindspore/ops/_op_impl/akg/gpu/notequal.py | 1 + model_zoo/official/cv/resnet/README.md | 2 +- .../cv/resnet/gpu_resnet_benchmark.py | 22 +++++++----- .../scripts/run_gpu_resnet_benchmark.sh | 17 +++++++--- .../cv/resnet/src/resnet_gpu_benchmark.py | 34 ++++++++++++++----- 8 files changed, 56 insertions(+), 23 deletions(-) diff --git a/mindspore/ops/_op_impl/akg/gpu/equal.py b/mindspore/ops/_op_impl/akg/gpu/equal.py index 26816d09a8..1f4c03a999 100644 --- a/mindspore/ops/_op_impl/akg/gpu/equal.py +++ b/mindspore/ops/_op_impl/akg/gpu/equal.py @@ -23,6 +23,7 @@ equal_op_info = AkgGpuRegOp("Equal") \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ .dtype_format(DataType.F64_Default, DataType.F64_Default, DataType.BOOL_Default) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.BOOL_Default) \ .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.BOOL_Default) \ .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.BOOL_Default) \ diff --git a/mindspore/ops/_op_impl/akg/gpu/greater_equal.py b/mindspore/ops/_op_impl/akg/gpu/greater_equal.py index 1072dfd0d5..7c1443b057 100644 --- a/mindspore/ops/_op_impl/akg/gpu/greater_equal.py +++ b/mindspore/ops/_op_impl/akg/gpu/greater_equal.py @@ -23,6 +23,7 @@ greater_equal_op_info = AkgGpuRegOp("GreaterEqual") \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ 
.dtype_format(DataType.F64_Default, DataType.F64_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.BOOL_Default) \ .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.BOOL_Default) \ .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.BOOL_Default) \ diff --git a/mindspore/ops/_op_impl/akg/gpu/lessequal.py b/mindspore/ops/_op_impl/akg/gpu/lessequal.py index df5065f441..e176f99b50 100644 --- a/mindspore/ops/_op_impl/akg/gpu/lessequal.py +++ b/mindspore/ops/_op_impl/akg/gpu/lessequal.py @@ -23,6 +23,7 @@ lessequal_op_info = AkgGpuRegOp("LessEqual") \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ .dtype_format(DataType.F64_Default, DataType.F64_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.BOOL_Default) \ .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.BOOL_Default) \ .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.BOOL_Default) \ diff --git a/mindspore/ops/_op_impl/akg/gpu/notequal.py b/mindspore/ops/_op_impl/akg/gpu/notequal.py index c70b641926..59ed641396 100644 --- a/mindspore/ops/_op_impl/akg/gpu/notequal.py +++ b/mindspore/ops/_op_impl/akg/gpu/notequal.py @@ -22,6 +22,7 @@ notequal_op_info = AkgGpuRegOp("NotEqual") \ .output(0, "output") \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ + .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.BOOL_Default) \ .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ .dtype_format(DataType.I16_Default, 
DataType.I16_Default, DataType.BOOL_Default) \ .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.BOOL_Default) \ diff --git a/model_zoo/official/cv/resnet/README.md b/model_zoo/official/cv/resnet/README.md index 4da5058eaa..7b0f3047eb 100644 --- a/model_zoo/official/cv/resnet/README.md +++ b/model_zoo/official/cv/resnet/README.md @@ -277,7 +277,7 @@ sh run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATA sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] # gpu benchmark example -sh run_gpu_resnet_benchmark.sh [IMAGENET_DATASET_PATH] [BATCH_SIZE](optional) [DEVICE_NUM](optional) +sh run_gpu_resnet_benchmark.sh [IMAGENET_DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](optional) [DEVICE_NUM](optional) ``` #### Running parameter server mode training diff --git a/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py b/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py index 7090a3f14b..418ce5e73c 100644 --- a/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py +++ b/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py @@ -39,6 +39,8 @@ parser.add_argument('--epoch_size', type=str, default="2", help='Epoch_size: def parser.add_argument('--print_per_steps', type=str, default="20", help='Print loss and time per steps: default 20') parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='Run distribute') parser.add_argument('--dataset_path', type=str, default=None, help='Imagenet dataset path') +parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16",\ + help='Compute data type fp32 or fp16: default fp16') args_opt = parser.parse_args() set_seed(1) @@ -60,7 +62,7 @@ def pad(image): output = np.concatenate((image, zeros), axis=2) return output -def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU"): +def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU", 
dtype="fp16"): ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True) image_size = 224 @@ -81,9 +83,11 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" C.CenterCrop(image_size), C.Normalize(mean=mean, std=std), ] - + if dtype == "fp32": + trans.append(C.HWC2CHW()) ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=4) - ds = ds.map(operations=pad, input_columns="image", num_parallel_workers=4) + if dtype == "fp16": + ds = ds.map(operations=pad, input_columns="image", num_parallel_workers=4) # apply batch operations ds = ds.batch(batch_size, drop_remainder=True) # apply dataset repeat operation @@ -112,6 +116,7 @@ if __name__ == '__main__': epoch_size = int(args_opt.epoch_size) total_batch = int(args_opt.batch_size) print_per_steps = int(args_opt.print_per_steps) + compute_type = str(args_opt.dtype).lower() # init context context.set_context(mode=context.GRAPH_MODE, device_target=dev, save_graphs=False) @@ -122,14 +127,14 @@ if __name__ == '__main__': # create dataset dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1, - batch_size=total_batch, target=dev) + batch_size=total_batch, target=dev, dtype=compute_type) step_size = dataset.get_dataset_size() if (print_per_steps > step_size or print_per_steps < 1): print("Arg: print_per_steps should lessequal to dataset_size ", step_size) print("Change to default: 20") print_per_steps = 20 # define net - net = resnet(class_num=1001) + net = resnet(class_num=1001, dtype=compute_type) # init weight for _, cell in net.cells_and_names(): @@ -163,10 +168,11 @@ if __name__ == '__main__': loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 1e-4, 1024) loss_scale = FixedLossScaleManager(1024, drop_overflow_update=False) + model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}) # 
Mixed precision - model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, - amp_level="O2", keep_batchnorm_fp32=False) - + if compute_type == "fp16": + model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, + amp_level="O2", keep_batchnorm_fp32=False) # define callbacks time_cb = MyTimeMonitor(total_batch, print_per_steps) loss_cb = LossMonitor() diff --git a/model_zoo/official/cv/resnet/scripts/run_gpu_resnet_benchmark.sh b/model_zoo/official/cv/resnet/scripts/run_gpu_resnet_benchmark.sh index f235eb6e5f..c138900c98 100644 --- a/model_zoo/official/cv/resnet/scripts/run_gpu_resnet_benchmark.sh +++ b/model_zoo/official/cv/resnet/scripts/run_gpu_resnet_benchmark.sh @@ -14,10 +14,11 @@ # limitations under the License. # ============================================================================ -if [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] +if [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] then - echo "Usage: sh run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DEVICE_NUM](optional)" - echo "Example: sh run_gpu_resnet_benchmark.sh /path/imagenet/train 256 8" + echo "Usage: sh run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](optional)\ + [DEVICE_NUM](optional)" + echo "Example: sh run_gpu_resnet_benchmark.sh /path/imagenet/train 256 FP16 8" exit 1 fi @@ -44,6 +45,12 @@ fi if [ $# == 3 ] then - mpirun --allow-run-as-root -n $3 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \ - --dataset_path=$DATAPATH --batch_size=$2 + python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True --dtype=$3 \ + --dataset_path=$DATAPATH --batch_size=$2 fi + +if [ $# == 4 ] +then + mpirun --allow-run-as-root -n $4 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \ + --dataset_path=$DATAPATH --batch_size=$2 --dtype=$3 +fi \ No newline at end of file diff --git 
a/model_zoo/official/cv/resnet/src/resnet_gpu_benchmark.py b/model_zoo/official/cv/resnet/src/resnet_gpu_benchmark.py index 4fb343ddcd..f323a2682b 100644 --- a/model_zoo/official/cv/resnet/src/resnet_gpu_benchmark.py +++ b/model_zoo/official/cv/resnet/src/resnet_gpu_benchmark.py @@ -20,6 +20,13 @@ from mindspore.ops import operations as P from mindspore.common.tensor import Tensor from scipy.stats import truncnorm +format_ = "NHWC" +# transpose shape to NCHW, default init is NHWC. +def _trans_shape(shape, shape_format): + if shape_format == "NCHW": + return (shape[0], shape[3], shape[1], shape[2]) + return shape + def _conv_variance_scaling_initializer(in_channel, out_channel, kernel_size): fan_in = in_channel * kernel_size * kernel_size scale = 1.0 @@ -37,30 +44,33 @@ def _weight_variable(shape, factor=0.01): def _conv3x3(in_channel, out_channel, stride=1): weight_shape = (out_channel, 3, 3, in_channel) + weight_shape = _trans_shape(weight_shape, format_) weight = _weight_variable(weight_shape) return nn.Conv2d(in_channel, out_channel, kernel_size=3, stride=stride, - padding=1, pad_mode='pad', weight_init=weight, data_format="NHWC") + padding=1, pad_mode='pad', weight_init=weight, data_format=format_) def _conv1x1(in_channel, out_channel, stride=1): weight_shape = (out_channel, 1, 1, in_channel) + weight_shape = _trans_shape(weight_shape, format_) weight = _weight_variable(weight_shape) return nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride, - padding=0, pad_mode='pad', weight_init=weight, data_format="NHWC") + padding=0, pad_mode='pad', weight_init=weight, data_format=format_) def _conv7x7(in_channel, out_channel, stride=1): weight_shape = (out_channel, 7, 7, in_channel) + weight_shape = _trans_shape(weight_shape, format_) weight = _weight_variable(weight_shape) return nn.Conv2d(in_channel, out_channel, kernel_size=7, stride=stride, - padding=3, pad_mode='pad', weight_init=weight, data_format="NHWC") + padding=3, pad_mode='pad', weight_init=weight, 
data_format=format_) def _bn(channel): return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9, gamma_init=1, beta_init=0, - moving_mean_init=0, moving_var_init=1, data_format="NHWC") + moving_mean_init=0, moving_var_init=1, data_format=format_) def _bn_last(channel): return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9, gamma_init=0, beta_init=0, - moving_mean_init=0, moving_var_init=1, data_format="NHWC") + moving_mean_init=0, moving_var_init=1, data_format=format_) def _fc(in_channel, out_channel): weight_shape = (out_channel, in_channel) @@ -165,10 +175,13 @@ class ResNet(nn.Cell): if not len(layer_nums) == len(in_channels) == len(out_channels) == 4: raise ValueError("the length of layer_num, in_channels, out_channels list must be 4!") - self.conv1 = _conv7x7(4, 64, stride=2) + input_data_channel = 4 + if format_ == "NCHW": + input_data_channel = 3 + self.conv1 = _conv7x7(input_data_channel, 64, stride=2) self.bn1 = _bn(64) self.relu = P.ReLU() - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same", data_format="NHWC") + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same", data_format=format_) self.layer1 = self._make_layer(block, layer_nums[0], in_channel=in_channels[0], @@ -190,7 +203,7 @@ class ResNet(nn.Cell): out_channel=out_channels[3], stride=strides[3]) - self.avg_pool = P.AvgPool(7, 1, data_format="NHWC") + self.avg_pool = P.AvgPool(7, 1, data_format=format_) self.flatten = nn.Flatten() self.end_point = _fc(out_channels[3], num_classes) @@ -237,7 +250,7 @@ class ResNet(nn.Cell): return out -def resnet50(class_num=1001): +def resnet50(class_num=1001, dtype="fp16"): """ Get ResNet50 neural network. @@ -250,6 +263,9 @@ def resnet50(class_num=1001): Examples: >>> net = resnet50(1001) """ + global format_ + if dtype == "fp32": + format_ = "NCHW" return ResNet(ResidualBlock, [3, 4, 6, 3], [64, 256, 512, 1024],