change code to import APIs from mindspore.dataset rather than mindspore.dataset.engine

pull/10442/head
Xiao Tianci 4 years ago
parent 3ba3ffedd4
commit 31fed1a2f6
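
Every hunk below applies the same two-step change: import the public mindspore.dataset package instead of the internal mindspore.dataset.engine module, and rename the local dataset variable from ds to data_set so it no longer shadows the new module alias. A minimal sketch of the pattern follows; the create_dataset helper and its arguments are illustrative only, not taken from any single file in this commit:

# before: alias the internal engine module and reuse "ds" as a variable name
# import mindspore.dataset.engine as de
# ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)

# after: alias the public package as "ds" and keep dataset objects in "data_set"
import mindspore.dataset as ds

def create_dataset(dataset_path, batch_size=32):
    """Illustrative helper: build and batch an ImageFolder dataset."""
    data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
    data_set = data_set.batch(batch_size, drop_remainder=True)
    return data_set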

@ -14,7 +14,7 @@
# ============================================================================
"""generate dataloader and data processing entry"""
import mindspore.dataset.engine as de
import mindspore.dataset as ds
from src.utils import DistributedSampler
@ -32,7 +32,7 @@ def GetDataLoader(per_batch_size,
"""
centerface_gen = CenterfaceDataset(config=config, split=split)
sampler = DistributedSampler(centerface_gen, rank, group_size, shuffle=(split == 'train')) # user defined sampling strategy
de_dataset = de.GeneratorDataset(centerface_gen, ["image", "anns"], sampler=sampler, num_parallel_workers=16)
de_dataset = ds.GeneratorDataset(centerface_gen, ["image", "anns"], sampler=sampler, num_parallel_workers=16)
if group_size > 1:
num_parallel_workers = 24

@ -17,7 +17,7 @@ Data operations, will be used in train.py and eval.py
"""
import os
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
from src.dataset_utils import lucky, noise_blur, noise_speckle, noise_gamma, noise_gaussian, noise_salt_pepper, \
shift_color, enhance_brightness, enhance_sharpness, enhance_contrast, enhance_color, gaussian_blur, \
@ -26,6 +26,7 @@ from src.dataset_utils import lucky, noise_blur, noise_speckle, noise_gamma, noi
import cv2
import numpy as np
cv2.setNumThreads(0)
image_height = None
@ -179,23 +180,24 @@ def create_dataset_train(mindrecord_file_pos, config):
rank_id = int(os.getenv("RANK_ID", '0'))
decode = C.Decode()
ds = de.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=4,
num_shards=rank_size, shard_id=rank_id, shuffle=True)
ds = ds.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
data_set = ds.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=4,
num_shards=rank_size, shard_id=rank_id, shuffle=True)
data_set = data_set.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
augmentor = Augmentor(config.augment_severity, config.augment_prob)
operation = augmentor.process
ds = ds.map(operations=operation, input_columns=["image"],
num_parallel_workers=1, python_multiprocessing=True)
data_set = data_set.map(operations=operation, input_columns=["image"],
num_parallel_workers=1, python_multiprocessing=True)
##randomly augment half of samples to be negative samples
ds = ds.map(operations=[random_neg_with_rotate, unify_img_label, transform_image], input_columns=["image", "label"],
num_parallel_workers=8, python_multiprocessing=True)
##for training double the dataset to account for positive and negative
ds = ds.repeat(2)
data_set = data_set.map(operations=[random_neg_with_rotate, unify_img_label, transform_image],
input_columns=["image", "label"],
num_parallel_workers=8, python_multiprocessing=True)
##for training double the data_set to account for positive and negative
data_set = data_set.repeat(2)
# apply batch operations
ds = ds.batch(config.batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(config.batch_size, drop_remainder=True)
return data_set
def resize_image(img, label):
@ -230,17 +232,18 @@ def create_dataset_eval(mindrecord_file_pos, config):
rank_id = int(os.getenv("RANK_ID", '0'))
decode = C.Decode()
ds = de.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=1,
num_shards=rank_size, shard_id=rank_id, shuffle=False)
ds = ds.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
data_set = ds.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=1,
num_shards=rank_size, shard_id=rank_id, shuffle=False)
data_set = data_set.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
global image_height
global image_width
image_height = config.im_size_h
image_width = config.im_size_w
ds = ds.map(operations=resize_image, input_columns=["image", "label"], num_parallel_workers=config.work_nums,
python_multiprocessing=False)
data_set = data_set.map(operations=resize_image, input_columns=["image", "label"],
num_parallel_workers=config.work_nums,
python_multiprocessing=False)
# apply batch operations
ds = ds.batch(1, drop_remainder=True)
data_set = data_set.batch(1, drop_remainder=True)
return ds
return data_set

@ -16,7 +16,7 @@
import os
import numpy as np
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C
import mindspore.dataset.vision.c_transforms as vc
from PIL import Image, ImageFile
@ -105,7 +105,7 @@ def create_dataset(name, dataset_path, batch_size=1, num_shards=1, shard_id=0, i
dataset = IIIT5KDataset(dataset_path, "annotation.txt", config)
else:
raise ValueError(f"unsupported dataset name: {name}")
ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
data_set = ds.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
image_trans = [
vc.Resize((config.image_height, config.image_width)),
vc.Normalize([127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5]),
@ -114,8 +114,8 @@ def create_dataset(name, dataset_path, batch_size=1, num_shards=1, shard_id=0, i
label_trans = [
C.TypeCast(mstype.int32)
]
ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)
data_set = data_set.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
data_set = data_set.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set

@ -16,7 +16,7 @@
Data operations, will be used in train.py and eval.py
"""
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C
from src.config import config_gpu as cfg
@ -37,33 +37,33 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
dataset
"""
if group_size == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
num_shards=group_size, shard_id=rank)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
num_shards=group_size, shard_id=rank)
# define map operations
if do_train:
trans = [
C.RandomCropDecodeResize(299, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
C.RandomHorizontalFlip(prob=0.5),
C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
]
]
else:
trans = [
C.Decode(),
C.Resize(299),
C.CenterCrop(299)
]
]
trans += [
C.Rescale(1.0 / 255.0, 0.0),
C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
C.HWC2CHW()
]
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
# apply batch operations
ds = ds.batch(cfg.batch_size, drop_remainder=True)
data_set = data_set.batch(cfg.batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
return ds
data_set = data_set.repeat(repeat_num)
return data_set

@ -17,7 +17,7 @@ create train or eval dataset.
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size
@ -44,10 +44,10 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
device_num = get_group_size()
if device_num == 1:
ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
# define map operations
trans = []
@ -66,15 +66,15 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set
def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
@ -99,10 +99,10 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
device_num = get_group_size()
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@ -127,16 +127,16 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set
def _get_rank_info():

@ -21,7 +21,7 @@ import numpy as np
from mindspore import Tensor
from mindspore.train.model import Model
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
@ -43,22 +43,22 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1):
rank_size = int(os.getenv("RANK_SIZE", '1'))
rank_id = int(os.getenv("RANK_ID", '0'))
if rank_size == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
elif config.platform == "GPU":
if do_train:
if config.run_distribute:
from mindspore.communication.management import get_rank, get_group_size
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
elif config.platform == "CPU":
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
resize_height = config.image_height
resize_width = config.image_width
@ -83,19 +83,19 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1):
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
# apply shuffle operations
ds = ds.shuffle(buffer_size=buffer_size)
data_set = data_set.shuffle(buffer_size=buffer_size)
# apply batch operations
ds = ds.batch(config.batch_size, drop_remainder=True)
data_set = data_set.batch(config.batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set
def extract_features(net, dataset_path, config):
@ -121,5 +121,5 @@ def extract_features(net, dataset_path, config):
features = model.predict(Tensor(image))
np.save(features_path, features.asnumpy())
np.save(label_path, label)
print(f"Complete the batch {i+1}/{step_size}")
print(f"Complete the batch {i + 1}/{step_size}")
return step_size

@ -18,7 +18,7 @@ create train or eval dataset.
import os
from functools import partial
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.transforms.py_transforms as P2
@ -43,24 +43,24 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
rank_id = int(os.getenv("RANK_ID"))
columns_list = ['image', 'label']
if config.data_load_mode == "mindrecord":
load_func = partial(de.MindDataset, dataset_path, columns_list)
load_func = partial(ds.MindDataset, dataset_path, columns_list)
else:
load_func = partial(de.ImageFolderDataset, dataset_path)
load_func = partial(ds.ImageFolderDataset, dataset_path)
if do_train:
if rank_size == 1:
ds = load_func(num_parallel_workers=8, shuffle=True)
data_set = load_func(num_parallel_workers=8, shuffle=True)
else:
ds = load_func(num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
data_set = load_func(num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
else:
ds = load_func(num_parallel_workers=8, shuffle=False)
data_set = load_func(num_parallel_workers=8, shuffle=False)
elif device_target == "GPU":
if do_train:
from mindspore.communication.management import get_rank, get_group_size
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
raise ValueError("Unsupported device_target.")
@ -69,7 +69,7 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
if do_train:
buffer_size = 20480
# apply shuffle operations
ds = ds.shuffle(buffer_size=buffer_size)
data_set = data_set.shuffle(buffer_size=buffer_size)
# define map operations
decode_op = C.Decode()
@ -89,16 +89,16 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=16)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=16)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set
def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=1, batch_size=32):
@ -119,12 +119,12 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=
rank_id = int(os.getenv("RANK_ID"))
if do_train:
if rank_size == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)
else:
raise ValueError("Unsupported device target.")
@ -133,7 +133,7 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=
if do_train:
buffer_size = 20480
# apply shuffle operations
ds = ds.shuffle(buffer_size=buffer_size)
data_set = data_set.shuffle(buffer_size=buffer_size)
# define map operations
decode_op = P.Decode()
@ -152,12 +152,13 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=
compose = P2.Compose(trans)
ds = ds.map(operations=compose, input_columns="image", num_parallel_workers=8, python_multiprocessing=True)
data_set = data_set.map(operations=compose, input_columns="image", num_parallel_workers=8,
python_multiprocessing=True)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set

@ -16,7 +16,7 @@
create train or eval dataset.
"""
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
@ -38,12 +38,12 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
if do_train:
if run_distribute:
from mindspore.communication.management import get_rank, get_group_size
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
raise ValueError("Unsupported device_target.")
@ -70,16 +70,16 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
# apply shuffle operations
ds = ds.shuffle(buffer_size=buffer_size)
data_set = data_set.shuffle(buffer_size=buffer_size)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set

@ -16,7 +16,7 @@
Data operations, will be used in train.py and eval.py
"""
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C
@ -37,10 +37,10 @@ def create_dataset(dataset_path, config, do_train, repeat_num=1):
rank = config.rank
group_size = config.group_size
if group_size == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True,
num_shards=group_size, shard_id=rank)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True,
num_shards=group_size, shard_id=rank)
# define map operations
if do_train:
trans = [
@ -60,10 +60,10 @@ def create_dataset(dataset_path, config, do_train, repeat_num=1):
C.HWC2CHW()
]
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=config.work_nums)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=config.work_nums)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=config.work_nums)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=config.work_nums)
# apply batch operations
ds = ds.batch(config.batch_size, drop_remainder=True)
data_set = data_set.batch(config.batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
return ds
data_set = data_set.repeat(repeat_num)
return data_set

@ -25,21 +25,24 @@ import pyclipper
from PIL import Image
from src.config import config
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.py_transforms as py_transforms
__all__ = ['train_dataset_creator', 'test_dataset_creator']
def get_img(img_path):
img = cv2.imread(img_path)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
return img
def get_imgs_names(root_dir):
img_paths = [i for i in os.listdir(root_dir)
if os.path.splitext(i)[-1].lower() in ['.jpg', '.jpeg', '.png']]
return img_paths
def get_bboxes(img, gt_path):
h, w = img.shape[0:2]
with open(gt_path, 'r', encoding='utf-8-sig') as f:
@ -58,6 +61,7 @@ def get_bboxes(img, gt_path):
tags.append(tag)
return np.array(bboxes), tags
def random_scale(img, min_size):
h, w = img.shape[0:2]
if max(h, w) > 1280:
@ -74,12 +78,14 @@ def random_scale(img, min_size):
img = cv2.resize(img, dsize=None, fx=scale2, fy=scale2)
return img
def random_horizontal_flip(imgs):
if random.random() < 0.5:
for i, _ in enumerate(imgs):
imgs[i] = np.flip(imgs[i], axis=1).copy()
return imgs
def random_rotate(imgs):
max_angle = 10
angle = random.random() * 2 * max_angle - max_angle
@ -91,6 +97,7 @@ def random_rotate(imgs):
imgs[i] = img_rotation
return imgs
def random_crop(imgs, img_size):
h, w = imgs[0].shape[0:2]
th, tw = img_size
@ -118,21 +125,25 @@ def random_crop(imgs, img_size):
imgs[idx] = imgs[idx][i:i + th, j:j + tw]
return imgs
def scale(img, long_size=2240):
h, w = img.shape[0:2]
scale_long = long_size * 1.0 / max(h, w)
img = cv2.resize(img, dsize=None, fx=scale_long, fy=scale_long)
return img
def dist(a, b):
return np.sqrt(np.sum((a - b) ** 2))
def perimeter(bbox):
peri = 0.0
for i in range(bbox.shape[0]):
peri += dist(bbox[i], bbox[(i + 1) % bbox.shape[0]])
return peri
def shrink(bboxes, rate, max_shr=20):
rate = rate * rate
shrinked_bboxes = []
@ -158,6 +169,7 @@ def shrink(bboxes, rate, max_shr=20):
return np.array(shrinked_bboxes)
class TrainDataset:
def __init__(self):
self.is_transform = True
@ -260,6 +272,7 @@ class TrainDataset:
def __len__(self):
return len(self.all_img_paths)
def IC15_TEST_Generator():
ic15_test_data_dir = config.TEST_ROOT_DIR + 'ch4_test_images/'
img_size = config.INFER_LONG_SIZE
@ -298,6 +311,7 @@ def IC15_TEST_Generator():
yield img, img_resized, img_name
class DistributedSampler():
def __init__(self, dataset, rank, group_size, shuffle=True, seed=0):
self.dataset = dataset
@ -324,18 +338,20 @@ class DistributedSampler():
def __len__(self):
return self.num_samplers
def train_dataset_creator(rank, group_size, shuffle=True):
cv2.setNumThreads(0)
dataset = TrainDataset()
sampler = DistributedSampler(dataset, rank, group_size, shuffle)
ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8,
sampler=sampler)
ds = ds.repeat(1)
ds = ds.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER)
return ds
data_set = ds.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8,
sampler=sampler)
data_set = data_set.repeat(1)
data_set = data_set.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER)
return data_set
def test_dataset_creator():
ds = de.GeneratorDataset(IC15_TEST_Generator, ['img', 'img_resized', 'img_name'])
ds = ds.shuffle(config.TEST_BUFFER_SIZE)
ds = ds.batch(1, drop_remainder=config.TEST_DROP_REMAINDER)
return ds
data_set = ds.GeneratorDataset(IC15_TEST_Generator, ['img', 'img_resized', 'img_name'])
data_set = data_set.shuffle(config.TEST_BUFFER_SIZE)
data_set = data_set.batch(1, drop_remainder=config.TEST_DROP_REMAINDER)
return data_set

@ -29,7 +29,7 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.common import set_seed
import mindspore.nn as nn
import mindspore.common.initializer as weight_init
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
from src.resnet_gpu_benchmark import resnet50 as resnet
from src.CrossEntropySmooth import CrossEntropySmooth
@ -45,19 +45,22 @@ parser.add_argument('--dataset_path', type=str, default=None, help='Imagenet dat
parser.add_argument('--ckpt_path', type=str, default="./", help='The path to save ckpt if save_ckpt is True;\
Or the ckpt model file when eval is True')
parser.add_argument('--mode', type=str, default="GRAPH", choices=["GRAPH", "PYNATIVE"], help='Execute mode')
parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16",\
help='Compute data type fp32 or fp16: default fp16')
parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16", \
help='Compute data type fp32 or fp16: default fp16')
args_opt = parser.parse_args()
set_seed(1)
class MyTimeMonitor(Callback):
def __init__(self, batch_size, sink_size):
super(MyTimeMonitor, self).__init__()
self.batch_size = batch_size
self.size = sink_size
def step_begin(self, run_context):
self.step_time = time.time()
def step_end(self, run_context):
cb_params = run_context.original_args()
loss = cb_params.net_outputs
@ -75,17 +78,18 @@ class MyTimeMonitor(Callback):
raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format(
cb_params.cur_epoch_num, cur_step_in_epoch))
step_mseconds = (time.time() - self.step_time) * 1000
fps = self.batch_size / step_mseconds *1000 * self.size
fps = self.batch_size / step_mseconds * 1000 * self.size
print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num, cur_step_in_epoch, loss),
"Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True)
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU", dtype="fp16",
device_num=1):
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True,
num_shards=device_num, shard_id=get_rank())
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True,
num_shards=device_num, shard_id=get_rank())
image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
std = [0.229 * 255, 0.224 * 255, 0.225 * 255]
@ -113,14 +117,15 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="
]
if dtype == "fp32":
trans.append(C.HWC2CHW())
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
if repeat_num > 1:
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return data_set
return ds
def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch):
lr_each_step = []
@ -136,6 +141,7 @@ def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per
lr_each_step = np.array(lr_each_step).astype(np.float32)
return lr_each_step
def train():
# set args
dev = "GPU"
@ -221,6 +227,7 @@ def train():
else:
model.train(epoch_size, dataset, callbacks=cb)
def eval_():
# set args
dev = "GPU"
@ -251,6 +258,7 @@ def eval_():
res = model.eval(dataset)
print("result:", res, "ckpt=", ckpt_dir)
if __name__ == '__main__':
if not args_opt.eval:
train()

@ -17,7 +17,7 @@ create train or eval dataset.
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size
@ -47,10 +47,10 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
else:
device_num = 1
if device_num == 1:
ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
# define map operations
trans = []
@ -69,15 +69,15 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set
def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False):
@ -106,10 +106,10 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
device_num = 1
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@ -134,16 +134,16 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set
def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False):
@ -171,10 +171,10 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target=
device_num = 1
rank_id = 1
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
image_size = 224
mean = [0.475 * 255, 0.451 * 255, 0.392 * 255]
std = [0.275 * 255, 0.267 * 255, 0.278 * 255]
@ -198,15 +198,15 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target=
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set
def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False):
@ -234,10 +234,10 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target=
else:
device_num = 1
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True,
num_shards=device_num, shard_id=rank_id)
image_size = 224
mean = [123.68, 116.78, 103.94]
std = [1.0, 1.0, 1.0]
@ -260,16 +260,16 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target=
]
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=12)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=12)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set
def _get_rank_info():

@ -18,7 +18,7 @@ create train or eval dataset.
import os
from functools import partial
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.py_transforms as P2
@ -53,14 +53,14 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="
columns_list = ['image', 'label']
if config.data_load_mode == "mindrecord":
load_func = partial(de.MindDataset, dataset_path, columns_list)
load_func = partial(ds.MindDataset, dataset_path, columns_list)
else:
load_func = partial(de.ImageFolderDataset, dataset_path)
load_func = partial(ds.ImageFolderDataset, dataset_path)
if device_num == 1:
ds = load_func(num_parallel_workers=8, shuffle=True)
data_set = load_func(num_parallel_workers=8, shuffle=True)
else:
ds = load_func(num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = load_func(num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@ -85,16 +85,16 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set
def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
@ -121,12 +121,12 @@ def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, targe
if do_train:
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)
image_size = 224
@ -147,12 +147,13 @@ def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, targe
trans = [decode_op, resize_op, center_crop, to_tensor, normalize_op]
compose = P2.Compose(trans)
ds = ds.map(operations=compose, input_columns="image", num_parallel_workers=8, python_multiprocessing=True)
data_set = data_set.map(operations=compose, input_columns="image", num_parallel_workers=8,
python_multiprocessing=True)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set

@ -17,7 +17,7 @@ create train or eval dataset.
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size
@ -47,10 +47,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="
num_parallels = 4
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True,
num_shards=device_num, shard_id=rank_id)
image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@ -75,16 +75,16 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=num_parallels)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallels)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=num_parallels)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallels)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set
def _get_rank_info():

@ -15,7 +15,7 @@
"""Data operations, will be used in train.py and eval.py"""
from src.config import config
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C
@ -36,10 +36,10 @@ def create_dataset(dataset_path, do_train, device_num=1, rank=0):
"""
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank)
# define map operations
if do_train:
trans = [
@ -59,8 +59,8 @@ def create_dataset(dataset_path, do_train, device_num=1, rank=0):
]
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8)
ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="image", operations=trans, num_parallel_workers=8)
data_set = data_set.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8)
# apply batch operations
ds = ds.batch(config.batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(config.batch_size, drop_remainder=True)
return data_set

@ -19,7 +19,7 @@ import numpy as np
from src.config import config_gpu as cfg
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C
@ -46,10 +46,10 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
dataset
"""
if group_size == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
num_shards=group_size, shard_id=rank)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
num_shards=group_size, shard_id=rank)
# define map operations
if do_train:
trans = [
@ -71,9 +71,9 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
]
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
# apply batch operations
ds = ds.batch(cfg.batch_size, drop_remainder=True)
data_set = data_set.batch(cfg.batch_size, drop_remainder=True)
return ds
return data_set

@ -17,7 +17,7 @@ create train or eval dataset.
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size
@ -48,15 +48,15 @@ def create_dataset_cifar(dataset_path,
device_num = get_group_size()
if device_num == 1:
ds = de.Cifar10Dataset(dataset_path,
num_parallel_workers=8,
shuffle=True)
data_set = ds.Cifar10Dataset(dataset_path,
num_parallel_workers=8,
shuffle=True)
else:
ds = de.Cifar10Dataset(dataset_path,
num_parallel_workers=8,
shuffle=True,
num_shards=device_num,
shard_id=rank_id)
data_set = ds.Cifar10Dataset(dataset_path,
num_parallel_workers=8,
shuffle=True,
num_shards=device_num,
shard_id=rank_id)
# define map operations
if do_train:
@ -80,20 +80,20 @@ def create_dataset_cifar(dataset_path,
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op,
input_columns="label",
num_parallel_workers=8)
ds = ds.map(operations=trans,
input_columns="image",
num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op,
input_columns="label",
num_parallel_workers=8)
data_set = data_set.map(operations=trans,
input_columns="image",
num_parallel_workers=8)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set
def create_dataset_imagenet(dataset_path,
@ -122,15 +122,15 @@ def create_dataset_imagenet(dataset_path,
device_num = get_group_size()
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path,
num_parallel_workers=8,
shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path,
num_parallel_workers=8,
shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path,
num_parallel_workers=8,
shuffle=True,
num_shards=device_num,
shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path,
num_parallel_workers=8,
shuffle=True,
num_shards=device_num,
shard_id=rank_id)
image_size = 227
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@ -159,20 +159,20 @@ def create_dataset_imagenet(dataset_path,
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op,
input_columns="label",
num_parallel_workers=8)
ds = ds.map(operations=trans,
input_columns="image",
num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op,
input_columns="label",
num_parallel_workers=8)
data_set = data_set.map(operations=trans,
input_columns="image",
num_parallel_workers=8)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)
return ds
return data_set
def _get_rank_info():

@ -17,7 +17,7 @@ import os
import math as m
import numpy as np
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as c
import mindspore.dataset.vision.c_transforms as vc
from PIL import Image
@ -86,7 +86,7 @@ def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_
"""
dataset = _CaptchaDataset(dataset_path, cf.max_captcha_digits, device_target)
ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
data_set = ds.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
image_trans = [
vc.Rescale(1.0 / 255.0, 0.0),
vc.Normalize([0.9010, 0.9049, 0.9025], std=[0.1521, 0.1347, 0.1458]),
@ -96,12 +96,12 @@ def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_
label_trans = [
c.TypeCast(mstype.int32)
]
ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
data_set = data_set.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
if device_target == 'Ascend':
ds = ds.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8)
data_set = data_set.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8)
else:
ds = ds.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8)
ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)
data_set = data_set.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8)
data_set = data_set.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set

@ -16,10 +16,11 @@
Data operations, will be used in train.py and eval.py
"""
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C
def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0):
"""
create a train or eval dataset
@ -35,10 +36,10 @@ def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0):
dataset
"""
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank)
# define map operations
if do_train:
trans = [
@ -59,8 +60,8 @@ def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0):
]
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8)
ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="image", operations=trans, num_parallel_workers=8)
data_set = data_set.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set

@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine.datasets as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C
from mindspore import log as logger
from .config import cfg
@ -31,65 +31,67 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None,
for file_name in files:
if "tfrecord" in file_name:
data_files.append(os.path.join(data_dir, file_name))
ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=de.Shuffle.FILES if do_shuffle == "true" else False,
num_shards=device_num, shard_id=rank, shard_equal_rows=True)
ori_dataset_size = ds.get_dataset_size()
data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False,
num_shards=device_num, shard_id=rank, shard_equal_rows=True)
ori_dataset_size = data_set.get_dataset_size()
print('origin dataset size: ', ori_dataset_size)
type_cast_op = C.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
# apply batch operations
ds = ds.batch(cfg.batch_size, drop_remainder=True)
logger.info("data size: {}".format(ds.get_dataset_size()))
logger.info("repeat count: {}".format(ds.get_repeat_count()))
return ds
data_set = data_set.batch(cfg.batch_size, drop_remainder=True)
logger.info("data size: {}".format(data_set.get_dataset_size()))
logger.info("repeat count: {}".format(data_set.get_repeat_count()))
return data_set
def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
data_file_path=None, schema_file_path=None, do_shuffle=True):
"""create finetune or evaluation dataset"""
type_cast_op = C.TypeCast(mstype.int32)
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle)
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"],
shuffle=do_shuffle)
if assessment_method == "Spearman_correlation":
type_cast_op_float = C.TypeCast(mstype.float32)
ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids")
else:
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.repeat(repeat_count)
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.repeat(repeat_count)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set
def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
data_file_path=None, schema_file_path=None, do_shuffle=True):
"""create finetune or evaluation dataset"""
type_cast_op = C.TypeCast(mstype.int32)
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle)
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"],
shuffle=do_shuffle)
if assessment_method == "Spearman_correlation":
type_cast_op_float = C.TypeCast(mstype.float32)
ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids")
else:
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.repeat(repeat_count)
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.repeat(repeat_count)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set
def generator_squad(data_features):
@ -102,20 +104,20 @@ def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, sche
"""create finetune or evaluation dataset"""
type_cast_op = C.TypeCast(mstype.int32)
if is_training:
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "start_positions",
"end_positions", "unique_ids", "is_impossible"],
shuffle=do_shuffle)
ds = ds.map(operations=type_cast_op, input_columns="start_positions")
ds = ds.map(operations=type_cast_op, input_columns="end_positions")
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "start_positions",
"end_positions", "unique_ids", "is_impossible"],
shuffle=do_shuffle)
data_set = data_set.map(operations=type_cast_op, input_columns="start_positions")
data_set = data_set.map(operations=type_cast_op, input_columns="end_positions")
else:
ds = de.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle,
column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"])
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.map(operations=type_cast_op, input_columns="unique_ids")
ds = ds.repeat(repeat_count)
data_set = ds.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle,
column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"])
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="unique_ids")
data_set = data_set.repeat(repeat_count)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set

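For reference, a minimal sketch of the migrated finetune pipeline built directly from mindspore.dataset (the file path and batch size below are placeholders for illustration, not values taken from this commit):

import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C

# hypothetical TFRecord file, used only to show the new-style import and chaining
data_set = ds.TFRecordDataset(["finetune.tf_record"],
                              columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"],
                              shuffle=True)
type_cast_op = C.TypeCast(mstype.int32)
for column in ["label_ids", "segment_ids", "input_mask", "input_ids"]:
    data_set = data_set.map(operations=type_cast_op, input_columns=column)
data_set = data_set.batch(32, drop_remainder=True)
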
@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine.datasets as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C
from mindspore import log as logger
from .bert_net_config import bert_net_cfg
@ -32,96 +32,96 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None,
if "tfrecord" in file_name:
data_files.append(os.path.join(data_dir, file_name))
data_files = sorted(data_files)
ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=de.Shuffle.FILES if do_shuffle == "true" else False,
num_shards=device_num, shard_id=rank, shard_equal_rows=False)
ori_dataset_size = ds.get_dataset_size()
data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False,
num_shards=device_num, shard_id=rank, shard_equal_rows=False)
ori_dataset_size = data_set.get_dataset_size()
print('origin dataset size: ', ori_dataset_size)
type_cast_op = C.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
# apply batch operations
ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True)
logger.info("data size: {}".format(ds.get_dataset_size()))
logger.info("repeat count: {}".format(ds.get_repeat_count()))
return ds
data_set = data_set.batch(bert_net_cfg.batch_size, drop_remainder=True)
logger.info("data size: {}".format(data_set.get_dataset_size()))
logger.info("repeat count: {}".format(data_set.get_repeat_count()))
return data_set
def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
data_file_path=None, schema_file_path=None):
"""create finetune or evaluation dataset"""
type_cast_op = C.TypeCast(mstype.int32)
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
if assessment_method == "Spearman_correlation":
type_cast_op_float = C.TypeCast(mstype.float32)
ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids")
else:
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.repeat(repeat_count)
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.repeat(repeat_count)
# apply shuffle operation
buffer_size = 960
ds = ds.shuffle(buffer_size=buffer_size)
data_set = data_set.shuffle(buffer_size=buffer_size)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set
def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
data_file_path=None, schema_file_path=None):
"""create finetune or evaluation dataset"""
type_cast_op = C.TypeCast(mstype.int32)
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
if assessment_method == "Spearman_correlation":
type_cast_op_float = C.TypeCast(mstype.float32)
ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids")
else:
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.repeat(repeat_count)
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.repeat(repeat_count)
# apply shuffle operation
buffer_size = 960
ds = ds.shuffle(buffer_size=buffer_size)
data_set = data_set.shuffle(buffer_size=buffer_size)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set
def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, schema_file_path=None, is_training=True):
"""create finetune or evaluation dataset"""
type_cast_op = C.TypeCast(mstype.int32)
if is_training:
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids",
"start_positions", "end_positions",
"unique_ids", "is_impossible"])
ds = ds.map(operations=type_cast_op, input_columns="start_positions")
ds = ds.map(operations=type_cast_op, input_columns="end_positions")
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids",
"start_positions", "end_positions",
"unique_ids", "is_impossible"])
data_set = data_set.map(operations=type_cast_op, input_columns="start_positions")
data_set = data_set.map(operations=type_cast_op, input_columns="end_positions")
else:
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"])
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
ds = ds.repeat(repeat_count)
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"])
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.repeat(repeat_count)
# apply shuffle operation
buffer_size = 960
ds = ds.shuffle(buffer_size=buffer_size)
data_set = data_set.shuffle(buffer_size=buffer_size)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set

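As a hedged sketch of the sharded pretraining reader after the import change (the file names, shard count, and shard id are assumptions for illustration), the same pipeline can be written against mindspore.dataset:

import mindspore.dataset as ds

# placeholder file list; in the repo these are collected by scanning data_dir for "tfrecord" files
data_files = ["pretrain-part0.tfrecord", "pretrain-part1.tfrecord"]
data_set = ds.TFRecordDataset(data_files,
                              columns_list=["input_ids", "input_mask", "segment_ids",
                                            "next_sentence_labels", "masked_lm_positions",
                                            "masked_lm_ids", "masked_lm_weights"],
                              shuffle=ds.Shuffle.FILES,  # file-level shuffle, as in create_bert_dataset
                              num_shards=2, shard_id=0,  # assumed 2-device split
                              shard_equal_rows=False)
print("origin dataset size:", data_set.get_dataset_size())
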
@ -22,7 +22,7 @@ import mindspore.ops.operations as P
from mindspore.common.tensor import Tensor
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
from mindspore import context
from src.fasttext_model import FastText
@ -73,15 +73,15 @@ class FastTextInferCell(nn.Cell):
def load_infer_dataset(batch_size, datafile):
"""data loader for infer"""
ds = de.MindDataset(datafile, columns_list=['src_tokens', 'src_tokens_length', 'label_idx'])
data_set = ds.MindDataset(datafile, columns_list=['src_tokens', 'src_tokens_length', 'label_idx'])
type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="src_tokens")
ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length")
ds = ds.map(operations=type_cast_op, input_columns="label_idx")
ds = ds.batch(batch_size=batch_size, drop_remainder=True)
data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens")
data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens_length")
data_set = data_set.map(operations=type_cast_op, input_columns="label_idx")
data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)
return ds
return data_set
def run_fasttext_infer():
"""run infer with FastText"""

@ -25,8 +25,10 @@ import spacy
from sklearn.feature_extraction import FeatureHasher
from mindspore.mindrecord import FileWriter
class FastTextDataPreProcess():
"""FastText data preprocess"""
def __init__(self, train_path,
test_file,
max_length,
@ -194,7 +196,6 @@ class FastTextDataPreProcess():
if self.text_less in sent_describe and self.text_greater in sent_describe:
sent_describe = self.str_html.sub('', sent_describe)
doc = spacy_nlp(sent_describe)
bows_token = [token.text for token in doc]
@ -222,7 +223,7 @@ class FastTextDataPreProcess():
def _get_bucket_length(self, x, bts):
x_len = len(x)
for index in range(1, len(bts)):
if bts[index-1] < x_len <= bts[index]:
if bts[index - 1] < x_len <= bts[index]:
return bts[index]
return bts[0]
@ -310,7 +311,6 @@ if __name__ == '__main__':
print("Writing test data to MindRecord file.....")
for k in args.test_bucket:
write_to_mindrecord(test_data_example[k], './test_dataset_bs_' + str(k) + '.mindrecord', 1)
print("All done.....")

@ -14,9 +14,10 @@
# ============================================================================
"""FastText data loader"""
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
def load_dataset(dataset_path,
batch_size,
epoch_count=1,
@ -25,38 +26,40 @@ def load_dataset(dataset_path,
bucket=None,
shuffle=True):
"""dataset loader"""
def batch_per_bucket(bucket_length, input_file):
input_file = input_file +'/train_dataset_bs_' + str(bucket_length) + '.mindrecord'
input_file = input_file + '/train_dataset_bs_' + str(bucket_length) + '.mindrecord'
if not input_file:
raise FileNotFoundError("input file parameter must not be empty.")
ds = de.MindDataset(input_file,
columns_list=['src_tokens', 'src_tokens_length', 'label_idx'],
shuffle=shuffle,
num_shards=rank_size,
shard_id=rank_id,
num_parallel_workers=8)
ori_dataset_size = ds.get_dataset_size()
data_set = ds.MindDataset(input_file,
columns_list=['src_tokens', 'src_tokens_length', 'label_idx'],
shuffle=shuffle,
num_shards=rank_size,
shard_id=rank_id,
num_parallel_workers=8)
ori_dataset_size = data_set.get_dataset_size()
print(f"Dataset size: {ori_dataset_size}")
repeat_count = epoch_count
type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="src_tokens")
ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length")
ds = ds.map(operations=type_cast_op, input_columns="label_idx")
data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens")
data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens_length")
data_set = data_set.map(operations=type_cast_op, input_columns="label_idx")
data_set = data_set.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'],
output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag'])
data_set = data_set.batch(batch_size, drop_remainder=False)
data_set = data_set.repeat(repeat_count)
return data_set
ds = ds.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'],
output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag'])
ds = ds.batch(batch_size, drop_remainder=False)
ds = ds.repeat(repeat_count)
return ds
for i, _ in enumerate(bucket):
bucket_len = bucket[i]
ds_per = batch_per_bucket(bucket_len, dataset_path)
if i == 0:
ds = ds_per
data_set = ds_per
else:
ds = ds + ds_per
ds = ds.shuffle(ds.get_dataset_size())
ds.channel_name = 'fasttext'
data_set = data_set + ds_per
data_set = data_set.shuffle(data_set.get_dataset_size())
data_set.channel_name = 'fasttext'
return ds
return data_set

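The FastText loader above builds one MindDataset per bucket, concatenates them with `+`, and shuffles the combined dataset; a minimal sketch of that pattern (bucket sizes and .mindrecord paths are assumptions, and per-bucket batching is omitted):

import mindspore.dataset as ds

def load_bucket(mindrecord_path):
    # one pre-bucketed MindRecord file per sequence-length bucket (path is a placeholder)
    return ds.MindDataset(mindrecord_path,
                          columns_list=['src_tokens', 'src_tokens_length', 'label_idx'])

data_set = load_bucket('./train_dataset_bs_16.mindrecord') + load_bucket('./train_dataset_bs_32.mindrecord')
data_set = data_set.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'],
                           output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag'])
data_set = data_set.shuffle(data_set.get_dataset_size())
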
@ -15,7 +15,7 @@
"""Dataset loader to feed into model."""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
@ -55,7 +55,7 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False,
print(f" | Loading {datafile}.")
if not is_translate:
ds = de.MindDataset(
data_set = ds.MindDataset(
input_files, columns_list=[
"src", "src_padding",
"prev_opt",
@ -64,18 +64,18 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False,
num_parallel_workers=8
)
ori_dataset_size = ds.get_dataset_size()
ori_dataset_size = data_set.get_dataset_size()
print(f" | Dataset size: {ori_dataset_size}.")
if shuffle:
ds = ds.shuffle(buffer_size=ori_dataset_size // 20)
data_set = data_set.shuffle(buffer_size=ori_dataset_size // 20)
type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)
ds = ds.map(input_columns="prev_opt", operations=type_cast_op, num_parallel_workers=8)
ds = ds.map(input_columns="target", operations=type_cast_op, num_parallel_workers=8)
ds = ds.map(input_columns="tgt_padding", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="prev_opt", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="target", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="tgt_padding", operations=type_cast_op, num_parallel_workers=8)
ds = ds.rename(
data_set = data_set.rename(
input_columns=["src",
"src_padding",
"prev_opt",
@ -87,9 +87,9 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False,
"target_eos_ids",
"target_eos_mask"]
)
ds = ds.batch(batch_size, drop_remainder=drop_remainder)
data_set = data_set.batch(batch_size, drop_remainder=drop_remainder)
else:
ds = de.MindDataset(
data_set = ds.MindDataset(
input_files, columns_list=[
"src", "src_padding"
],
@ -97,23 +97,23 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False,
num_parallel_workers=8
)
ori_dataset_size = ds.get_dataset_size()
ori_dataset_size = data_set.get_dataset_size()
print(f" | Dataset size: {ori_dataset_size}.")
if shuffle:
ds = ds.shuffle(buffer_size=ori_dataset_size // 20)
data_set = data_set.shuffle(buffer_size=ori_dataset_size // 20)
type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)
ds = ds.rename(
data_set = data_set.rename(
input_columns=["src",
"src_padding"],
output_columns=["source_eos_ids",
"source_eos_mask"]
)
ds = ds.batch(batch_size, drop_remainder=drop_remainder)
data_set = data_set.batch(batch_size, drop_remainder=drop_remainder)
return ds
return data_set
def load_dataset(data_files: list, schema: str, batch_size: int, sink_mode: bool,
