minddata iterator output ms_tensor

pull/5801/head
xiefangqi 4 years ago
parent a778868a5a
commit 9b3c33e157

@@ -394,7 +394,7 @@ class Dataset:
             logger.error("func must be a function.")
             raise TypeError("func must be a function.")
-        for row_data in self:
+        for row_data in self.create_tuple_iterator(output_numpy=True):
             if dataset is None:
                 dataset = func(row_data)
             else:
@@ -1133,7 +1133,7 @@ class Dataset:
         return SaveOp(self).save(file_names, file_type)

-    def create_tuple_iterator(self, columns=None, num_epochs=-1):
+    def create_tuple_iterator(self, columns=None, num_epochs=-1, output_numpy=False):
         """
         Create an Iterator over the dataset. The data retrieved will be a list of ndarray of data.
@@ -1143,8 +1143,11 @@ class Dataset:
         Args:
             columns (list[str], optional): List of columns to be used to specify the order of columns
                 (default=None, means all columns).
-            num_epochs (int, optional): max epochs that iterator can be iteratered,
-                if num_epochs = -1, iterator can be iteratered infinit epochs (default=-1)
+            num_epochs (int, optional): maximum number of epochs that the iterator can be iterated,
+                if num_epochs = -1, the iterator can be iterated for an infinite number of epochs (default=-1).
+            output_numpy (bool, optional): Whether or not to output NumPy data type,
+                if output_numpy=False, the iterator will output MSTensor (default=False).

         Returns:
             Iterator, list of ndarray.
@@ -1161,9 +1164,9 @@ class Dataset:
         """
         if self._noop_mode():
             return DummyIterator(self, 'tuple')
-        return TupleIterator(self, columns, num_epochs)
+        return TupleIterator(self, columns, num_epochs, output_numpy)

-    def create_dict_iterator(self, num_epochs=-1):
+    def create_dict_iterator(self, num_epochs=-1, output_numpy=False):
         """
         Create an Iterator over the dataset.
@@ -1171,8 +1174,10 @@ class Dataset:
         of the columns in the dictionary may not be the same as the original order.

         Args:
-            num_epochs (int, optional): max epochs that iterator can be iteratered,
-                if num_epochs = -1, iterator can be iteratered infinit epochs (default=-1)
+            num_epochs (int, optional): maximum number of epochs that the iterator can be iterated,
+                if num_epochs = -1, the iterator can be iterated for an infinite number of epochs (default=-1).
+            output_numpy (bool, optional): Whether or not to output NumPy data type,
+                if output_numpy=False, the iterator will output MSTensor (default=False).

         Returns:
             Iterator, dictionary of column_name-ndarray pair.
@@ -1190,7 +1195,7 @@ class Dataset:
         """
         if self._noop_mode():
             return DummyIterator(self, 'dict')
-        return DictIterator(self, num_epochs)
+        return DictIterator(self, num_epochs, output_numpy)

     def __iter__(self):
         """Create an Iterator over the dataset."""
@@ -1617,7 +1622,7 @@ class BucketBatchByLengthDataset(DatasetOp):
         """
         if self.dataset_size is None:
             num_rows = 0
-            for _ in self.create_dict_iterator(num_epochs=1):
+            for _ in self.create_dict_iterator(num_epochs=1, output_numpy=True):
                 num_rows += 1
             self.dataset_size = num_rows
         return self.dataset_size
@@ -2163,7 +2168,7 @@ class FilterDataset(DatasetOp):
         """
         if self.dataset_size is None:
             num_rows = 0
-            for _ in self.create_dict_iterator(num_epochs=1):
+            for _ in self.create_dict_iterator(num_epochs=1, output_numpy=True):
                 num_rows += 1
             self.dataset_size = num_rows
         return self.dataset_size
@@ -2400,7 +2405,7 @@ class ConcatDataset(DatasetOp):
         """
         if self.dataset_size is None:
             num_rows = 0
-            for _ in self.create_dict_iterator(num_epochs=1):
+            for _ in self.create_dict_iterator(num_epochs=1, output_numpy=True):
                 num_rows += 1
             self.dataset_size = num_rows
         return self.dataset_size
@@ -3495,7 +3500,7 @@ class GeneratorDataset(MappableDataset):
             self.dataset_size = rows_from_sampler
         else:
             num_rows = 0
-            for _ in self.create_dict_iterator(num_epochs=1):
+            for _ in self.create_dict_iterator(num_epochs=1, output_numpy=True):
                 num_rows += 1
             self.dataset_size = num_rows
         return self.dataset_size
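
Taken together, the dataset API changes above mean that after this commit both iterators yield MSTensor by default, and output_numpy=True restores the old NumPy behaviour. A minimal usage sketch (the GeneratorDataset source and the column name "col1" are illustrative, not part of this commit):

    import numpy as np
    import mindspore.dataset as ds

    # Illustrative in-memory source with a single made-up column "col1".
    source = [(np.array([i, i + 1], dtype=np.int32),) for i in range(3)]
    data = ds.GeneratorDataset(source, column_names=["col1"], shuffle=False)

    # Default after this commit: each value is a mindspore.Tensor (MSTensor).
    for row in data.create_dict_iterator(num_epochs=1):
        print(type(row["col1"]))

    # output_numpy=True restores the previous behaviour: plain numpy.ndarray values.
    for row in data.create_tuple_iterator(num_epochs=1, output_numpy=True):
        print(type(row[0]))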

@@ -67,8 +67,9 @@ class Iterator:
         dataset: Dataset to be iterated over
     """

-    def __init__(self, dataset, num_epochs=-1):
+    def __init__(self, dataset, num_epochs=-1, output_numpy=False):
         self.num_epochs = num_epochs
+        self.output_numpy = output_numpy
         ITERATORS_LIST.append(weakref.ref(self))
         # create a copy of tree and work on it.
         self.dataset = copy.deepcopy(dataset)
@@ -305,8 +306,8 @@ class DictIterator(Iterator):
     """
     The derived class of Iterator with dict type.
     """

-    def __init__(self, dataset, num_epochs=-1):
-        super().__init__(dataset, num_epochs)
+    def __init__(self, dataset, num_epochs=-1, output_numpy=False):
+        super().__init__(dataset, num_epochs, output_numpy)
         self.depipeline.LaunchTreeExec()

     def check_node_type(self, node):
@@ -323,7 +324,9 @@ class DictIterator(Iterator):
             Dict, the next record in the dataset.
         """
-        return {k: v.as_array() for k, v in self.depipeline.GetNextAsMap().items()}
+        if self.output_numpy:
+            return {k: v.as_array() for k, v in self.depipeline.GetNextAsMap().items()}
+        return {k: Tensor(v.as_array()) for k, v in self.depipeline.GetNextAsMap().items()}


 class TupleIterator(Iterator):
@@ -333,12 +336,12 @@ class TupleIterator(Iterator):
     def check_node_type(self, node):
         pass

-    def __init__(self, dataset, columns=None, num_epochs=-1):
+    def __init__(self, dataset, columns=None, num_epochs=-1, output_numpy=False):
         if columns is not None:
             if not isinstance(columns, list):
                 columns = [columns]
             dataset = dataset.project(columns)
-        super().__init__(dataset, num_epochs)
+        super().__init__(dataset, num_epochs, output_numpy)
         self.depipeline.LaunchTreeExec()

     def __iter__(self):
@@ -352,7 +355,9 @@ class TupleIterator(Iterator):
             List, the next record in the dataset.
         """
-        return [t.as_array() for t in self.depipeline.GetNextAsList()]
+        if self.output_numpy:
+            return [t.as_array() for t in self.depipeline.GetNextAsList()]
+        return [Tensor(t.as_array()) for t in self.depipeline.GetNextAsList()]


 class DummyIterator():
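
For user code that receives one representation but needs the other, the conversion is direct; a small sketch, assuming a NumPy array obtained from an iterator created with output_numpy=True:

    import numpy as np
    from mindspore import Tensor

    a = np.ones((2, 2), dtype=np.float32)  # e.g. a value from an output_numpy=True iterator
    t = Tensor(a)                          # wrap into an MSTensor, as __next__ now does internally
    back = t.asnumpy()                     # and back to NumPy when post-processing needs ndarrays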

@@ -18,8 +18,7 @@ import os
 from mindspore._checkparam import check_bool, check_int
 from .. import context, nn
-from ._utils import _exec_datagraph, _get_types_and_shapes, _to_tensor, \
-    _construct_tensor_list
+from ._utils import _exec_datagraph, _get_types_and_shapes, _construct_tensor_list
 from ..nn.wrap import GetNextSingleOp
 from ..parallel._utils import _get_device_num, _get_global_rank, _need_to_full, _to_full_shapes
 from ..ops import operations as P
@@ -297,4 +296,4 @@ class _DatasetIterNormal:
     def __next__(self):
         data = self.iter.__next__()
-        return _to_tensor(data)
+        return data

@@ -19,7 +19,7 @@ import argparse
 import time
 import numpy as np
 from pycocotools.coco import COCO
-from mindspore import context, Tensor
+from mindspore import context
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.common import set_seed
@@ -68,7 +68,7 @@ def FasterRcnn_eval(dataset_path, ckpt_path, ann_file):
         start = time.time()
         # run net
-        output = net(Tensor(img_data), Tensor(img_metas), Tensor(gt_bboxes), Tensor(gt_labels), Tensor(gt_num))
+        output = net(img_data, img_metas, gt_bboxes, gt_labels, gt_num)
         end = time.time()
         print("Iter {} cost time {}".format(eval_iter, end - start))

@@ -57,7 +57,7 @@ def MaskRcnn_eval(dataset_path, ckpt_path, ann_file):
     print("total images num: ", total)
     print("Processing, please wait a moment.")
     max_num = 128
-    for data in ds.create_dict_iterator():
+    for data in ds.create_dict_iterator(output_numpy=True):
         eval_iter = eval_iter + 1
         img_data = data['image']

@@ -109,7 +109,7 @@ def extract_features(net, dataset_path, config):
                              config=config,
                              repeat_num=1)
     step_size = dataset.get_dataset_size()
-    pbar = tqdm(list(dataset.create_dict_iterator()))
+    pbar = tqdm(list(dataset.create_dict_iterator(output_numpy=True)))
     model = Model(net)
     i = 0
     for data in pbar:

@@ -146,7 +146,7 @@ def test(cloud_args=None):
                                           per_batch_size=args.per_batch_size,
                                           max_epoch=1, rank=args.rank, group_size=args.group_size,
                                           mode='eval')
-    eval_dataloader = de_dataset.create_tuple_iterator()
+    eval_dataloader = de_dataset.create_tuple_iterator(output_numpy=True)
     network = get_network(args.backbone, args.num_classes, platform=args.platform)
     if network is None:
         raise NotImplementedError('not implement {}'.format(args.backbone))

@@ -44,7 +44,7 @@ def ssd_eval(dataset_path, ckpt_path):
     print("\n========================================\n")
     print("total images num: ", total)
     print("Processing, please wait a moment.")
-    for data in ds.create_dict_iterator():
+    for data in ds.create_dict_iterator(output_numpy=True):
         img_id = data['img_id']
         img_np = data['image']
         image_shape = data['image_shape']

@@ -159,7 +159,7 @@ def test(cloud_args=None):
     for model in args.models:
         dataset = classification_dataset(args.data_path, args.image_size, args.per_batch_size, mode='eval')
-        eval_dataloader = dataset.create_tuple_iterator()
+        eval_dataloader = dataset.create_tuple_iterator(output_numpy=True)
         network = vgg16(args.num_classes, args, phase="test")
         # pre_trained

@@ -300,10 +300,10 @@ def test():
     input_shape = Tensor(tuple(config.test_img_shape), ms.float32)
     args.logger.info('Start inference....')
     for i, data in enumerate(ds.create_dict_iterator()):
-        image = Tensor(data["image"])
-        image_shape = Tensor(data["image_shape"])
-        image_id = Tensor(data["img_id"])
+        image = data["image"]
+        image_shape = data["image_shape"]
+        image_id = data["img_id"]
         prediction = network(image, input_shape)
         output_big, output_me, output_small = prediction

@@ -299,7 +299,7 @@ def train():
     old_progress = -1
     t_end = time.time()
-    data_loader = ds.create_dict_iterator()
+    data_loader = ds.create_dict_iterator(output_numpy=True)
     for i, data in enumerate(data_loader):
         images = data["image"]

@@ -306,10 +306,10 @@ def test():
     input_shape = Tensor(tuple(config.test_img_shape), ms.float32)
     args.logger.info('Start inference....')
     for i, data in enumerate(ds.create_dict_iterator()):
-        image = Tensor(data["image"])
-        image_shape = Tensor(data["image_shape"])
-        image_id = Tensor(data["img_id"])
+        image = data["image"]
+        image_shape = data["image_shape"]
+        image_id = data["img_id"]
         prediction = network(image, input_shape)
         output_big, output_me, output_small = prediction

@@ -303,7 +303,7 @@ def train():
     old_progress = -1
     t_end = time.time()
-    data_loader = ds.create_dict_iterator()
+    data_loader = ds.create_dict_iterator(output_numpy=True)
     shape_record = ShapeRecord()
     for i, data in enumerate(data_loader):

@@ -44,7 +44,7 @@ def yolo_eval(dataset_path, ckpt_path):
     print("\n========================================\n")
     print("total images num: ", total)
     print("Processing, please wait a moment.")
-    for data in ds.create_dict_iterator():
+    for data in ds.create_dict_iterator(output_numpy=True):
         img_np = data['image']
         image_shape = data['image_shape']
         annotation = data['annotation']
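
By contrast, evaluation loops like this one hand the data to NumPy-based post-processing (COCO tooling, np.* calls, file writers) rather than to a network, so they opt back into NumPy output with output_numpy=True. A minimal sketch of that pattern, with a made-up "image" column standing in for the real data:

    import numpy as np
    import mindspore.dataset as ds

    source = [(np.ones((4, 4), dtype=np.float32) * i,) for i in range(3)]
    data = ds.GeneratorDataset(source, column_names=["image"], shuffle=False)

    # output_numpy=True keeps each value a numpy.ndarray, which the downstream
    # NumPy-only post-processing code expects.
    for row in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        img_np = row["image"]
        print(img_np.shape, img_np.mean())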

@@ -52,7 +52,7 @@ def train_and_eval():
     eval_class = BGCFEvaluate(parser, train_graph, test_graph, parser.Ks)
-    itr = train_ds.create_dict_iterator(parser.num_epoch)
+    itr = train_ds.create_dict_iterator(parser.num_epoch, output_numpy=True)
     num_iter = int(num_pairs / parser.batch_pairs)
     for _epoch in range(1, parser.num_epoch + 1):

@@ -29,7 +29,6 @@ from mindspore import context
 from mindspore import log as logger
 from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
 from mindspore.nn.optim import AdamWeightDecay, Lamb, Momentum
-from mindspore.common.tensor import Tensor
 from mindspore.train.model import Model
 from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -123,7 +122,7 @@ def do_eval(dataset=None, network=None, num_class=2, assessment_method="accuracy
     for data in dataset.create_dict_iterator():
         input_data = []
         for i in columns_list:
-            input_data.append(Tensor(data[i]))
+            input_data.append(data[i])
         input_ids, input_mask, token_type_id, label_ids = input_data
         logits = model.predict(input_ids, input_mask, token_type_id, label_ids)
         callback.update(logits, label_ids)

@@ -30,7 +30,6 @@ from mindspore import context
 from mindspore import log as logger
 from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
 from mindspore.nn.optim import AdamWeightDecay, Lamb, Momentum
-from mindspore.common.tensor import Tensor
 from mindspore.train.model import Model
 from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -132,7 +131,7 @@ def do_eval(dataset=None, network=None, use_crf="", num_class=2, assessment_meth
     for data in dataset.create_dict_iterator():
         input_data = []
         for i in columns_list:
-            input_data.append(Tensor(data[i]))
+            input_data.append(data[i])
         input_ids, input_mask, token_type_id, label_ids = input_data
         logits = model.predict(input_ids, input_mask, token_type_id, label_ids)
         callback.update(logits, label_ids)

@@ -112,7 +112,7 @@ def do_eval(dataset=None, vocab_file="", eval_json="", load_checkpoint_path="",
     for data in dataset.create_dict_iterator():
         input_data = []
         for i in columns_list:
-            input_data.append(Tensor(data[i]))
+            input_data.append(data[i])
         input_ids, input_mask, segment_ids, unique_ids = input_data
         start_positions = Tensor([1], mstype.float32)
         end_positions = Tensor([1], mstype.float32)

@@ -107,7 +107,7 @@ def transformer_infer(config, dataset):
     probs = []
     source_sentences = []
     target_sentences = []
-    for batch in dataset.create_dict_iterator():
+    for batch in dataset.create_dict_iterator(output_numpy=True):
         source_sentences.append(batch["source_eos_ids"])
         target_sentences.append(batch["target_eos_ids"])
@@ -232,7 +232,7 @@ def transformer_infer_ppl(config, dataset):
     lengths = []
     source_sentences = []
     target_sentences = []
-    for batch in dataset.create_dict_iterator():
+    for batch in dataset.create_dict_iterator(output_numpy=True):
         source_sentences.append(batch["source_eos_ids"])
         target_sentences.append(batch["target_eos_ids"])
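
These inference loops stockpile every batch of the epoch in Python lists before post-processing, so they also use output_numpy=True and keep the batches as plain arrays until tensors are actually needed. A hedged sketch of that accumulation pattern (the column name "source_eos_ids" is borrowed from the hunks above; the dataset source is made up):

    import numpy as np
    import mindspore.dataset as ds

    source = [(np.array([i, i + 1], dtype=np.int32),) for i in range(4)]
    data = ds.GeneratorDataset(source, column_names=["source_eos_ids"], shuffle=False)

    # Collect the whole epoch as numpy.ndarray batches, then post-process in NumPy.
    source_sentences = []
    for batch in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        source_sentences.append(batch["source_eos_ids"])
    print(np.stack(source_sentences).shape)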

@@ -19,7 +19,6 @@ import os
 import re
 import argparse
 import mindspore.common.dtype as mstype
-from mindspore import Tensor
 from mindspore import context
 from mindspore.train.model import Model
 from mindspore.train.callback import TimeMonitor
@@ -282,7 +281,7 @@ def do_eval_standalone():
     for data in eval_dataset.create_dict_iterator():
         input_data = []
         for i in columns_list:
-            input_data.append(Tensor(data[i]))
+            input_data.append(data[i])
         input_ids, input_mask, token_type_id, label_ids = input_data
         logits = eval_model(input_ids, token_type_id, input_mask)
         callback.update(logits[3], label_ids)

@@ -96,7 +96,7 @@ class EvalCallBack(Callback):
         for data in self.dataset.create_dict_iterator():
             input_data = []
             for i in columns_list:
-                input_data.append(Tensor(data[i]))
+                input_data.append(data[i])
             input_ids, input_mask, token_type_id, label_ids = input_data
             self.network.set_train(False)
             logits = self.network(input_ids, token_type_id, input_mask)

@@ -113,7 +113,7 @@ def run_transformer_eval():
     predictions = []
     source_sents = []
     target_sents = []
-    for batch in dataset.create_dict_iterator():
+    for batch in dataset.create_dict_iterator(output_numpy=True):
         source_sents.append(batch["source_eos_ids"])
         target_sents.append(batch["target_eos_ids"])
         source_ids = Tensor(batch["source_eos_ids"], mstype.int32)

@@ -22,7 +22,7 @@ def create_dataset(data_file):
                               num_parallel_workers=num_readers,
                               shuffle=True)
     index = 0
-    for item in data_set.create_dict_iterator():
+    for item in data_set.create_dict_iterator(output_numpy=True):
         print("example {}: {}".format(index, item))
         index += 1
         if index % 1000 == 0:

@@ -28,7 +28,7 @@ args = parser.parse_args()
 data_set = ds.MindDataset(args.path)
 num_iter = 0
-for item in data_set.create_dict_iterator():
+for item in data_set.create_dict_iterator(output_numpy=True):
     print(item)
     num_iter += 1
 print("Total items # is {}".format(num_iter))

@@ -22,7 +22,7 @@ def create_dataset(data_file):
                               num_parallel_workers=num_readers,
                               shuffle=True)
     index = 0
-    for item in data_set.create_dict_iterator():
+    for item in data_set.create_dict_iterator(output_numpy=True):
         print("example {}: {}".format(index, item))
         index += 1
         if index % 1000 == 0:

Some files were not shown because too many files have changed in this diff.
