update DataLoader doc (#28290)

* update DataLoader doc. test=develop
5 years ago · 95b1868366
parent 71d6220772
commit 95b1868366
1 changed files with 35 additions and 68 deletions
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -153,18 +153,22 @@ class DataLoader(object):
    multi-process workers will be used to load data asynchronously if
    :attr:`num_workers` is set as a positive number.

-    DataLoader only supports map-style dataset(can get a sample from
-    dataset with a given index) currently, for a map-style dataset,
-    please see :code:`paddle.io.Dataset`.
+    DataLoader supports map-style dataset and iterable-style dataset.

-    batch_sampler please see :code:`paddle.io.BatchSampler`
+    For map-style dataset (can get a sample from dataset with a given
+    index), please see :code:`paddle.io.Dataset`.
+
+    For iterable-style dataset (get samples from dataset iteratively,
+    like a Python iterator), please see :code:`paddle.io.IterableDataset`.
+
+    For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler`

    Args:  
        dataset(Dataset): the dataset to load data from, should be an
            instance of subclass of :code:`paddle.io.Dataset` or
            :code:`paddle.io.IterableDataset`.
        feed_list (list(Tensor)|tuple(Tensor)): feed variable list.
-            The variables should be created by :code:`fluid.data()`.
+            The variables should be created by :code:`paddle.static.data()`.
            :attr:`feed_list` must be set if :attr:`return_list` is
            False. Default None.
        places(list(Place)|tuple(Place)|optional): a list of Place,
@@ -173,10 +177,10 @@ class DataLoader(object):
            will be used. Default None.
        return_list (bool): whether the return value on each device is 
            presented as a list. If :attr:`return_list=False`, the return
-            value on each device would be a dict of str -> LoDTensor, where
+            value on each device would be a dict of str -> Tensor, where
            the key of the dict is the name of each fed variables. If 
            :attr:`return_list=True`, the return value on each device would
-            be a list(LoDTensor). :attr:`return_list` can only be True
+            be a list(Tensor). :attr:`return_list` can only be True
            in dynamic graph mode. Default False.
        batch_sampler(BatchSampler): an instance of `paddle.io.BatchSampler`
            to generate batch indices to draw samples from :attr:`dataset`
@@ -224,7 +228,8 @@ class DataLoader(object):
            import numpy as np

            import paddle
-            import paddle.fluid as fluid
+            import paddle.nn as nn
+            import paddle.nn.functional as F
            from paddle.io import Dataset, BatchSampler, DataLoader

            BATCH_NUM = 20
@@ -234,8 +239,6 @@ class DataLoader(object):
            IMAGE_SIZE = 784
            CLASS_NUM = 10

-            USE_GPU = False # whether use GPU to run model
-
            # define a random dataset
            class RandomDataset(Dataset):
                def __init__(self, num_samples):
@@ -251,78 +254,34 @@ class DataLoader(object):

            dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)

-            # get places
-            places = fluid.cuda_places() if USE_GPU else fluid.cpu_places()
-
-            # --------------------- dygraph mode --------------------
-
-            class SimpleNet(fluid.dygraph.Layer):
+            class SimpleNet(nn.Layer):
                def __init__(self):
                    super(SimpleNet, self).__init__()
-                    self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax')
+                    self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)

                def forward(self, image, label=None):
                    return self.fc(image)

-            with fluid.dygraph.guard(places[0]):
-                simple_net = SimpleNet()
-                opt = fluid.optimizer.SGD(learning_rate=1e-3,
-                                          parameter_list=simple_net.parameters())
-
-                loader = DataLoader(dataset,
-                                    batch_size=BATCH_SIZE,
-                                    shuffle=True,
-                                    drop_last=True,
-                                    num_workers=2)
-
-                for e in range(EPOCH_NUM):
-                    for i, (image, label) in enumerate(loader()):
-                        out = simple_net(image)
-                        loss = fluid.layers.cross_entropy(out, label)
-                        avg_loss = fluid.layers.reduce_mean(loss)
-                        avg_loss.backward()
-                        opt.minimize(avg_loss)
-                        simple_net.clear_gradients()
-                        print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
-
-            # -------------------------------------------------------
-
-            # -------------------- static graph ---------------------
-
-            paddle.enable_static()
-
-            def simple_net(image, label):
-                fc_tmp = fluid.layers.fc(image, size=CLASS_NUM, act='softmax')
-                cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label)
-                loss = fluid.layers.reduce_mean(cross_entropy)
-                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
-                sgd.minimize(loss)
-                return loss
-
-            image = fluid.data(name='image', shape=[None, IMAGE_SIZE], dtype='float32')
-            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
-
-            loss = simple_net(image, label)
-
-            exe = fluid.Executor(places[0])
-            exe.run(fluid.default_startup_program())
-
-            prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
+            simple_net = SimpleNet()
+            opt = paddle.optimizer.SGD(learning_rate=1e-3,
+                                      parameters=simple_net.parameters())

            loader = DataLoader(dataset,
-                                feed_list=[image, label],
-                                batch_size=BATCH_SIZE, 
+                                batch_size=BATCH_SIZE,
                                shuffle=True,
                                drop_last=True,
                                num_workers=2)

            for e in range(EPOCH_NUM):
-                for i, data in enumerate(loader()):
-                    l = exe.run(prog, feed=data, fetch_list=[loss], return_numpy=True)
-                    print("Epoch {} batch {}: loss = {}".format(e, i, l[0][0]))
+                for i, (image, label) in enumerate(loader()):
+                    out = simple_net(image)
+                    loss = F.cross_entropy(out, label)
+                    avg_loss = paddle.mean(loss)
+                    avg_loss.backward()
+                    opt.minimize(avg_loss)
+                    simple_net.clear_gradients()
+                    print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))

-            # -------------------------------------------------------
-                

    .. note::
        For reading iterable dataset with multiprocess Dataloader,
@@ -439,6 +398,10 @@ class DataLoader(object):
                       use_multiprocess=False,
                       drop_last=True):
        """
+        .. warning::
+          This API will be deprecated in the future. It is recommended to use
+          :code:`paddle.io.DataLoader`, which supports multi-process acceleration.
+
        .. note::
          **The framework ensures that the data loading order of DataLoader is exactly the same as the user-defined data source.**

@@ -684,6 +647,10 @@ class DataLoader(object):
    @staticmethod
    def from_dataset(dataset, places, drop_last=True):
        """
+        .. warning::
+          This API will be deprecated in the future. It is recommended to use
+          :code:`paddle.io.DataLoader`, which supports multi-process acceleration.
+
        Create an iterable DataLoader object for loading data from Dataset.    
        Dataset is only supported in Linux system currently.