Refine DataLoader to support multi-processing (#23107)

* add DataLoader, Dataset, BatchSampler

parent 76d78c6387
commit 80cf3c3c4d
@@ -0,0 +1,24 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

from . import dataset
from .dataset import *

from . import batch_sampler
from .batch_sampler import *

__all__ = dataset.__all__ \
    + batch_sampler.__all__
@@ -0,0 +1,143 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
from __future__ import division

import numpy as np
from .dataset import Dataset

__all__ = ["BatchSampler"]


class BatchSampler(object):
    """
    A base implementation of a batch sampler used by `paddle.io.DataLoader`,
    which iteratively yields mini-batch indices (a list/tuple whose length is
    the mini-batch size and which holds sample indices).

    A batch sampler used by :code:`paddle.io.DataLoader` should be a subclass
    of :code:`paddle.io.BatchSampler`. BatchSampler subclasses should
    implement the following methods:

    :code:`__iter__`: iteratively return mini-batch indices.

    :code:`__len__`: return the number of mini-batches in an epoch.

    Args:
        dataset(Dataset): this can be an instance of :code:`paddle.io.Dataset`
            or any other Python object that implements :code:`__len__`, which
            BatchSampler uses to generate indices in the range of the
            :attr:`dataset` length. Default None.
        indices(list|tuple): a substitute parameter for :attr:`dataset`;
            either :attr:`dataset` or :attr:`indices` should be set. It gives
            the whole set of indices to sample from directly. Default None.
        shuffle(bool): whether to shuffle the indices order before generating
            batch indices. Default False.
        batch_size(int): the number of sample indices in a mini-batch. Default 1.
        drop_last(bool): whether to drop the last incomplete batch when the
            dataset size is not divisible by the batch size. Default False.

    Returns:
        BatchSampler: an iterable object for iterating over batch indices.

    Examples:

        .. code-block:: python

            import numpy as np
            from paddle.io import BatchSampler, Dataset

            # init with indices
            bs = BatchSampler(indices=list(range(100)),
                              shuffle=True,
                              batch_size=8,
                              drop_last=True)

            for batch_indices in bs:
                print(batch_indices)

            # init with dataset
            class RandomDataset(Dataset):
                def __init__(self, num_samples):
                    self.num_samples = num_samples

                def __getitem__(self, idx):
                    image = np.random.random([784]).astype('float32')
                    label = np.random.randint(0, 9, (1, )).astype('int64')
                    return image, label

                def __len__(self):
                    return self.num_samples

            bs = BatchSampler(dataset=RandomDataset(100),
                              shuffle=False,
                              batch_size=16,
                              drop_last=False)

            for batch_indices in bs:
                print(batch_indices)

    see `paddle.io.DataLoader`

    """

    def __init__(self,
                 dataset=None,
                 indices=None,
                 shuffle=False,
                 batch_size=1,
                 drop_last=False):
        if dataset is None:
            assert indices is not None, \
                "either dataset or indices should be set"
            assert isinstance(indices, list) or isinstance(indices, tuple), \
                "indices should be a list or tuple, but got {}".format(type(indices))
            self.indices = indices
        else:
            assert isinstance(dataset, Dataset), \
                "dataset should be an instance of paddle.io.Dataset"
            assert indices is None, \
                "should not set both dataset and indices"
            self.indices = list(range(len(dataset)))

        assert isinstance(batch_size, int) and batch_size > 0, \
            "batch_size should be a positive integer, but got {}".format(batch_size)
        self.batch_size = batch_size
        assert isinstance(shuffle, bool), \
            "shuffle should be a boolean value, but got {}".format(type(shuffle))
        self.shuffle = shuffle
        assert isinstance(drop_last, bool), \
            "drop_last should be a boolean value, but got {}".format(type(drop_last))
        self.drop_last = drop_last

    def __iter__(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
        _iter = iter(self.indices)

        batch_indices = []
        for idx in _iter:
            batch_indices.append(idx)
            if len(batch_indices) == self.batch_size:
                yield batch_indices
                batch_indices = []
        if not self.drop_last and len(batch_indices) > 0:
            yield batch_indices

    def __len__(self):
        num_samples = len(self.indices)
        num_samples += int(not self.drop_last) * (self.batch_size - 1)
        return num_samples // self.batch_size
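For context on the `__len__` arithmetic above: adding `batch_size - 1` before the floor division turns it into a ceiling division when `drop_last` is False, so a trailing partial batch is still counted. A minimal standalone sketch with illustrative values (not part of the patch):

# Sketch of the batch-count arithmetic used in BatchSampler.__len__,
# assuming 10 samples and a batch size of 4 (illustrative values only).
num_samples = 10
batch_size = 4

# drop_last=True: plain floor division, the trailing partial batch is discarded.
assert num_samples // batch_size == 2          # batches [0..3] and [4..7]

# drop_last=False: adding (batch_size - 1) makes this a ceiling division,
# so the trailing partial batch [8, 9] is counted as well.
assert (num_samples + batch_size - 1) // batch_size == 3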
File diff suppressed because it is too large
@@ -0,0 +1,73 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import paddle.dataset.common

__all__ = ["Dataset"]


class Dataset(object):
    """
    An abstract class to encapsulate methods and behaviors of datasets.

    All map-style datasets (datasets whose samples can be fetched by a given
    key) should be subclasses of `paddle.io.Dataset`. All subclasses should
    implement the following methods:

    :code:`__getitem__`: get a sample from the dataset with a given index.
    This method is required for reading dataset samples in
    :code:`paddle.io.DataLoader`.

    :code:`__len__`: return the number of samples in the dataset. This method
    is required by some implementations of :code:`paddle.io.BatchSampler`.

    see :code:`paddle.io.DataLoader`.

    Examples:

        .. code-block:: python

            import numpy as np
            from paddle.io import Dataset

            # define a random dataset
            class RandomDataset(Dataset):
                def __init__(self, num_samples):
                    self.num_samples = num_samples

                def __getitem__(self, idx):
                    image = np.random.random([784]).astype('float32')
                    label = np.random.randint(0, 9, (1, )).astype('int64')
                    return image, label

                def __len__(self):
                    return self.num_samples

            dataset = RandomDataset(10)
            for i in range(len(dataset)):
                print(dataset[i])

    """

    def __init__(self):
        pass

    def __getitem__(self, idx):
        raise NotImplementedError("'{}' not implemented in class "\
                "{}".format('__getitem__', self.__class__.__name__))

    def __len__(self):
        raise NotImplementedError("'{}' not implemented in class "\
                "{}".format('__len__', self.__class__.__name__))
@@ -0,0 +1,139 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import six
import sys
import signal
import atexit

from . import core

# NOTE: queue has a different name in python2 and python3
if six.PY2:
    import Queue as queue
else:
    import queue

# NOTE: [ mmap files clear ] If there is still data in the multiprocess queue when the main process finishes reading,
#       the data in the queue needs to be popped. Then the LoDTensor read by the main process
#       from the child process will automatically clear the memory-mapped file.
multiprocess_queue_set = set()


def _clear_multiprocess_queue_set():
    global multiprocess_queue_set
    for data_queue in multiprocess_queue_set:
        while True:
            try:
                data_queue.get_nowait()
            except queue.Empty:
                break


# NOTE: main process clear function at exit
def _cleanup():
    # NOTE: inter-process Queue shared memory objects clear function
    _clear_multiprocess_queue_set()
    # NOTE: main process memory map files clear function
    core._cleanup_mmap_fds()


# NOTE: child process clear function at exit
def _cleanup_mmap():
    # clear memory map files in child process
    core._cleanup_mmap_fds()


# NOTE: used to register a function to be executed at interpreter exit.
class CleanupFuncRegistrar():
    # Record the cleanup functions that have been executed
    _executed_func_set = set()
    # Record the cleanup functions that have been registered
    _registered_func_set = set()

    @classmethod
    def register(cls, function, signals=[]):
        def _func_executor():
            if function not in cls._executed_func_set:
                try:
                    function()
                finally:
                    cls._executed_func_set.add(function)

        def _func_register(function):
            if not callable(function):
                raise TypeError("%s is not callable object." % (function))
            # check whether the function object is hashable
            set([function])
            if function not in cls._registered_func_set:
                atexit.register(_func_executor)
                cls._registered_func_set.add(function)

        def _signal_handler(signum=None, frame=None):
            _func_executor()
            if signum is not None:
                if signum == signal.SIGINT:
                    raise KeyboardInterrupt
                sys.exit(signum)

        def _signal_register(signals):
            signals = set(signals)
            for sig in signals:
                orig_handler = signal.signal(sig, _signal_handler)
                if orig_handler not in (signal.SIG_DFL, signal.SIG_IGN):
                    if (sig == signal.SIGINT and
                            orig_handler is signal.default_int_handler):
                        continue
                    if orig_handler not in cls._registered_func_set:
                        atexit.register(orig_handler)
                        cls._registered_func_set.add(orig_handler)

        # deal with signals
        _signal_register(signals)
        # deal with function
        _func_register(function)


# NOTE: [ mmap files clear ] When the main process exits unexpectedly, the remaining
#       shared memory objects in the inter-process Queue and the main process (mostly in the
#       BlockingQueue) may not be completely released, resulting in the corresponding
#       memory-mapped file remaining on the disk (/dev/shm), so register this function
#       to clean up shared memory objects in these two queues before the python interpreter exits.
# NOTE: Currently multi-process DataLoader only supports the Linux platform
if not (sys.platform == 'darwin' or sys.platform == 'win32'):
    CleanupFuncRegistrar.register(_cleanup)

# ------------ SIGCHLD handler setting --------------
_SIGCHLD_handler_set = False


def _set_SIGCHLD_handler():
    global _SIGCHLD_handler_set
    if _SIGCHLD_handler_set:
        return

    current_handler = signal.getsignal(signal.SIGCHLD)
    if not callable(current_handler):
        current_handler = None

    def __handler__(signum, frame):
        # NOTE: Here the signum is SIGCHLD; this handler will be called
        # whenever a child process exits, normally or abnormally.
        core._throw_error_if_process_failed()
        if current_handler is not None:
            current_handler(signum, frame)

    signal.signal(signal.SIGCHLD, __handler__)
    _SIGCHLD_handler_set = True
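As a usage illustration (not part of the patch), `CleanupFuncRegistrar.register` takes any hashable callable plus an optional list of signals; the callable then runs at most once, either at interpreter exit or when one of those signals fires. A minimal, self-contained sketch of that pattern using only the standard library; `_remove_temp_files` is a hypothetical cleanup function, and the real registrar additionally chains to previously installed handlers:

# Run-once cleanup registered for both normal exit and SIGTERM (illustrative only).
import atexit
import signal
import sys


def _remove_temp_files():
    # hypothetical cleanup work, e.g. deleting files left under /dev/shm
    print("cleaning up temporary shared-memory files")


_executed = set()


def _run_once():
    if _remove_temp_files not in _executed:
        try:
            _remove_temp_files()
        finally:
            _executed.add(_remove_temp_files)


def _handler(signum, frame):
    _run_once()
    sys.exit(signum)


atexit.register(_run_once)               # runs on normal interpreter exit
signal.signal(signal.SIGTERM, _handler)  # runs once if the process is terminated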
File diff suppressed because it is too large
@@ -0,0 +1,120 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import division

import unittest
import numpy as np

import paddle.fluid as fluid
from paddle.io import BatchSampler, Dataset


class RandomDataset(Dataset):
    def __init__(self, sample_num, class_num):
        self.sample_num = sample_num
        self.class_num = class_num

    def __getitem__(self, idx):
        np.random.seed(idx)
        image = np.random.random([784]).astype('float32')
        label = np.random.randint(0, self.class_num - 1, (1, )).astype('int64')
        return image, label

    def __len__(self):
        return self.sample_num


class TestBatchSampler(unittest.TestCase):
    def setUp(self):
        self.num_samples = 1000
        self.num_classes = 10
        self.batch_size = 32
        self.shuffle = False
        self.drop_last = False

    def init_batch_sampler(self):
        dataset = RandomDataset(self.num_samples, self.num_classes)
        bs = BatchSampler(
            dataset=dataset,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            drop_last=self.drop_last)
        return bs

    def test_main(self):
        bs = self.init_batch_sampler()
        # length check
        bs_len = (self.num_samples + int(not self.drop_last) \
                * (self.batch_size - 1)) // self.batch_size
        self.assertTrue(bs_len == len(bs))

        # output indices check
        if not self.shuffle:
            index = 0
            for indices in bs:
                for idx in indices:
                    self.assertTrue(index == idx)
                    index += 1


class TestBatchSamplerDropLast(TestBatchSampler):
    def setUp(self):
        self.num_samples = 1000
        self.num_classes = 10
        self.batch_size = 32
        self.shuffle = False
        self.drop_last = True


class TestBatchSamplerShuffle(TestBatchSampler):
    def setUp(self):
        self.num_samples = 1000
        self.num_classes = 10
        self.batch_size = 32
        self.shuffle = True
        self.drop_last = True


class TestBatchSamplerWithIndices(TestBatchSampler):
    def init_batch_sampler(self):
        bs = BatchSampler(
            indices=list(range(self.num_samples)),
            batch_size=self.batch_size,
            drop_last=self.drop_last)
        return bs


class TestBatchSamplerWithIndicesAndDataSource(unittest.TestCase):
    def setUp(self):
        self.num_samples = 1000
        self.num_classes = 10
        self.batch_size = 32
        self.shuffle = False
        self.drop_last = True

    def test_main(self):
        try:
            dataset = RandomDataset(self.num_samples, self.num_classes)
            bs = BatchSampler(
                dataset=dataset,
                indices=list(range(self.num_samples)),
                batch_size=self.batch_size,
                drop_last=self.drop_last)
            self.assertTrue(False)
        except AssertionError:
            pass


if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,41 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import division

import unittest
import numpy as np

import paddle.fluid as fluid
from paddle.io import *


class TestDatasetAbstract(unittest.TestCase):
    def test_main(self):
        dataset = Dataset()
        try:
            d = dataset[0]
            self.assertTrue(False)
        except NotImplementedError:
            pass

        try:
            l = len(dataset)
            self.assertTrue(False)
        except NotImplementedError:
            pass


if __name__ == '__main__':
    unittest.main()
File diff suppressed because it is too large
@@ -0,0 +1,199 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import division

import os
import sys
import six
import time
import unittest
import multiprocessing
import numpy as np

import paddle.fluid as fluid
from paddle.io import Dataset, BatchSampler, DataLoader
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph.base import to_variable


class RandomDataset(Dataset):
    def __init__(self, sample_num):
        self.sample_num = sample_num

    def __getitem__(self, idx):
        np.random.seed(idx)
        image = np.random.random([784]).astype('float32')
        label = np.random.randint(0, 9, (1, )).astype('int64')
        return image, label

    def __len__(self):
        return self.sample_num


class TestDataLoaderAssert(unittest.TestCase):
    def test_main(self):
        place = fluid.cpu_places()[0]
        with fluid.dygraph.guard(place):
            dataset = RandomDataset(100)
            batch_sampler = BatchSampler(dataset=dataset, batch_size=4)

            # dataset is not an instance of Dataset
            try:
                loader = DataLoader(dataset=batch_sampler, places=place)
                self.assertTrue(False)
            except AssertionError:
                pass

            # places is None
            try:
                loader = DataLoader(dataset=dataset, places=None)
                self.assertTrue(False)
            except AssertionError:
                pass

            # num_workers < 0
            try:
                loader = DataLoader(
                    dataset=dataset, places=place, num_workers=-1)
                self.assertTrue(False)
            except AssertionError:
                pass

            # timeout < 0
            try:
                loader = DataLoader(dataset=dataset, places=place, timeout=-1)
                self.assertTrue(False)
            except AssertionError:
                pass

            # batch_sampler is not an instance of BatchSampler
            try:
                loader = DataLoader(
                    dataset=dataset, places=place, batch_sampler=dataset)
                self.assertTrue(False)
            except AssertionError:
                pass

            # set batch_sampler and shuffle/batch_size/drop_last
            try:
                loader = DataLoader(
                    dataset=dataset,
                    places=place,
                    batch_sampler=batch_sampler,
                    shuffle=True,
                    drop_last=True)
                self.assertTrue(False)
            except AssertionError:
                pass

            # set batch_sampler correctly
            try:
                loader = DataLoader(
                    dataset=dataset, places=place, batch_sampler=batch_sampler)
                self.assertTrue(True)
            except AssertionError:
                self.assertTrue(False)


# CI Coverage cannot record stubs in subprocesses,
# HACK: call _worker_loop in the main process here
class TestDataLoaderWorkerLoop(unittest.TestCase):
    def run_without_worker_done(self, use_shared_memory=True):
        try:
            place = fluid.cpu_places()[0]
            with fluid.dygraph.guard(place):
                dataset = RandomDataset(800)

                # test init_fn
                def _init_fn(worker_id):
                    pass

                # test collate_fn
                def _collate_fn(sample_list):
                    return [
                        np.stack(
                            s, axis=0) for s in list(zip(*sample_list))
                    ]

                loader = DataLoader(
                    dataset,
                    num_workers=1,
                    places=place,
                    use_shared_memory=use_shared_memory)
                assert loader.num_workers > 0, \
                    "go to AssertionError and pass in Mac and Windows"
                loader = iter(loader)
                print("loader length", len(loader))
                indices_queue = multiprocessing.Queue()
                for i in range(10):
                    indices_queue.put([i, i + 10])
                indices_queue.put(None)
                loader._worker_loop(
                    loader._dataset, indices_queue, loader._data_queue,
                    loader._workers_done_event, _collate_fn, _init_fn, 0)
                self.assertTrue(False)
        except AssertionError:
            pass
        except Exception:
            self.assertTrue(False)

    def run_with_worker_done(self, use_shared_memory=True):
        try:
            place = fluid.cpu_places()[0]
            with fluid.dygraph.guard(place):
                dataset = RandomDataset(800)

                # test init_fn
                def _init_fn(worker_id):
                    pass

                # test collate_fn
                def _collate_fn(sample_list):
                    return [
                        np.stack(
                            s, axis=0) for s in list(zip(*sample_list))
                    ]

                loader = DataLoader(
                    dataset,
                    num_workers=1,
                    places=place,
                    use_shared_memory=use_shared_memory)
                assert loader.num_workers > 0, \
                    "go to AssertionError and pass in Mac and Windows"
                loader = iter(loader)
                print("loader length", len(loader))
                indices_queue = multiprocessing.Queue()
                for i in range(10):
                    indices_queue.put([i, i + 10])
                indices_queue.put(None)
                loader._workers_done_event.set()
                loader._worker_loop(
                    loader._dataset, indices_queue, loader._data_queue,
                    loader._workers_done_event, _collate_fn, _init_fn, 0)
                self.assertTrue(True)
        except AssertionError:
            pass
        except Exception:
            self.assertTrue(False)

    def test_main(self):
        for use_shared_memory in [True, False]:
            self.run_without_worker_done(use_shared_memory)
            self.run_with_worker_done(use_shared_memory)


if __name__ == '__main__':
    unittest.main()
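To tie the pieces of this patch together, here is a hedged end-to-end sketch of how Dataset, BatchSampler, and the multi-process DataLoader are combined. It uses only constructor arguments that appear in the tests above (dataset, places, batch_sampler, num_workers, use_shared_memory); the per-item unpacking and the training step are assumptions, not part of this diff.

# Minimal usage sketch combining Dataset, BatchSampler and the multi-process
# DataLoader (assumptions noted inline; illustrative only).
import numpy as np
import paddle.fluid as fluid
from paddle.io import Dataset, BatchSampler, DataLoader


class RandomDataset(Dataset):
    def __init__(self, sample_num):
        self.sample_num = sample_num

    def __getitem__(self, idx):
        image = np.random.random([784]).astype('float32')
        label = np.random.randint(0, 9, (1, )).astype('int64')
        return image, label

    def __len__(self):
        return self.sample_num


place = fluid.cpu_places()[0]
with fluid.dygraph.guard(place):
    dataset = RandomDataset(100)
    # an explicit BatchSampler; shuffle/batch_size/drop_last must not also be
    # passed to DataLoader when batch_sampler is given (see the asserts tested above)
    sampler = BatchSampler(dataset=dataset, batch_size=16, shuffle=True)
    loader = DataLoader(
        dataset,
        places=place,
        batch_sampler=sampler,
        num_workers=2,            # >0 enables multi-process loading (Linux only)
        use_shared_memory=True)   # workers hand batches back via shared memory

    for data in loader:  # assumption: each item is one collated mini-batch
        pass             # a training step would consume `data` here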