Refine DataLoader support multi-processing (#23107)

* add DataLoader, Dataset, BatchSampler
Author: Kaipeng Deng (committed by GitHub)
commit 80cf3c3c4d, parent 76d78c6387

@ -22,21 +22,23 @@
#include <atomic>
#include <csignal>
#include <map>
#include <set>
#include "paddle/fluid/memory/allocation/mmap_allocator.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace imperative {
static std::map<int64_t, pid_t> load_process_pids;
static std::map<int64_t, std::set<pid_t>> load_process_pids;
void SetLoadProcessPID(int64_t key, pid_t pid) {
VLOG(3) << "Dygraph Data Loader: set loader child process PID (" << key
<< ", " << pid << ")";
load_process_pids[key] = pid;
void SetLoadProcessPIDs(int64_t key, std::set<pid_t> pids) {
VLOG(3) << "DataLoader: set loader child process PID (" << key
<< ", pid number: " << pids.size() << ")";
load_process_pids[key] = pids;
}
void EraseLoadProcessPID(int64_t key) {
void EraseLoadProcessPIDs(int64_t key) {
auto it = load_process_pids.find(key);
// Note: it is possible that the key is not found
if (it != load_process_pids.end()) {
@ -54,17 +56,21 @@ void EraseLoadProcessPID(int64_t key) {
// siginfo_t doc: https://www.mkssoftware.com/docs/man5/siginfo_t.5.asp
// waitid doc: https://linux.die.net/man/2/waitid
#define SIGNAL_HANDLE(SIGNAL) \
do { \
struct sigaction sa; \
sa.sa_handler = SIG_DFL; \
sa.sa_flags = 0; \
if (sigemptyset(&sa.sa_mask) != 0 || \
sigaction(SIGNAL, &sa, nullptr) != 0) { \
_exit(EXIT_FAILURE); \
} else { \
raise(SIGNAL); \
} \
// Clear mmap fds in the signal handler to make sure mmap cleanup runs during
// signal handling, so no mmap cleanup handler needs to be registered on the
// Python side. If shared memory is not used, Clear() does nothing.
#define SIGNAL_HANDLE(SIGNAL) \
do { \
memory::allocation::MemoryMapFdSet::Instance().Clear(); \
struct sigaction sa; \
sa.sa_handler = SIG_DFL; \
sa.sa_flags = 0; \
if (sigemptyset(&sa.sa_mask) != 0 || \
sigaction(SIGNAL, &sa, nullptr) != 0) { \
_exit(EXIT_FAILURE); \
} else { \
raise(SIGNAL); \
} \
} while (0)
#define REGISTER_SIGNAL_HANDLER(SIGNAL, HANDLER_NAME) \
@ -106,44 +112,62 @@ void SetLoadProcessSignalHandler() {
void ThrowErrorIfLoadProcessFailed() {
int error;
std::set<pid_t> *pids_set;
pid_t process_pid;
siginfo_t infop;
  for (auto &w : load_process_pids) {
    process_pid = w.second;
    // Use waitid rather than waitpid so that we can set NOWAIT, and that Python
    // and other handlers can get whatever info they want about the child.
    infop.si_pid = 0;
    VLOG(3) << "Dygraph Data Loader: monitor loader child process "
            << process_pid;
    error = waitid(P_PID, process_pid, &infop, WEXITED | WNOHANG | WNOWAIT);
    // ignore errors and case with no waitable child
    if (error < 0 || infop.si_pid == 0) continue;
    if (infop.si_code == CLD_EXITED &&
        infop.si_status != EXIT_SUCCESS) {  // exit with error
      PADDLE_THROW(platform::errors::Fatal(
          "DataLoader process (pid %ld) exited unexpectedly with code %d. "
          "Error details are lost due to multiprocessing. Rerunning with "
          "DataLoader.from_generator(..., use_multiprocess=False) may give "
          "a better error trace.",
          process_pid, infop.si_status));
    } else if (infop.si_code == CLD_KILLED ||
               infop.si_code == CLD_DUMPED) {  // killed by signal
      if (infop.si_status == SIGBUS) {
        PADDLE_THROW(platform::errors::Fatal(
            "DataLoader process (pid %ld) was killed by signal: %s.\n"
            "  It may be caused by insufficient shared storage space. This "
            "problem usually occurs when using docker as a development "
            "environment.\n  Please use the command `df -h` to check the "
            "storage space of `/dev/shm`. Shared storage space needs to be "
            "greater than (DataLoader Num * DataLoader queue capacity * 1 "
            "batch data size).\n  You can solve this problem by increasing "
            "the shared storage space or reducing the queue capacity "
            "appropriately.",
            process_pid, strsignal(infop.si_status)));
      } else {
        PADDLE_THROW(platform::errors::Fatal(
            "DataLoader process (pid %ld) was killed by signal: %s.",
            process_pid, strsignal(infop.si_status)));
      }
    }
  }
  for (auto &p : load_process_pids) {
    pids_set = &(p.second);
    for (auto pid_it = pids_set->begin(); pid_it != pids_set->end(); ++pid_it) {
      process_pid = *pid_it;
      // Use waitid rather than waitpid so that we can set NOWAIT, and that
      // Python and other handlers can get whatever info they want about the
      // child.
      infop.si_pid = 0;
      VLOG(3) << "DataLoader: monitor loader child process " << process_pid;
      error = waitid(P_PID, process_pid, &infop, WEXITED | WNOHANG | WNOWAIT);
      // ignore errors and case with no waitable child
      if (error < 0 || infop.si_pid == 0) continue;
      if (infop.si_code == CLD_EXITED &&
          infop.si_status != EXIT_SUCCESS) {  // exit with error
        pids_set->clear();
        PADDLE_THROW(platform::errors::Fatal(
            "DataLoader process (pid %ld) exited unexpectedly with code %d. "
            "Error details are lost due to multiprocessing. Rerunning with:\n"
            "  1. If the DataLoader is created by DataLoader.from_generator"
            "(...), rerunning with DataLoader.from_generator(..., "
            "use_multiprocess=False) may give a better error trace.\n"
            "  2. If the DataLoader is created by DataLoader(dataset, ...), "
            "rerunning with DataLoader(dataset, ..., num_workers=0) may give "
            "a better error trace.",
            process_pid, infop.si_status));
      } else if (infop.si_code == CLD_KILLED ||
                 infop.si_code == CLD_DUMPED) {  // killed by signal
        if (infop.si_status == SIGBUS) {
          pids_set->clear();
          PADDLE_THROW(platform::errors::Fatal(
              "DataLoader process (pid %ld) was killed by signal: %s.\n"
              "  It may be caused by insufficient shared storage space. This "
              "problem usually occurs when using docker as a development "
              "environment.\n  Please use the command `df -h` to check the "
              "storage space of `/dev/shm`. Shared storage space needs to be "
              "greater than (DataLoader Num * DataLoader queue capacity * 1 "
              "batch data size).\n  You can solve this problem by increasing "
              "the shared storage space or reducing the queue capacity "
              "appropriately.\n"
              "  1. If the DataLoader is created by DataLoader.from_generator"
              "(...), queue capacity is set by from_generator(..., "
              "capacity=xx, ...).\n"
              "  2. If the DataLoader is created by DataLoader(dataset, ...), "
              "queue capacity is set to 2 times the max of num_workers and "
              "len(places).\n"
              "  3. If created by DataLoader(dataset, ..., "
              "use_shared_memory=True), setting use_shared_memory=False "
              "avoids using shared memory entirely.",
              process_pid, strsignal(infop.si_status)));
        } else {
          PADDLE_THROW(platform::errors::Fatal(
              "DataLoader process (pid %ld) was killed by signal: %s.",
              process_pid, strsignal(infop.si_status)));
        }
      }
    }
  }
}

@ -18,12 +18,13 @@
#include <unistd.h>
#include <cstdint>
#include <set>
namespace paddle {
namespace imperative {
extern void SetLoadProcessPID(int64_t key, pid_t pid);
extern void EraseLoadProcessPID(int64_t key);
extern void SetLoadProcessPIDs(int64_t key, std::set<pid_t> pids);
extern void EraseLoadProcessPIDs(int64_t key);
extern void SetLoadProcessSignalHandler();
extern void ThrowErrorIfLoadProcessFailed();

@ -20,6 +20,7 @@ limitations under the License. */
#include <pybind11/functional.h>
#include <pybind11/stl.h>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
@ -290,11 +291,22 @@ void BindImperative(py::module *m_ptr) {
#ifndef _WIN32
// Dygraph DataLoader signal handler
m.def("_set_process_pid", [](int64_t key, pid_t pid) {
imperative::SetLoadProcessPID(key, pid);
m.def("_set_process_pids", [](int64_t key, py::object &obj) {
PADDLE_ENFORCE_EQ(
py::isinstance<py::tuple>(obj) || py::isinstance<py::list>(obj), true,
platform::errors::InvalidArgument(
"The subprocess ids set in DataLoader is illegal."
"Expected data type is tuple or list, but received %s",
obj.get_type()));
py::list pids = py::cast<py::list>(obj);
std::set<pid_t> pids_set = {};
for (size_t i = 0; i < pids.size(); i++) {
pids_set.insert(pids[i].cast<pid_t>());
}
imperative::SetLoadProcessPIDs(key, pids_set);
});
m.def("_erase_process_pid",
[](int64_t key) { imperative::EraseLoadProcessPID(key); });
m.def("_erase_process_pids",
[](int64_t key) { imperative::EraseLoadProcessPIDs(key); });
m.def("_set_process_signal_handler",
[]() { imperative::SetLoadProcessSignalHandler(); });
m.def("_throw_error_if_process_failed",

@ -252,7 +252,9 @@ def get_cluster(node_ips, node_ip, paddle_ports, selected_gpus):
def terminate_local_procs(procs):
for p in procs:
if p.proc.poll() is None:
p.proc.terminate()
            # the subprocess needs to release its resources (e.g. shared
            # memory) before exiting; use join to wait for it to do so
p.proc.join(timeout=1)
p.log_fn.close()
logger.debug("terminate process id:{}".format(p.proc.pid))

@ -185,8 +185,8 @@ if avx_supported():
from .core_avx import _load_dygraph_dict
from .core_avx import _create_loaded_parameter
if sys.platform != 'win32':
from .core_avx import _set_process_pid
from .core_avx import _erase_process_pid
from .core_avx import _set_process_pids
from .core_avx import _erase_process_pids
from .core_avx import _set_process_signal_handler
from .core_avx import _throw_error_if_process_failed
from .core_avx import _convert_to_tensor_list
@ -229,8 +229,8 @@ if load_noavx:
from .core_noavx import _load_dygraph_dict
from .core_noavx import _create_loaded_parameter
if sys.platform != 'win32':
from .core_noavx import _set_process_pid
from .core_noavx import _erase_process_pid
from .core_noavx import _set_process_pids
from .core_noavx import _erase_process_pids
from .core_noavx import _set_process_signal_handler
from .core_noavx import _throw_error_if_process_failed
from .core_noavx import _convert_to_tensor_list

@ -0,0 +1,24 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import dataset
from .dataset import *
from . import batch_sampler
from .batch_sampler import *
__all__ = dataset.__all__ \
+ batch_sampler.__all__

@ -0,0 +1,143 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from __future__ import division
import numpy as np
from .dataset import Dataset
__all__ = ["BatchSampler"]
class BatchSampler(object):
"""
    A base implementation of a batch sampler, used by `paddle.io.DataLoader`,
    which iterably yields mini-batch indices: a list/tuple whose length is
    the mini-batch size and whose entries are sample indices.
    A batch sampler used by :code:`paddle.io.DataLoader` should be a subclass
    of :code:`paddle.io.BatchSampler`; BatchSampler subclasses should
    implement the following methods:
    :code:`__iter__`: return mini-batch indices iterably.
    :code:`__len__`: get the number of mini-batches in an epoch.
    Args:
        dataset(Dataset): this can be an instance of :code:`paddle.io.Dataset`
            or another python object implementing :code:`__len__`, from which
            BatchSampler derives indices in the range of the :attr:`dataset`
            length. Default None.
        indices (list|tuple): a substitute parameter for :attr:`dataset`;
            either :attr:`dataset` or :attr:`indices` should be set, giving
            the whole set of indices to sample from directly. Default None.
        shuffle(bool): whether to shuffle the indices before generating batch
            indices. Default False.
        batch_size(int): the number of sample indices in a mini-batch.
            Default 1.
        drop_last(bool): whether to drop the last incomplete batch when the
            dataset size is not divisible by the batch size. Default False.
    Returns:
        BatchSampler: an iterable object that yields mini-batch indices.
Examples:
.. code-block:: python
from paddle.io import BatchSampler, Dataset
# init with indices
bs = BatchSampler(indices=list(range(100)),
shuffle=True,
batch_size=8,
drop_last=True)
for batch_indices in bs:
print(batch_indices)
# init with dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
bs = BatchSampler(dataset=RandomDataset(100),
shuffle=False,
batch_size=16,
drop_last=False)
for batch_indices in bs:
print(batch_indices)
    see also: :code:`paddle.io.DataLoader`
"""
def __init__(self,
dataset=None,
indices=None,
shuffle=False,
batch_size=1,
drop_last=False):
if dataset is None:
assert indices is not None, \
"either dataset or indices should be set"
assert isinstance(indices, list) or isinstance(indices, tuple), \
"indices should be a list or tuple, but got {}".format(type(indices))
self.indices = indices
else:
assert isinstance(dataset, Dataset), \
"dataset should be an instance of paddle.io.Dataset"
assert indices is None, \
"should not set both dataset and indices"
self.indices = list(range(len(dataset)))
assert isinstance(batch_size, int) and batch_size > 0, \
"batch_size should be a positive integer, but got {}".format(batch_size)
self.batch_size = batch_size
assert isinstance(shuffle, bool), \
"shuffle should be a boolean value, but got {}".format(type(shuffle))
self.shuffle = shuffle
assert isinstance(drop_last, bool), \
"drop_last should be a boolean value, but got {}".format(type(drop_last))
self.drop_last = drop_last
def __iter__(self):
if self.shuffle:
np.random.shuffle(self.indices)
_iter = iter(self.indices)
batch_indices = []
for idx in _iter:
batch_indices.append(idx)
if len(batch_indices) == self.batch_size:
yield batch_indices
batch_indices = []
if not self.drop_last and len(batch_indices) > 0:
yield batch_indices
def __len__(self):
num_samples = len(self.indices)
num_samples += int(not self.drop_last) * (self.batch_size - 1)
return num_samples // self.batch_size
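As a quick sanity check of the `__len__` arithmetic above, a sketch using the class as defined here: with 1000 indices and batch_size=32, drop_last=False yields (1000 + 31) // 32 = 32 batches (the last holds 8 samples), while drop_last=True yields 1000 // 32 = 31.

from paddle.io import BatchSampler

bs = BatchSampler(indices=list(range(1000)), batch_size=32, drop_last=False)
assert len(bs) == 32
bs = BatchSampler(indices=list(range(1000)), batch_size=32, drop_last=True)
assert len(bs) == 31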

File diff suppressed because it is too large.

@ -0,0 +1,73 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.common
__all__ = ["Dataset"]
class Dataset(object):
"""
    An abstract class encapsulating the methods and behaviors of datasets.
    All map-style datasets (datasets whose samples can be fetched by a given
    key) should be subclasses of `paddle.io.Dataset`. All subclasses should
    implement the following methods:
    :code:`__getitem__`: get a sample from the dataset with a given index. This
    method is required for reading dataset samples in :code:`paddle.io.DataLoader`.
    :code:`__len__`: return the number of samples in the dataset. This method
    is required by some implementations of :code:`paddle.io.BatchSampler`.
    See :code:`paddle.io.DataLoader`.
Examples:
.. code-block:: python
import numpy as np
from paddle.io import Dataset
# define a random dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = RandomDataset(10)
for i in range(len(dataset)):
print(dataset[i])
"""
def __init__(self):
pass
def __getitem__(self, idx):
        raise NotImplementedError("'{}' not implemented in class "\
                "{}".format('__getitem__', self.__class__.__name__))
    def __len__(self):
        raise NotImplementedError("'{}' not implemented in class "\
                "{}".format('__len__', self.__class__.__name__))

@ -37,6 +37,8 @@ from paddle.fluid.compiler import CompiledProgram
from paddle.fluid.log_helper import get_logger
from . import reader
from .reader import *
from . import dataloader
from .dataloader import *
from . import core
from .. import compat as cpt

@ -0,0 +1,139 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import sys
import signal
import atexit
from . import core
# NOTE: queue has a different name in python2 and python3
if six.PY2:
import Queue as queue
else:
import queue
# NOTE: [ mmap files clear ] If there is still data in the multiprocess queue when the main process finishes reading,
# the data in the queue needs to be popped. Then the LoDTensor read by the main process
# from the child process will automatically clear the memory-mapped file.
multiprocess_queue_set = set()
def _clear_multiprocess_queue_set():
global multiprocess_queue_set
for data_queue in multiprocess_queue_set:
while True:
try:
data_queue.get_nowait()
except queue.Empty:
break
# NOTE: main process cleanup function, run at exit
def _cleanup():
    # NOTE: clear shared memory objects left in inter-process queues
    _clear_multiprocess_queue_set()
    # NOTE: clear the main process's memory-mapped files
    core._cleanup_mmap_fds()
# NOTE: child process cleanup function, run at exit
def _cleanup_mmap():
# clear memory map files in child process
core._cleanup_mmap_fds()
# NOTE: used to register a function to be executed at interpreter exit.
class CleanupFuncRegistrar():
# Record the cleanup functions that have been executed
_executed_func_set = set()
# Record the cleanup functions that have been registered
_registered_func_set = set()
@classmethod
def register(cls, function, signals=[]):
        def _func_executor():
if function not in cls._executed_func_set:
try:
function()
finally:
cls._executed_func_set.add(function)
def _func_register(function):
if not callable(function):
                raise TypeError("%s is not a callable object." % (function))
            # check whether the function object is hashable
set([function])
if function not in cls._registered_func_set:
                atexit.register(_func_executor)
cls._registered_func_set.add(function)
def _signal_handler(signum=None, frame=None):
            _func_executor()
if signum is not None:
if signum == signal.SIGINT:
raise KeyboardInterrupt
sys.exit(signum)
def _signal_register(signals):
signals = set(signals)
for sig in signals:
orig_handler = signal.signal(sig, _signal_handler)
if orig_handler not in (signal.SIG_DFL, signal.SIG_IGN):
if (sig == signal.SIGINT and
orig_handler is signal.default_int_handler):
continue
if orig_handler not in cls._registered_func_set:
atexit.register(orig_handler)
cls._registered_func_set.add(orig_handler)
# deal with signals
_signal_register(signals)
# deal with function
_func_register(function)
# NOTE: [ mmap files clear ] When the main process exits unexpectedly, the remaining
# shared memory objects in the inter-process Queue and the main process (mostly in the
# BlockingQueue) may not be completely released, resulting in the corresponding
# memory-mapped file remaining on the disk (/dev/shm), so register this function
# to clean up shared memory objects in these two queues before the python interpreter exits.
# NOTE: Currently multi-process DataLoader only supports Linux platform
if not (sys.platform == 'darwin' or sys.platform == 'win32'):
CleanupFuncRegistrar.register(_cleanup)
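For illustration, a sketch of registering a custom cleanup function with both atexit and signal coverage; `release_resources` is a hypothetical function, not part of this PR (`signal` is already imported above):

def release_resources():
    # hypothetical cleanup body; the registrar guarantees it runs at most
    # once, whether triggered by atexit or by SIGTERM
    pass

CleanupFuncRegistrar.register(release_resources, signals=[signal.SIGTERM])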
# ------------ SIGCHLD handler setting --------------
_SIGCHLD_handler_set = False
def _set_SIGCHLD_handler():
global _SIGCHLD_handler_set
if _SIGCHLD_handler_set:
return
current_handler = signal.getsignal(signal.SIGCHLD)
if not callable(current_handler):
current_handler = None
def __handler__(signum, frame):
        # NOTE: this handler is installed for SIGCHLD, so it is called
        # whenever a child process exits, whether normally or abnormally.
core._throw_error_if_process_failed()
if current_handler is not None:
current_handler(signum, frame)
signal.signal(signal.SIGCHLD, __handler__)
_SIGCHLD_handler_set = True
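Taken together, the parent-side monitoring flow looks roughly like this (a sketch; `loader` and `worker_pids` are hypothetical stand-ins, and `core` is the module imported above):

def _start_monitoring(loader, worker_pids):
    # register worker PIDs so the C++ monitor can waitid() them
    core._set_process_pids(id(loader), tuple(worker_pids))
    # from now on any SIGCHLD triggers core._throw_error_if_process_failed(),
    # which re-raises worker crashes (CLD_EXITED / CLD_KILLED) in the parent
    _set_SIGCHLD_handler()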

File diff suppressed because it is too large.

@ -211,6 +211,8 @@ if (APPLE OR WIN32)
list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_fds_clear)
list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exit_func)
list(REMOVE_ITEM TEST_OPS test_imperative_signal_handler)
list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_base)
list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_exception)
endif()
if(NOT WITH_GPU OR WIN32 OR APPLE)
@ -381,4 +383,6 @@ if(NOT WIN32 AND NOT APPLE)
set_tests_properties(test_imperative_data_loader_base PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_imperative_data_loader_fds_clear PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
# set_tests_properties(test_imperative_data_loader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_multiprocess_dataloader_base PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_multiprocess_dataloader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
endif()

@ -0,0 +1,120 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import unittest
import numpy as np
import paddle.fluid as fluid
from paddle.io import BatchSampler, Dataset
class RandomDataset(Dataset):
def __init__(self, sample_num, class_num):
self.sample_num = sample_num
self.class_num = class_num
def __getitem__(self, idx):
np.random.seed(idx)
        image = np.random.random([784]).astype('float32')
        label = np.random.randint(0, self.class_num - 1, (1, )).astype('int64')
return image, label
def __len__(self):
return self.sample_num
class TestBatchSampler(unittest.TestCase):
def setUp(self):
self.num_samples = 1000
self.num_classes = 10
self.batch_size = 32
self.shuffle = False
self.drop_last = False
def init_batch_sampler(self):
dataset = RandomDataset(self.num_samples, self.num_classes)
bs = BatchSampler(
dataset=dataset,
batch_size=self.batch_size,
shuffle=self.shuffle,
drop_last=self.drop_last)
return bs
def test_main(self):
bs = self.init_batch_sampler()
# length check
bs_len = (self.num_samples + int(not self.drop_last) \
* (self.batch_size - 1)) // self.batch_size
self.assertTrue(bs_len == len(bs))
# output indices check
if not self.shuffle:
index = 0
for indices in bs:
for idx in indices:
self.assertTrue(index == idx)
index += 1
class TestBatchSamplerDropLast(TestBatchSampler):
def setUp(self):
self.num_samples = 1000
self.num_classes = 10
self.batch_size = 32
self.shuffle = False
self.drop_last = True
class TestBatchSamplerShuffle(TestBatchSampler):
def setUp(self):
self.num_samples = 1000
self.num_classes = 10
self.batch_size = 32
self.shuffle = True
self.drop_last = True
class TestBatchSamplerWithIndices(TestBatchSampler):
def init_batch_sampler(self):
bs = BatchSampler(
indices=list(range(self.num_samples)),
batch_size=self.batch_size,
drop_last=self.drop_last)
return bs
class TestBatchSamplerWithIndicesAndDataSource(unittest.TestCase):
def setUp(self):
self.num_samples = 1000
self.num_classes = 10
self.batch_size = 32
self.shuffle = False
self.drop_last = True
def test_main(self):
try:
dataset = RandomDataset(self.num_samples, self.num_classes)
bs = BatchSampler(
dataset=dataset,
indices=list(range(self.num_samples)),
batch_size=self.batch_size,
drop_last=self.drop_last)
self.assertTrue(False)
except AssertionError:
pass
if __name__ == '__main__':
unittest.main()

@ -0,0 +1,41 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import unittest
import numpy as np
import paddle.fluid as fluid
from paddle.io import *
class TestDatasetAbstract(unittest.TestCase):
def test_main(self):
dataset = Dataset()
try:
d = dataset[0]
self.assertTrue(False)
except NotImplementedError:
pass
try:
l = len(dataset)
self.assertTrue(False)
except NotImplementedError:
pass
if __name__ == '__main__':
unittest.main()

@ -17,6 +17,7 @@ import unittest
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.io import Dataset, DataLoader
def get_random_images_and_labels(image_shape, label_shape):
@ -35,6 +36,20 @@ def batch_generator_creator(batch_size, batch_num):
return __reader__
class RandomDataset(Dataset):
def __init__(self, sample_num):
self.sample_num = sample_num
def __getitem__(self, idx):
np.random.seed(idx)
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.sample_num
class TestDygraphDataLoaderMmapFdsClear(unittest.TestCase):
def setUp(self):
self.batch_size = 8
@ -74,5 +89,19 @@ class TestDygraphDataLoaderMmapFdsClear(unittest.TestCase):
self.run_one_epoch_with_break(loader)
class TestMultiProcessDataLoaderMmapFdsClear(TestDygraphDataLoaderMmapFdsClear):
def prepare_data_loader(self):
place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
dataset = RandomDataset(self.batch_size * self.batch_num)
loader = DataLoader(
dataset,
places=place,
batch_size=self.batch_size,
drop_last=True,
num_workers=2)
return loader
if __name__ == '__main__':
unittest.main()

@ -24,7 +24,7 @@ from paddle.fluid import core
def set_child_signal_handler(self, child_pid):
core._set_process_pid(id(self), child_pid)
core._set_process_pids(id(self), tuple([child_pid]))
current_handler = signal.getsignal(signal.SIGCHLD)
if not callable(current_handler):
current_handler = None

@ -0,0 +1,199 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import os
import sys
import six
import time
import unittest
import multiprocessing
import numpy as np
import paddle.fluid as fluid
from paddle.io import Dataset, BatchSampler, DataLoader
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph.base import to_variable
class RandomDataset(Dataset):
def __init__(self, sample_num):
self.sample_num = sample_num
def __getitem__(self, idx):
np.random.seed(idx)
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.sample_num
class TestDataLoaderAssert(unittest.TestCase):
def test_main(self):
place = fluid.cpu_places()[0]
with fluid.dygraph.guard(place):
dataset = RandomDataset(100)
batch_sampler = BatchSampler(dataset=dataset, batch_size=4)
# dataset is not instance of Dataset
try:
loader = DataLoader(dataset=batch_sampler, places=place)
self.assertTrue(False)
except AssertionError:
pass
# places is None
try:
loader = DataLoader(dataset=dataset, places=None)
self.assertTrue(False)
except AssertionError:
pass
# num_workers < 0
try:
loader = DataLoader(
dataset=dataset, places=place, num_workers=-1)
self.assertTrue(False)
except AssertionError:
pass
# timeout < 0
try:
loader = DataLoader(dataset=dataset, places=place, timeout=-1)
self.assertTrue(False)
except AssertionError:
pass
# batch_sampler is not instance of BatchSampler
try:
loader = DataLoader(
dataset=dataset, places=place, batch_sampler=dataset)
self.assertTrue(False)
except AssertionError:
pass
# set batch_sampler and shuffle/batch_size/drop_last
try:
loader = DataLoader(
dataset=dataset,
places=place,
batch_sampler=batch_sampler,
shuffle=True,
drop_last=True)
self.assertTrue(False)
except AssertionError:
pass
# set batch_sampler correctly
try:
loader = DataLoader(
dataset=dataset, places=place, batch_sampler=batch_sampler)
self.assertTrue(True)
except AssertionError:
self.assertTrue(False)
# CI coverage cannot record code executed in subprocesses,
# so HACK: call _worker_loop in the main process here
class TestDataLoaderWorkerLoop(unittest.TestCase):
def run_without_worker_done(self, use_shared_memory=True):
try:
place = fluid.cpu_places()[0]
with fluid.dygraph.guard(place):
dataset = RandomDataset(800)
# test init_fn
def _init_fn(worker_id):
pass
# test collate_fn
def _collate_fn(sample_list):
return [
np.stack(
s, axis=0) for s in list(zip(*sample_list))
]
loader = DataLoader(
dataset,
num_workers=1,
places=place,
use_shared_memory=use_shared_memory)
                assert loader.num_workers > 0, \
                    "num_workers is forced to 0 on Mac and Windows, so this " \
                    "assert raises there and the test passes via AssertionError"
loader = iter(loader)
print("loader length", len(loader))
indices_queue = multiprocessing.Queue()
for i in range(10):
indices_queue.put([i, i + 10])
indices_queue.put(None)
loader._worker_loop(
loader._dataset, indices_queue, loader._data_queue,
loader._workers_done_event, _collate_fn, _init_fn, 0)
self.assertTrue(False)
except AssertionError:
pass
except Exception:
self.assertTrue(False)
def run_with_worker_done(self, use_shared_memory=True):
try:
place = fluid.cpu_places()[0]
with fluid.dygraph.guard(place):
dataset = RandomDataset(800)
# test init_fn
def _init_fn(worker_id):
pass
# test collate_fn
def _collate_fn(sample_list):
return [
np.stack(
s, axis=0) for s in list(zip(*sample_list))
]
loader = DataLoader(
dataset,
num_workers=1,
places=place,
use_shared_memory=use_shared_memory)
                assert loader.num_workers > 0, \
                    "num_workers is forced to 0 on Mac and Windows, so this " \
                    "assert raises there and the test passes via AssertionError"
loader = iter(loader)
print("loader length", len(loader))
indices_queue = multiprocessing.Queue()
for i in range(10):
indices_queue.put([i, i + 10])
indices_queue.put(None)
loader._workers_done_event.set()
loader._worker_loop(
loader._dataset, indices_queue, loader._data_queue,
loader._workers_done_event, _collate_fn, _init_fn, 0)
self.assertTrue(True)
except AssertionError:
pass
except Exception:
self.assertTrue(False)
def test_main(self):
for use_shared_memory in [True, False]:
self.run_without_worker_done(use_shared_memory)
self.run_with_worker_done(use_shared_memory)
if __name__ == '__main__':
unittest.main()

@ -13,22 +13,27 @@
# limitations under the License.
# TODO: define all functions about input & output in this directory
# __all__ = ['Dataset',
# 'Sampler',
# 'Transform',
# 'DataLoader',
# 'load',
# 'save',
# 'load_program_state',
# 'set_program_state',
# 'load_inference_model',
# 'save_inference_model',
# 'batch',
# 'shuffle',
# 'buffered',
# 'cache',
# 'chain',
# 'firstn',
# 'compose',
# 'map_readers',
# 'xmap_readers']
__all__ = [
'Dataset',
'BatchSampler',
# 'Transform',
'DataLoader',
# 'load',
# 'save',
# 'load_program_state',
# 'set_program_state',
# 'load_inference_model',
# 'save_inference_model',
# 'batch',
# 'shuffle',
# 'buffered',
# 'cache',
# 'chain',
# 'firstn',
# 'compose',
# 'map_readers',
# 'xmap_readers'
]
from ..fluid.io import DataLoader
from ..fluid.dataloader import Dataset, BatchSampler
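For completeness, a sketch of exercising the new paddle.io surface end to end in dygraph mode, mirroring the tests above (batch unpacking details may vary by release):

import numpy as np
import paddle.fluid as fluid
from paddle.io import Dataset, DataLoader

class RandomDataset(Dataset):
    def __init__(self, sample_num):
        self.sample_num = sample_num

    def __getitem__(self, idx):
        image = np.random.random([784]).astype('float32')
        label = np.random.randint(0, 9, (1, )).astype('int64')
        return image, label

    def __len__(self):
        return self.sample_num

place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
    loader = DataLoader(
        RandomDataset(64),
        places=place,
        batch_size=8,
        drop_last=True,
        num_workers=2)
    for data in loader:
        pass  # each `data` holds one stacked mini-batch (image, label)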

@ -149,6 +149,7 @@ packages=['paddle',
'paddle.fluid.proto.profiler',
'paddle.fluid.distributed',
'paddle.fluid.layers',
'paddle.fluid.dataloader',
'paddle.fluid.contrib',
'paddle.fluid.contrib.decoder',
'paddle.fluid.contrib.quantize',
@ -176,6 +177,7 @@ packages=['paddle',
'paddle.fluid.incubate.fleet.parameter_server.pslib',
'paddle.fluid.incubate.fleet.collective',
'paddle.fluid.incubate.fleet.utils',
'paddle.io',
'paddle.nn',
'paddle.nn.functional',
'paddle.nn.layer',
