Paddle/python/paddle/fluid/trainer_factory.py

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Defination of TrainerFactory."""
import threading
import time
import numpy as np
from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer
from .device_worker import Hogwild, DownpourSGD, Section
__all__ = ["TrainerFactory", "FetchHandler", "FetchHandlerMonitor"]
class TrainerFactory(object):
"""
Create trainer and device worker.
If opt_info is not None, it will get configs from opt_info,
otherwise create MultiTrainer and Hogwild.
"""
def __init__(self):
pass
    def _create_trainer(self, opt_info=None):
        trainer = None
        device_worker = None
        if opt_info is None:
            # default is MultiTrainer + Hogwild
            trainer = MultiTrainer()
            device_worker = Hogwild()
            trainer._set_device_worker(device_worker)
        else:
            trainer_class = opt_info["trainer"]
            device_worker_class = opt_info["device_worker"]
            trainer = globals()[trainer_class]()
            device_worker = globals()[device_worker_class]()
            if "fleet_desc" in opt_info:
                device_worker._set_fleet_desc(opt_info["fleet_desc"])
                trainer._set_fleet_desc(opt_info["fleet_desc"])
                if opt_info.get("use_cvm") is not None:
                    trainer._set_use_cvm(opt_info["use_cvm"])
                if opt_info.get("no_cvm") is not None:
                    trainer._set_no_cvm(opt_info["no_cvm"])
                if opt_info.get("scale_datanorm") is not None:
                    trainer._set_scale_datanorm(opt_info["scale_datanorm"])
                if opt_info.get("dump_slot") is not None:
                    trainer._set_dump_slot(opt_info["dump_slot"])
                if opt_info.get("mpi_rank") is not None:
                    trainer._set_mpi_rank(opt_info["mpi_rank"])
                if opt_info.get("mpi_size") is not None:
                    trainer._set_mpi_size(opt_info["mpi_size"])
                if opt_info.get("dump_fields") is not None:
                    trainer._set_dump_fields(opt_info["dump_fields"])
                if opt_info.get("dump_fields_path") is not None:
                    trainer._set_dump_fields_path(opt_info["dump_fields_path"])
                if opt_info.get("dump_file_num") is not None:
                    trainer._set_dump_file_num(opt_info["dump_file_num"])
                if opt_info.get("dump_converter") is not None:
                    trainer._set_dump_converter(opt_info["dump_converter"])
                if opt_info.get("adjust_ins_weight") is not None:
                    trainer._set_adjust_ins_weight(
                        opt_info["adjust_ins_weight"])
                if opt_info.get("copy_table") is not None:
                    trainer._set_copy_table_config(opt_info["copy_table"])
                if opt_info.get("check_nan_var_names") is not None:
                    trainer._set_check_nan_var_names(
                        opt_info["check_nan_var_names"])
                if opt_info.get("dump_param") is not None:
                    trainer._set_dump_param(opt_info["dump_param"])
            trainer._set_device_worker(device_worker)
        return trainer
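
# A minimal usage sketch, not part of the original file: how _create_trainer is
# typically invoked. The opt_info keys mirror the ones read above; the concrete
# values (and the fleet_desc object) are assumptions and would normally come from
# the distributed optimizer that builds opt_info.
#
#     factory = TrainerFactory()
#     trainer = factory._create_trainer()            # default: MultiTrainer + Hogwild
#     trainer = factory._create_trainer({
#         "trainer": "DistMultiTrainer",             # class name resolved via globals()
#         "device_worker": "DownpourSGD",
#         "fleet_desc": fleet_desc,                  # hypothetical fleet description
#         "use_cvm": True,
#         "dump_slot": True,
#     })
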
class FetchHandlerMonitor(object):
    """
    Definition of the FetchHandlerMonitor class.
    It runs a background thread that periodically fetches variables from a
    scope and passes them to a fetch handler.
    """

    def __init__(self, scope, handler):
        self.fetch_instance = handler
        self.fetch_thread = threading.Thread(
            target=self.handler_decorator,
            args=(scope, self.fetch_instance.handler))
        self.running = False

    def start(self):
        """
        Start the monitor: spawn the daemon monitor thread.
        """
        self.running = True
        self.fetch_thread.daemon = True
        self.fetch_thread.start()
    def handler_decorator(self, fetch_scope, fetch_handler):
        """
        Polling loop run by the monitor thread: roughly once per second it
        checks whether period_secs has elapsed and, if so, fetches the target
        variables from fetch_scope and passes them to fetch_handler.
        Args:
            fetch_scope(Scope): scope to fetch variables from
            fetch_handler(Handler): callback that receives the fetched values
        """
        fetch_target_names = self.fetch_instance.fetch_target_names
        period_secs = self.fetch_instance.period_secs
        elapsed_secs = 0
        while True:
            # The inner while/else acts as a timer: when the monitor is running
            # and the period has elapsed, the body performs one fetch; the else
            # branch runs whenever the condition is false and simply waits a second.
            while self.running and elapsed_secs >= period_secs:
                elapsed_secs = 0
                fetch_vars = [
                    fetch_scope.find_var(varname)
                    for varname in fetch_target_names
                ]
                if None in fetch_vars:
                    continue
                fetch_tensors = [var.get_tensor() for var in fetch_vars]
                if self.fetch_instance.return_np:
                    fetch_nps = []
                    for tensor in fetch_tensors:
                        lod = tensor.lod()
                        if len(lod) > 0:
                            raise RuntimeError(
                                "Some of the fetched tensors hold LoD information. "
                                "They cannot be completely cast to Python ndarrays, "
                                "and the LoDTensor itself cannot be returned "
                                "directly; please choose other fetch targets.")
                        if tensor._is_initialized():
                            fetch_nps.append(np.array(tensor))
                        else:
                            fetch_nps.append(None)
                    fetch_handler(fetch_nps)
                else:
                    fetch_handler(fetch_tensors)
            else:
                time.sleep(1)
                elapsed_secs += 1

    def stop(self):
        self.running = False
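
# Hedged usage sketch, not part of the original file: FetchHandlerMonitor only needs
# a handler object exposing fetch_target_names, period_secs, return_np, and a
# handler(...) callback, which is exactly what handler_decorator reads. The handler
# class, its constructor arguments, and the variable names below are hypothetical.
#
#     class LossHandler(FetchHandler):
#         def handler(self, fetched):                # receives ndarrays or tensors
#             print("periodic fetch:", fetched)
#
#     monitor = FetchHandlerMonitor(scope, LossHandler(...))
#     monitor.start()    # daemon thread polls the scope about once per second
#     ...                # training runs elsewhere
#     monitor.stop()     # clears self.running; the thread goes idle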