From b044724db710dabf5bea23195b599eab3ba46bb3 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 22 May 2018 14:35:02 +0800 Subject: [PATCH 01/54] update fluid Train API param_path to checkpoint_config --- python/paddle/fluid/trainer.py | 50 +++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 7da123dd92..01c40bb90e 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -27,11 +27,8 @@ import parallel_executor from transpiler import distribute_transpiler __all__ = [ - 'Trainer', - 'BeginEpochEvent', - 'EndEpochEvent', - 'BeginStepEvent', - 'EndStepEvent', + 'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent', + 'EndStepEvent', 'CheckpointConfig' ] @@ -59,6 +56,17 @@ class EndStepEvent(object): self.metrics = metrics +class CheckpointConfig(object): + def __init__(self, + checkpoint_dir=None, + max_num_checkpoints=3, + save_interval_secs=600): + if checkpoint_dir is None: + self.checkpoint_dir = os.getcwd() + self.max_num_checkpoints = max_num_checkpoints + self.save_interval_secs = save_interval_secs + + def check_and_get_place(place): """ Check the type of place or get the default place @@ -97,9 +105,9 @@ class Trainer(object): def __init__(self, train_func, optimizer, - param_path=None, place=None, - parallel=False): + place=None, + parallel=False, + checkpoint_config=None): self.__stop = False self.parallel = parallel # 1. we need to generate a framework.Program by calling @@ -108,6 +116,16 @@ class Trainer(object): if not isinstance(optimizer, opt_module.Optimizer): raise TypeError("The optimizer should be an instance of Optimizer") + # config for checkpoint + # only chief worker will save variables + self.chief = True + self.checkpoint = checkpoint_config + if self.checkpoint and not isinstance(self.checkpoint, + CheckpointConfig): + raise TypeError( + "The checkpoint_config should be an instance of CheckpointConfig" + ) + self.scope = core.Scope() self.startup_program = framework.Program() @@ -136,9 +154,10 @@ exe = executor.Executor(place) exe.run(self.startup_program) - if param_path: - # load params from param_path into scope - io.load_persistables(exe, dirname=param_path) + if self.checkpoint: + exe = executor.Executor(place) + io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, + self.startup_program) def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS if "PADDLE_TRAINER_IPS" not in os.environ: self.nccl_id_var = None else: self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + self.chief = self.trainer_id == 0 port = os.getenv("PADDLE_PSERVER_PORT") worker_ips = os.getenv("PADDLE_TRAINER_IPS") worker_endpoints = [] @@ -194,6 +214,7 @@ # the unique trainer id, starting from 0, needed by trainer # only trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self.chief = self.trainer_id == 0 # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") with self._prog_and_scope_guard(): @@ -263,6 +284,14 @@ exe = executor.Executor(self.place) io.save_persistables(exe, dirname=param_path) + def _save_checkpoint(self): + if self.checkpoint and self.chief: + exe = executor.Executor(self.place) + io.save_checkpoint(exe, self.checkpoint.checkpoint_dir, + self.checkpoint.max_num_checkpoints, + self.checkpoint.save_interval_secs, + self.train_program) + @contextlib.contextmanager def _prog_and_scope_guard(self): with framework.program_guard( @@ -309,6 +338,7 @@ class Trainer(object): else: metrics = exe.run(feed=data, fetch_list=[]) event_handler(EndStepEvent(epoch_id, step_id, metrics)) + self._save_checkpoint() event_handler(EndEpochEvent(epoch_id)) def _test_by_executor(self, reader, feed_order, fetch_list):
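Usage sketch (illustrative, not part of the patch): with the API introduced above, a training script wires checkpointing into the Trainer roughly as follows. The train_program function and the SGD optimizer are placeholders, and CheckpointConfig is imported from paddle.fluid.trainer here because the top-level fluid export only lands later in this series.

    import paddle.fluid as fluid
    from paddle.fluid.trainer import CheckpointConfig, Trainer

    def train_program():
        # Placeholder: build the forward network and return the loss list.
        ...

    # Keep at most 3 checkpoints under ./checkpoints, written at most once
    # every 600 seconds (the defaults added by this patch).
    config = CheckpointConfig(
        checkpoint_dir="./checkpoints",
        max_num_checkpoints=3,
        save_interval_secs=600)

    trainer = Trainer(
        train_func=train_program,
        optimizer=fluid.optimizer.SGD(learning_rate=0.001),
        checkpoint_config=config)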
From dca0b6d9ccc5b770e78a0903839f2ed89d79be58 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 23 May 2018 19:50:25 +0800 Subject: [PATCH 02/54] restore param_path --- python/paddle/fluid/trainer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 01c40bb90e..24254b4980 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -105,6 +105,7 @@ class Trainer(object): def __init__(self, train_func, optimizer, + param_path=None, place=None, parallel=False, checkpoint_config=None): @@ -120,8 +121,8 @@ class Trainer(object): # only chief worker will save variables self.chief = True self.checkpoint = checkpoint_config - if self.checkpoint and not isinstance(self.checkpoint, - CheckpointConfig): + if self.checkpoint and \ + not isinstance(self.checkpoint, CheckpointConfig): raise TypeError( "The checkpoint_config should be an instance of CheckpointConfig" ) @@ -159,6 +160,10 @@ class Trainer(object): io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, self.startup_program) + if param_path: + # load params from param_path into scope + io.load_persistables(exe, dirname=param_path) + def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS From 514b2427edbd30013ca1783769af18fb96ffb626 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 28 May 2018 20:08:23 +0800 Subject: [PATCH 03/54] add save/load persist_vars_without_grad --- python/paddle/fluid/io.py | 46 +++++++++++++++++++++++----------- python/paddle/fluid/trainer.py | 3 ++- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 8e58e5eb79..f626039363 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -24,7 +24,8 @@ __all__ = [ 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'load_persistables', 'save_inference_model', 'load_inference_model', 'get_inference_program', 'save_checkpoint', 'load_checkpoint', - 'clean_checkpoint' + 'clean_checkpoint', 'load_persist_vars_without_grad', + 'save_persist_vars_without_grad' ] @@ -455,6 +456,33 @@ def get_parameter_value_by_name(name, executor, program=None): return get_parameter_value(var, executor) +def load_persist_vars_without_grad(executor, dirname, program): + """ + load_persist_vars_without_grad will load variables from a directory by an executor, + the variable named end with "@GRAD" will not be loaded. + """ + load_vars( + executor, + dirname=dirname, + main_program=program, + predicate=_is_checkpoint_var, + filename=None) + + +def save_persist_vars_without_grad(executor, dirname, program): + """ + save_persist_vars_without_grad will save variables to a directory by an executor, + the variable named end with "@GRAD" will not be saved.
+ """ + save_vars( + executor, + dirname=dirname, + main_program=program, + vars=None, + predicate=_is_checkpoint_var, + filename=None) + + SUCCESS_MARK_FILENAME = "_SUCCESS" CHECKPOINT_PREFIX = "checkpoint" CHECKPOINT_SEPARATOR = "_" @@ -491,13 +519,7 @@ def save_checkpoint(executor, serial += 1 cur_dir = _get_serial_dir(serial, checkpoint_dir) - save_vars( - executor, - dirname=cur_dir, - main_program=main_program, - vars=None, - predicate=_is_checkpoint_var, - filename=None) + load_persist_vars_without_grad(executor, cur_dir, main_program) _write_success(cur_dir) _lru_delete(checkpoint_dir, max_num_checkpoints) @@ -521,13 +543,7 @@ def load_checkpoint(executor, checkpoint_dir=None, main_program=None): return cur_dir = _get_serial_dir(serial, checkpoint_dir) - - load_vars( - executor, - dirname=cur_dir, - main_program=main_program, - predicate=_is_checkpoint_var, - filename=None) + load_persist_vars_without_grad(executor, cur_dir, main_program) def clean_checkpoint(checkpoint_dir, delete_dir=False): diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 24254b4980..b4b7b75b96 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -162,7 +162,8 @@ class Trainer(object): if param_path: # load params from param_path into scope - io.load_persistables(exe, dirname=param_path) + io.load_persist_vars_without_grad( + exe, dirname=param_path, program=self.startup_program) def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS From 5eea5db95fb6eaca2db9a0af63e871a9fc29c6bf Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 29 May 2018 14:37:59 +0800 Subject: [PATCH 04/54] optimized checkpoint and save_model --- python/paddle/fluid/__init__.py | 1 + python/paddle/fluid/io.py | 61 +++++++++++++++------------------ python/paddle/fluid/trainer.py | 40 +++++++++++++++------ 3 files changed, 58 insertions(+), 44 deletions(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 859605d005..aece8fc149 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -26,6 +26,7 @@ from trainer import BeginEpochEvent from trainer import EndEpochEvent from trainer import BeginStepEvent from trainer import EndStepEvent +from trainer import CheckpointConfig import inferencer from inferencer import Inferencer diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index f626039363..aa039bdfaa 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -491,7 +491,6 @@ CHECKPOINT_SEPARATOR = "_" def save_checkpoint(executor, checkpoint_dir=None, max_num_checkpoints=3, - save_interval_secs=600, main_program=None): """ Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory, @@ -511,15 +510,10 @@ def save_checkpoint(executor, if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir) - serial = _get_lastest_checkpoint_dir(checkpoint_dir) - if serial >= 0 and not _interval_secs_exceed( - _get_serial_dir(serial, checkpoint_dir), save_interval_secs): - return - - serial += 1 - cur_dir = _get_serial_dir(serial, checkpoint_dir) + serial = _get_lastest_checkpoint_dir(checkpoint_dir) + 1 + cur_dir = _get_serial_dir(checkpoint_dir, serial) - load_persist_vars_without_grad(executor, cur_dir, main_program) + save_persist_vars_without_grad(executor, cur_dir, main_program) _write_success(cur_dir) _lru_delete(checkpoint_dir, max_num_checkpoints) @@ -542,7 +536,7 @@ def load_checkpoint(executor, checkpoint_dir=None, main_program=None): if serial 
< 0: return - cur_dir = _get_serial_dir(serial, checkpoint_dir) + cur_dir = _get_serial_dir(checkpoint_dir, serial) load_persist_vars_without_grad(executor, cur_dir, main_program) @@ -559,11 +553,6 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): os.rmdir(checkpoint_dir) -def _get_serial_dir(serial, checkpoint_dir): - serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) - return os.path.join(checkpoint_dir, serial_folder) - - def _is_checkpoint_var(var): """ the checkpoint will not save or load all the variables. @@ -582,29 +571,37 @@ def _is_checkpoint_var(var): return var.persistable -def _interval_secs_exceed(dirname, save_interval_secs): - dir_time = os.path.getmtime(dirname) - if save_interval_secs > (time.time() - dir_time): - return False - return True +def _get_dir_serial(dirname): + _, serial = dirname.split(CHECKPOINT_SEPARATOR) + + serial_num = -1 + try: + serial_num = int(serial) + except ValueError: + serial_num = -1 + return serial_num + + +def _get_serial_dir(dirname, serial): + serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) + return os.path.join(dirname, serial_folder) def _lru_delete(dirname, max_num_checkpoints=3): dirs = os.listdir(dirname) - serials = [] + serial_map = {} for serial in dirs: - try: - serials.append(int(serial)) - except ValueError: - continue + serial_num = _get_dir_serial(serial) + serial_map[serial_num] = serial - if len(serials) <= max_num_checkpoints: + if len(serial_map.keys()) <= max_num_checkpoints: return + serials = serial_map.keys() serials.sort(reverse=True) serials = serials[max_num_checkpoints:] for serial in serials: - cur_dir = os.path.join(dirname, str(serial)) + cur_dir = _get_serial_dir(dirname, serial) shutil.rmtree(cur_dir) @@ -633,20 +630,18 @@ def _get_lastest_checkpoint_dir(checkpoint_dir): """ is _SUCCESS in this dir """ - _, serial = cur_dir.split(CHECKPOINT_SEPARATOR) - try: - int(serial) - except ValueError: + serial = _get_dir_serial(cur_dir) + if serial == -1: return -1 if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)): return -1 success_path = os.path.join( - _get_serial_dir(serial, checkpoint_dir), SUCCESS_MARK_FILENAME) + _get_serial_dir(checkpoint_dir, serial), SUCCESS_MARK_FILENAME) if os.path.isfile(success_path): - return int(serial) + return serial if not os.path.isdir(checkpoint_dir): return -1 diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index b4b7b75b96..3cf96ac251 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -60,11 +60,24 @@ class CheckpointConfig(object): def __init__(self, checkpoint_dir=None, max_num_checkpoints=3, - save_interval_secs=600): + epoch_interval=1, + step_interval=10): if checkpoint_dir is None: self.checkpoint_dir = os.getcwd() + else: + self.checkpoint_dir = checkpoint_dir + self.max_num_checkpoints = max_num_checkpoints - self.save_interval_secs = save_interval_secs + + if epoch_interval < 1: + self.epoch_interval = 1 + else: + self.epoch_interval = epoch_interval + + if step_interval < 1: + self.step_interval = 10 + else: + self.step_interval = step_interval def check_and_get_place(place): @@ -290,14 +303,6 @@ class Trainer(object): exe = executor.Executor(self.place) io.save_persistables(exe, dirname=param_path) - def _save_checkpoint(self): - if self.checkpoint and self.chief: - exe = executor.Executor(self.place) - io.save_checkpoint(exe, self.checkpoint.checkpoint_dir, - self.checkpoint.max_num_checkpoints, - self.checkpoint.save_interval_secs, - 
self.train_program) - @contextlib.contextmanager def _prog_and_scope_guard(self): with framework.program_guard( @@ -343,8 +348,9 @@ class Trainer(object): ]) else: metrics = exe.run(feed=data, fetch_list=[]) + event_handler(EndStepEvent(epoch_id, step_id, metrics)) - self._save_checkpoint() + self._save_checkpoint(epoch_id, step_id) event_handler(EndEpochEvent(epoch_id)) def _test_by_executor(self, reader, feed_order, fetch_list): @@ -384,6 +390,18 @@ class Trainer(object): loss_name=self.train_func_outputs[0].name) return self._get_parallel_executor() + def _save_checkpoint(self, epoch_id, step_id): + if not self.checkpoint or not self.chief: + return + + if epoch_id % self.checkpoint.epoch_interval == 0 and step_id % self.checkpoint.step_interval == 0: + exe = executor.Executor(self.place) + io.save_checkpoint( + executor=exe, + checkpoint_dir=self.checkpoint.checkpoint_dir, + max_num_checkpoints=self.checkpoint.max_num_checkpoints, + main_program=self.train_program) + def build_feed_var_list(program, feed_order): if not isinstance(program, framework.Program): From 5f5d6a9dc7eaf2e1c5b069454497d11a28701ddb Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 29 May 2018 16:01:26 +0800 Subject: [PATCH 05/54] optimized checkpoint and save_model --- python/paddle/fluid/io.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index aa039bdfaa..bd3c2e3d9a 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -489,9 +489,9 @@ CHECKPOINT_SEPARATOR = "_" def save_checkpoint(executor, - checkpoint_dir=None, - max_num_checkpoints=3, - main_program=None): + checkpoint_dir, + main_program=None, + max_num_checkpoints=3): """ Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory, the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy @@ -500,12 +500,11 @@ def save_checkpoint(executor, :param executor :param checkpoint_dir - :param max_num_checkpoints - :param save_interval_secs :param main_program + :param max_num_checkpoints """ if checkpoint_dir is None: - checkpoint_dir = os.getcwd() + raise ValueError("The values of 'checkpoint_dir' should not be None") if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir) @@ -518,7 +517,7 @@ def save_checkpoint(executor, _lru_delete(checkpoint_dir, max_num_checkpoints) -def load_checkpoint(executor, checkpoint_dir=None, main_program=None): +def load_checkpoint(executor, checkpoint_dir, main_program=None): """ Load checkpoint from a directory by executor, it will find the most recent saved checkpoint file and load it auto. @@ -529,7 +528,7 @@ def load_checkpoint(executor, checkpoint_dir=None, main_program=None): """ if checkpoint_dir is None: - checkpoint_dir = os.getcwd() + raise ValueError("The values of 'checkpoint_dir' should not be None") serial = _get_lastest_checkpoint_dir(checkpoint_dir) @@ -546,7 +545,7 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): delete_dir only works when the directory is empty, otherwise, OSError is raised. 
""" if checkpoint_dir is None: - checkpoint_dir = os.getcwd() + raise ValueError("The values of 'checkpoint_dir' should not be None") _lru_delete(checkpoint_dir, max_num_checkpoints=0) if delete_dir and not os.listdir(checkpoint_dir): From ad9dfeb0180b40905d245354e733e750009cc173 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 29 May 2018 20:28:40 +0800 Subject: [PATCH 06/54] bug fix and optimize --- python/paddle/fluid/io.py | 153 +++++++++++++++++++++++++-------- python/paddle/fluid/trainer.py | 52 +++++++++-- 2 files changed, 162 insertions(+), 43 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index bd3c2e3d9a..ed560304e2 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -456,40 +456,18 @@ def get_parameter_value_by_name(name, executor, program=None): return get_parameter_value(var, executor) -def load_persist_vars_without_grad(executor, dirname, program): - """ - load_persist_vars_without_grad will load variables from a directory by an executor, - the variable named end with "@GRAD" will not be loaded. - """ - load_vars( - executor, - dirname=dirname, - main_program=program, - predicate=_is_checkpoint_var, - filename=None) - - -def save_persist_vars_without_grad(executor, dirname, program): - """ - save_persist_vars_without_grad will save variables to a directory by an executor, - the variable named end with "@GRAD" will not be saved. - """ - save_vars( - executor, - dirname=dirname, - main_program=program, - vars=None, - predicate=_is_checkpoint_var, - filename=None) - - SUCCESS_MARK_FILENAME = "_SUCCESS" CHECKPOINT_PREFIX = "checkpoint" +MODEL_DIR = "__model__" +TRAINER_PREFIX = "trainer" CHECKPOINT_SEPARATOR = "_" def save_checkpoint(executor, checkpoint_dir, + trainer_id, + is_chief=False, + trainer_args=None, main_program=None, max_num_checkpoints=3): """ @@ -502,22 +480,35 @@ def save_checkpoint(executor, :param checkpoint_dir :param main_program :param max_num_checkpoints + :param is_chief """ if checkpoint_dir is None: raise ValueError("The values of 'checkpoint_dir' should not be None") + if trainer_args and not isinstance(trainer_args, dict): + raise TypeError("The type of 'trainer_args' should be dict") + if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir) serial = _get_lastest_checkpoint_dir(checkpoint_dir) + 1 cur_dir = _get_serial_dir(checkpoint_dir, serial) - save_persist_vars_without_grad(executor, cur_dir, main_program) - _write_success(cur_dir) + if is_chief: + save_persist_vars_without_grad(executor, cur_dir, main_program) + + save_trainer_args(cur_dir, trainer_id, trainer_args) _lru_delete(checkpoint_dir, max_num_checkpoints) -def load_checkpoint(executor, checkpoint_dir, main_program=None): +def need_load_checkpoint(checkpoint_dir): + serial = _get_lastest_checkpoint_dir(checkpoint_dir) + if serial < 0: + return None + return serial + + +def load_checkpoint(executor, checkpoint_dir, serial, main_program): """ Load checkpoint from a directory by executor, it will find the most recent saved checkpoint file and load it auto. 
@@ -528,14 +519,17 @@ def load_checkpoint(executor, checkpoint_dir, main_program=None): """ if checkpoint_dir is None: - raise ValueError("The values of 'checkpoint_dir' should not be None") + raise ValueError( + "The values of 'checkpoint_dir' or 'serial' should not be None") - serial = _get_lastest_checkpoint_dir(checkpoint_dir) + if serial is None or serial < 0: + raise ValueError("The values of 'serial' should not be None or <0 ") - if serial < 0: - return + if main_program is None: + raise ValueError("The values of 'main_program'should not be None") cur_dir = _get_serial_dir(checkpoint_dir, serial) + cur_dir = _get_model_dir(cur_dir) load_persist_vars_without_grad(executor, cur_dir, main_program) @@ -552,6 +546,68 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): os.rmdir(checkpoint_dir) +def load_persist_vars_without_grad(executor, dirname, program, nest=True): + """ + load_persist_vars_without_grad will load variables from a directory by an executor, + the variable named end with "@GRAD" will not be loaded. + """ + + if nest: + dirname = _get_model_dir(dirname) + + load_vars( + executor, + dirname=dirname, + main_program=program, + predicate=_is_checkpoint_var, + filename=None) + + +def save_persist_vars_without_grad(executor, dirname, program): + """ + save_persist_vars_without_grad will save variables to a directory by an executor, + the variable named end with "@GRAD" will not be saved. + """ + cur_dir = _get_model_dir(dirname) + save_vars( + executor, + dirname=cur_dir, + main_program=program, + vars=None, + predicate=_is_checkpoint_var, + filename=None) + _write_success(cur_dir) + + +def save_trainer_args(dirname, trainer_id, trainer_args): + if not isinstance(trainer_args, dict): + raise TypeError("The type of 'trainer_args' should be dict") + cur_dir = _get_trainer_dir(dirname, trainer_id) + + for name, value in trainer_args.iteritems(): + args_file = os.path.join(cur_dir, name) + with open(args_file, 'w') as f: + f.write(str(value)) + _write_success(cur_dir) + + +def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): + cur_dir = _get_serial_dir(checkpoint_dir, serial) + cur_dir = _get_trainer_dir(cur_dir, trainer_id) + + if not isinstance(trainer_args, list): + raise TypeError("The type of 'trainer_args' should be list") + + ret_values = [] + + for arg in trainer_args: + cur_file = os.path.join(cur_dir, arg) + with open(cur_file, 'r') as f: + contents = f.read() + ret_values.append(contents.strip()) + return ret_values + + def _is_checkpoint_var(var): """ the checkpoint will not save or load all the variables. 
@@ -583,7 +639,31 @@ def _get_dir_serial(dirname): def _get_serial_dir(dirname, serial): serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) - return os.path.join(dirname, serial_folder) + serial_dir = os.path.join(dirname, serial_folder) + + if not os.path.isdir(serial_dir): + os.makedirs(serial_dir) + + return serial_dir + + +def _get_model_dir(dirname): + model_dir = os.path.join(dirname, MODEL_DIR) + + if not os.path.isdir(model_dir): + os.makedirs(model_dir) + + return model_dir + + +def _get_trainer_dir(dirname, trainer_id): + trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) + trainer_dir = os.path.join(dirname, trainer_folder) + + if not os.path.isdir(trainer_dir): + os.makedirs(trainer_dir) + + return trainer_dir def _lru_delete(dirname, max_num_checkpoints=3): @@ -638,7 +718,8 @@ def _get_lastest_checkpoint_dir(checkpoint_dir): return -1 success_path = os.path.join( - _get_serial_dir(checkpoint_dir, serial), SUCCESS_MARK_FILENAME) + _get_serial_dir(checkpoint_dir, serial), MODEL_DIR, + SUCCESS_MARK_FILENAME) if os.path.isfile(success_path): return serial diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 3cf96ac251..206d582cdc 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -79,6 +79,9 @@ class CheckpointConfig(object): else: self.step_interval = step_interval + self.epoch_id = 0 + self.step_id = 0 + def check_and_get_place(place): """ @@ -132,6 +135,7 @@ class Trainer(object): # config for checkpoint # only chief worker will save variables + self.trainer_id = 0 self.chief = True self.checkpoint = checkpoint_config if self.checkpoint and \ @@ -139,6 +143,8 @@ class Trainer(object): raise TypeError( "The checkpoint_config shoule be an instance of CheckpointConfig" ) + self.load_checkpoint_serial = io.need_load_checkpoint( + self.checkpoint.checkpoint_dir) self.scope = core.Scope() @@ -168,15 +174,25 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if self.checkpoint: + if self.load_checkpoint_serial: exe = executor.Executor(place) io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, + self.load_checkpoint_serial, self.startup_program) - if param_path: + epoch_id, step_id = io.load_trainer_args( + self.checkpoint.checkpoint_dir, self.load_checkpoint_serial, + self.trainer_id, ["epoch_id", "step_id"]) + self.checkpoint.epoch_id = int(epoch_id) + self.checkpoint.step_id = int(step_id) + + if param_path and os.path.isdir(param_path): # load params from param_path into scope io.load_persist_vars_without_grad( - exe, dirname=param_path, program=self.startup_program) + exe, + dirname=param_path, + program=self.startup_program, + nest=False) def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS @@ -333,11 +349,20 @@ class Trainer(object): self._train_by_any_executor(event_handler, exe, num_epochs, reader) def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): - for epoch_id in range(num_epochs): + epochs = [ + epoch_id for epoch_id in range(num_epochs) + if epoch_id >= self.checkpoint.epoch_id + ] + for epoch_id in epochs: event_handler(BeginEpochEvent(epoch_id)) for step_id, data in enumerate(reader()): if self.__stop: + self._clean_checkpoint() return + + if self.checkpoint and self.checkpoint.step_id >= step_id and self.checkpoint.epoch_id == epoch_id: + continue + begin_event = BeginStepEvent(epoch_id, step_id) event_handler(begin_event) if begin_event.fetch_metrics: @@ -352,6 +377,7 @@ class Trainer(object): 
event_handler(EndStepEvent(epoch_id, step_id, metrics)) self._save_checkpoint(epoch_id, step_id) event_handler(EndEpochEvent(epoch_id)) + self._clean_checkpoint() def _test_by_executor(self, reader, feed_order, fetch_list): with executor.scope_guard(self.scope): @@ -390,17 +416,29 @@ class Trainer(object): loss_name=self.train_func_outputs[0].name) return self._get_parallel_executor() + def _clean_checkpoint(self): + if not self.checkpoint: + return + io.clean_checkpoint(checkpoint_dir=self.checkpoint.checkpoint_dir) + def _save_checkpoint(self, epoch_id, step_id): - if not self.checkpoint or not self.chief: + if not self.checkpoint: return if epoch_id % self.checkpoint.epoch_interval == 0 and step_id % self.checkpoint.step_interval == 0: + trainer_args = {} + trainer_args["epoch_id"] = epoch_id + trainer_args["step_id"] = step_id + exe = executor.Executor(self.place) io.save_checkpoint( executor=exe, checkpoint_dir=self.checkpoint.checkpoint_dir, - max_num_checkpoints=self.checkpoint.max_num_checkpoints, - main_program=self.train_program) + trainer_id=self.trainer_id, + is_chief=self.chief, + trainer_args=trainer_args, + main_program=self.train_program, + max_num_checkpoints=self.checkpoint.max_num_checkpoints) def build_feed_var_list(program, feed_order): From 486e1e337d05679a22b389840136b9f07714646b Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 29 May 2018 20:36:45 +0800 Subject: [PATCH 07/54] bug fix and optimize --- python/paddle/fluid/trainer.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 206d582cdc..35bb8ded5d 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -79,8 +79,9 @@ class CheckpointConfig(object): else: self.step_interval = step_interval - self.epoch_id = 0 - self.step_id = 0 + self._epoch_id = 0 + self._step_id = 0 + self._load_serial = None def check_and_get_place(place): @@ -174,17 +175,17 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if self.load_checkpoint_serial: + if self.checkpoint._load_serial: exe = executor.Executor(place) io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, - self.load_checkpoint_serial, + self.checkpoint._load_serial, self.startup_program) epoch_id, step_id = io.load_trainer_args( self.checkpoint.checkpoint_dir, self.load_checkpoint_serial, self.trainer_id, ["epoch_id", "step_id"]) - self.checkpoint.epoch_id = int(epoch_id) - self.checkpoint.step_id = int(step_id) + self.checkpoint._epoch_id = int(epoch_id) + self.checkpoint._step_id = int(step_id) if param_path and os.path.isdir(param_path): # load params from param_path into scope @@ -351,7 +352,7 @@ class Trainer(object): def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): epochs = [ epoch_id for epoch_id in range(num_epochs) - if epoch_id >= self.checkpoint.epoch_id + if epoch_id >= self.checkpoint._epoch_id ] for epoch_id in epochs: event_handler(BeginEpochEvent(epoch_id)) @@ -360,7 +361,8 @@ class Trainer(object): self._clean_checkpoint() return - if self.checkpoint and self.checkpoint.step_id >= step_id and self.checkpoint.epoch_id == epoch_id: + if self.checkpoint and self.checkpoint._load_serial \ + and self.checkpoint._step_id >= step_id and self.checkpoint._epoch_id == epoch_id: continue begin_event = BeginStepEvent(epoch_id, step_id) From 9086043090f80ee7695d043e84fbe8068b2f76e7 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 29 May 2018 20:52:01 +0800 Subject: 
[PATCH 08/54] bug fix and optimize --- python/paddle/fluid/io.py | 1 - python/paddle/fluid/trainer.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index ed560304e2..2925e8eb28 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -529,7 +529,6 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program): raise ValueError("The values of 'main_program'should not be None") cur_dir = _get_serial_dir(checkpoint_dir, serial) - cur_dir = _get_model_dir(cur_dir) load_persist_vars_without_grad(executor, cur_dir, main_program) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 35bb8ded5d..5ca93821e2 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -144,7 +144,7 @@ class Trainer(object): "The checkpoint_config should be an instance of CheckpointConfig" ) - self.load_checkpoint_serial = io.need_load_checkpoint( + self.checkpoint._load_serial = io.need_load_checkpoint( self.checkpoint.checkpoint_dir) self.scope = core.Scope() @@ -182,7 +182,7 @@ class Trainer(object): self.startup_program) epoch_id, step_id = io.load_trainer_args( - self.checkpoint.checkpoint_dir, self.load_checkpoint_serial, + self.checkpoint.checkpoint_dir, self.checkpoint._load_serial, self.trainer_id, ["epoch_id", "step_id"]) self.checkpoint._epoch_id = int(epoch_id) self.checkpoint._step_id = int(step_id) From 0211c5df0a12de2647b339dc0a8c36d35209a1a3 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 11:45:25 +0800 Subject: [PATCH 09/54] bug fix --- python/paddle/fluid/trainer.py | 17 +++++++++-------- tools/codestyle/docstring_checker.pyc | Bin 0 -> 12561 bytes 2 files changed, 9 insertions(+), 8 deletions(-) create mode 100644 tools/codestyle/docstring_checker.pyc diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 5ca93821e2..34db9b39b7 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -139,13 +139,14 @@ class Trainer(object): self.trainer_id = 0 self.chief = True self.checkpoint = checkpoint_config - if self.checkpoint and \ - not isinstance(self.checkpoint, CheckpointConfig): - raise TypeError( - "The checkpoint_config should be an instance of CheckpointConfig" - ) - self.checkpoint._load_serial = io.need_load_checkpoint( - self.checkpoint.checkpoint_dir) + if self.checkpoint: + if not isinstance(self.checkpoint, CheckpointConfig): + raise TypeError( + "The checkpoint_config should be an instance of CheckpointConfig" + ) + else: + self.checkpoint._load_serial = io.need_load_checkpoint( + self.checkpoint.checkpoint_dir) self.scope = core.Scope() @@ -176,7 +176,7 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if self.checkpoint._load_serial: + if self.checkpoint and self.checkpoint._load_serial: exe = executor.Executor(place) io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, self.checkpoint._load_serial, diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f0255b763c8d154b7cc17d2d525148dfea3b42d GIT binary patch [base85-encoded payload for the compiled tools/codestyle/docstring_checker.pyc (literal, 12561 bytes) omitted]
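Before the docstring pass in the next patch, it is worth seeing how the io-level API from patch 06 fits together. A hedged sketch, assuming main_program is the training Program built elsewhere; the directory name and trainer_args values are illustrative:

    import paddle.fluid as fluid

    exe = fluid.Executor(fluid.CPUPlace())

    # Every trainer records its own epoch_id/step_id under trainer_<id>/;
    # only the chief additionally saves model variables under __model__/.
    fluid.io.save_checkpoint(
        executor=exe,
        checkpoint_dir="./checkpoints",
        trainer_id=0,
        is_chief=True,
        trainer_args={"epoch_id": 5, "step_id": 120},
        main_program=main_program,
        max_num_checkpoints=3)

    # On recovery, need_load_checkpoint returns the newest serial or None.
    serial = fluid.io.need_load_checkpoint("./checkpoints")
    if serial is not None:
        fluid.io.load_checkpoint(exe, "./checkpoints", serial, main_program)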
From 0deb6f90baa5dab02b5ff1cbc98dcaf7fae9b80b Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 14:20:51 +0800 Subject: [PATCH 10/54] annotation optimized and code style optimized --- python/paddle/fluid/io.py | 22 +++++++++++++++++++++- python/paddle/fluid/trainer.py | 12 ++++++------ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 2925e8eb28..d52c9a8823 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -478,9 +478,10 @@ def save_checkpoint(executor, :param executor :param checkpoint_dir + :param trainer_id + :param
is_chief :param main_program :param max_num_checkpoints - :param is_chief """ if checkpoint_dir is None: raise ValueError("The values of 'checkpoint_dir' should not be None") @@ -502,6 +503,11 @@ def save_checkpoint(executor, def need_load_checkpoint(checkpoint_dir): + """ + If the directory have checkpoint files, it will return lastest checkpoint directory serial number + + :param checkpoint_dir + """ serial = _get_lastest_checkpoint_dir(checkpoint_dir) if serial < 0: return None @@ -515,6 +521,7 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program): :param executor :param checkpoint_dir + :param serial :param main_program """ @@ -536,7 +543,11 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): """ clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before. delete_dir only works when the directory is empty, otherwise, OSError is raised. + + :param checkpoint_dir + :param delete_dir """ + if checkpoint_dir is None: raise ValueError("The values of 'checkpoint_dir' should not be None") _lru_delete(checkpoint_dir, max_num_checkpoints=0) @@ -549,6 +560,11 @@ def load_persist_vars_without_grad(executor, dirname, program, nest=True): """ load_persist_vars_without_grad will load variables from a directory by an executor, the variable named end with "@GRAD" will not be loaded. + + :param executor + :param dirname + :param program + :param nest """ if nest: @@ -566,6 +582,10 @@ def save_persist_vars_without_grad(executor, dirname, program): """ save_persist_vars_without_grad will save variables to a directory by an executor, the variable named end with "@GRAD" will not be saved. + + :param executor + :param dirname + :param program """ cur_dir = _get_model_dir(dirname) save_vars( diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 34db9b39b7..6d8d4a3e43 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -79,8 +79,8 @@ class CheckpointConfig(object): else: self.step_interval = step_interval - self._epoch_id = 0 - self._step_id = 0 + self.epoch_id = 0 + self.step_id = 0 self._load_serial = None @@ -185,8 +185,8 @@ class Trainer(object): epoch_id, step_id = io.load_trainer_args( self.checkpoint.checkpoint_dir, self.checkpoint._load_serial, self.trainer_id, ["epoch_id", "step_id"]) - self.checkpoint._epoch_id = int(epoch_id) - self.checkpoint._step_id = int(step_id) + self.checkpoint.epoch_id = int(epoch_id) + self.checkpoint.step_id = int(step_id) if param_path and os.path.isdir(param_path): # load params from param_path into scope @@ -353,7 +353,7 @@ class Trainer(object): def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): epochs = [ epoch_id for epoch_id in range(num_epochs) - if epoch_id >= self.checkpoint._epoch_id + if epoch_id >= self.checkpoint.epoch_id ] for epoch_id in epochs: event_handler(BeginEpochEvent(epoch_id)) @@ -363,7 +363,7 @@ class Trainer(object): return if self.checkpoint and self.checkpoint._load_serial \ - and self.checkpoint._step_id >= step_id and self.checkpoint._epoch_id == epoch_id: + and self.checkpoint.step_id >= step_id and self.checkpoint.epoch_id == epoch_id: continue begin_event = BeginStepEvent(epoch_id, step_id) From d712af25dcee298a1bd1fda1bba6a1f0ed001ab0 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 16:29:05 +0800 Subject: [PATCH 11/54] add distribute config --- python/paddle/fluid/trainer.py | 25 +++++++++++++++---------- 1 file changed, 15 
insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 6d8d4a3e43..e98672f318 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -81,7 +81,8 @@ class CheckpointConfig(object): self.epoch_id = 0 self.step_id = 0 - self._load_serial = None + self.load_serial = None + self.is_pserver = False def check_and_get_place(place): @@ -145,7 +146,7 @@ class Trainer(object): "The checkpoint_config should be an instance of CheckpointConfig" ) else: - self.checkpoint._load_serial = io.need_load_checkpoint( + self.checkpoint.load_serial = io.need_load_checkpoint( self.checkpoint.checkpoint_dir) self.scope = core.Scope() @@ -176,17 +177,18 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if self.checkpoint and self.checkpoint._load_serial: + if self.checkpoint and self.checkpoint.load_serial: exe = executor.Executor(place) io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, - self.checkpoint._load_serial, + self.checkpoint.load_serial, self.startup_program) - epoch_id, step_id = io.load_trainer_args( - self.checkpoint.checkpoint_dir, self.checkpoint._load_serial, - self.trainer_id, ["epoch_id", "step_id"]) - self.checkpoint._epoch_id = int(epoch_id) - self.checkpoint._step_id = int(step_id) + if not self.checkpoint.is_pserver: + epoch_id, step_id = io.load_trainer_args( + self.checkpoint.checkpoint_dir, self.checkpoint.load_serial, + self.trainer_id, ["epoch_id", "step_id"]) + self.checkpoint.epoch_id = int(epoch_id) + self.checkpoint.step_id = int(step_id) if param_path and os.path.isdir(param_path): # load params from param_path into scope @@ -259,6 +261,9 @@ class Trainer(object): t.transpile( trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": + if self.checkpoint: + self.is_pserver = True + self.train_program = t.get_pserver_program(current_endpoint) self.startup_program = t.get_startup_program(current_endpoint, self.train_program) From b44ede803387c0e292322ba140468599a9136352 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 19:26:12 +0800 Subject: [PATCH 12/54] bug fix --- python/paddle/fluid/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index e98672f318..b4f719855f 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -252,14 +252,14 @@ class Trainer(object): current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port # the unique trainer id, starting from 0, needed by trainer # only - trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) self.chief = self.trainer_id == 0 # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") with self._prog_and_scope_guard(): t = distribute_transpiler.DistributeTranspiler() t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) + self.trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": if self.checkpoint: self.is_pserver = True
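For context on what patch 12 corrects: in the distributed path the trainer identity comes from environment variables, and trainer 0 acts as the chief that persists model variables. A standalone sketch of the convention used in trainer.py:

    import os

    # Mirrors the transpiler setup in trainer.py: trainer 0 is the chief.
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    chief = trainer_id == 0
    # The role is either "PSERVER" or "TRAINER".
    training_role = os.getenv("PADDLE_TRAINING_ROLE")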
From 94eaf94cf57ec2cc951d046e847b69c348b8f9c9 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 21:51:16 +0800 Subject: [PATCH 13/54] bug fix about lru and save --- python/paddle/fluid/io.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index d52c9a8823..8e10b01a4a 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -495,11 +495,11 @@ def save_checkpoint(executor, serial = _get_lastest_checkpoint_dir(checkpoint_dir) + 1 cur_dir = _get_serial_dir(checkpoint_dir, serial) + save_trainer_args(cur_dir, trainer_id, trainer_args) + if is_chief: save_persist_vars_without_grad(executor, cur_dir, main_program) - - save_trainer_args(cur_dir, trainer_id, trainer_args) - _lru_delete(checkpoint_dir, max_num_checkpoints) + _lru_delete(checkpoint_dir, max_num_checkpoints) def need_load_checkpoint(checkpoint_dir): @@ -639,7 +639,13 @@ def _is_checkpoint_var(var): var.desc.type() == core.VarDesc.VarType.RAW: return False - if var.name.endswith("@GRAD"): + if "@GRAD" in var.name: + return False + + if ".trainer_" in var.name: + return False + + if ".block" in var.name: return False return var.persistable From e44c278e60603c37640a0a352f4bbb7f8363bebc Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 21:55:12 +0800 Subject: [PATCH 14/54] bug fix about clean --- python/paddle/fluid/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index b4f719855f..69577a98fb 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -425,7 +425,7 @@ class Trainer(object): return self._get_parallel_executor() def _clean_checkpoint(self): - if not self.checkpoint: + if not self.checkpoint and not self.chief: return io.clean_checkpoint(checkpoint_dir=self.checkpoint.checkpoint_dir) From bca4da422582990b4308932d2c20274cdb6c5a60 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 21:56:54 +0800 Subject: [PATCH 15/54] cancel only chief delete files --- python/paddle/fluid/io.py | 3 ++- python/paddle/fluid/trainer.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 8e10b01a4a..62e3046db6 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -499,7 +499,8 @@ def save_checkpoint(executor, if is_chief: save_persist_vars_without_grad(executor, cur_dir, main_program) - _lru_delete(checkpoint_dir, max_num_checkpoints) + + _lru_delete(checkpoint_dir, max_num_checkpoints) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 69577a98fb..b4f719855f 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -425,7 +425,7 @@ class Trainer(object): return self._get_parallel_executor() def _clean_checkpoint(self): - if not self.checkpoint and not self.chief: + if not self.checkpoint: return io.clean_checkpoint(checkpoint_dir=self.checkpoint.checkpoint_dir)
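The pruning that patches 13 and 15 adjust keeps only the newest max_num_checkpoints serial directories. A self-contained sketch of the same idea; prune_checkpoints is a hypothetical helper, not the patched _lru_delete itself:

    import os
    import shutil

    def prune_checkpoints(root, keep=3):
        # Serial directories are named checkpoint_<serial>; keep the `keep`
        # highest serial numbers and delete the rest.
        serials = []
        for name in os.listdir(root):
            prefix, _, serial = name.rpartition("_")
            if prefix == "checkpoint" and serial.isdigit():
                serials.append((int(serial), name))
        for _, name in sorted(serials, reverse=True)[keep:]:
            shutil.rmtree(os.path.join(root, name))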
From 46f2688f3051b0bbeb070d05159922e8b689720e Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 31 May 2018 09:53:41 +0800 Subject: [PATCH 16/54] bug fix --- python/paddle/fluid/trainer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index b4f719855f..3354d77ace 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -356,10 +356,14 @@ class Trainer(object): self._train_by_any_executor(event_handler, exe, num_epochs, reader) def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): - epochs = [ - epoch_id for epoch_id in range(num_epochs) - if epoch_id >= self.checkpoint.epoch_id - ] + if self.checkpoint: + epochs = [ + epoch_id for epoch_id in range(num_epochs) + if epoch_id >= self.checkpoint.epoch_id + ] + else: + epochs = [epoch_id for epoch_id in range(num_epochs)] + for epoch_id in epochs: event_handler(BeginEpochEvent(epoch_id)) for step_id, data in enumerate(reader()): From f9d93bfde1bd69d84a10cb676f0aba52b1596edd Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Thu, 31 May 2018 16:42:00 +0800 Subject: [PATCH 17/54] Add document to random crop operator --- paddle/fluid/operators/random_crop_op.cc | 6 +-- .../fluid/layers/layer_function_generator.py | 53 ++++++++++++++++--- python/paddle/fluid/layers/nn.py | 29 +++++++++- 3 files changed, 75 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index b14b559e31..371cdb5b85 100644 --- a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -36,11 +36,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Seed", "The random seed."); AddOutput("Out", "The cropped instance batch."); AddOutput("SeedOut", "The random seed after random cropping.") - .AsDispensable(); + .AsIntermediate(); AddAttr>("shape", "The shape of a cropped instance."); AddComment(R"DOC( - This operator takes a batch of instances and does random cropping on each instance. - It means that cropping positions differ on each instance, which is determined + This operator takes a batch of instances and does random cropping on each instance. + It means that cropping positions differ on each instance, which is determined by a uniform random generator. All cropped instances have the same shape, which is determined by the operator's attribute 'shape'. )DOC"); diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 295d1b7190..6026237d0b 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -15,16 +15,13 @@ import re import cStringIO import functools import warnings +import string from ..proto import framework_pb2 from ..framework import OpProtoHolder, Variable from ..layer_helper import LayerHelper -__all__ = [ - 'deprecated', - 'generate_layer_fn', - 'autodoc', -] +__all__ = ['deprecated', 'generate_layer_fn', 'autodoc', 'templatedoc'] def _convert_(name): @@ -43,6 +40,10 @@ def _convert_(name): return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() +def _type_to_str_(tp): + return framework_pb2.AttrType.Name(tp) + + def _generate_doc_string_(op_proto): """ Generate docstring by OpProto @@ -54,9 +55,6 @@ def _generate_doc_string_(op_proto): str: the document string """ - def _type_to_str_(tp): - return framework_pb2.AttrType.Name(tp) - if not isinstance(op_proto, framework_pb2.OpProto): raise TypeError("OpProto should be `framework_pb2.OpProto`") @@ -220,3 +218,42 @@ def autodoc(comment=""): return func return __impl__ + + +def templatedoc(): + """ + Decorator of layer function. It will use the docstring from the layer + function as the template. The template arguments are: + + * ${comment}: The operator comment written in CPP. + * ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput, + and AddInput. The ${name} is Python snake style. i.e., xxx_xxx.
+ * ${{name}_type}: The type of ${name}. + + + Returns: + Decorated funciton. + """ + + def __impl__(func): + op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) + tmpl = string.Template(func.__doc__) + args = {"comment": " ".join(op_proto.comment.split())} + for each_input in op_proto.inputs: + input_name = _convert_(each_input.name) + args["{0}_comment".format(input_name)] = each_input.comment + args["{0}_type".format(input_name)] = "Variable" + for each_attr in op_proto.attrs: + input_name = _convert_(each_attr.name) + args["{0}_comment".format(input_name)] = each_attr.comment + args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type) + + for each_opt in op_proto.outputs: + output_name = _convert_(each_opt.name) + args["{0}_comment".format(output_name)] = each_opt.comment + args["{0}_type".format(output_name)] = "Variable" + + func.__doc__ = tmpl.substitute(args) + return func + + return __impl__ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 63ec831514..acebeaebbb 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -19,9 +19,10 @@ from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable from ..param_attr import ParamAttr -from layer_function_generator import autodoc +from layer_function_generator import autodoc, templatedoc from tensor import concat import utils +import random __all__ = [ 'fc', @@ -3992,10 +3993,34 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): return out -def random_crop(input, shape, seed=1): +@templatedoc() +def random_crop(x, shape, seed=None): + """ + **Random crop operator** + + ${comment} + + Examples: + >>> img = fluid.layers.data("img", [3, 256, 256]) + >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224]) + + Args: + x(${x_type}): ${x_comment} + shape(${shape_type}): ${shape_comment} + seed(int|${seed_type}|None): ${seed_comment} By default, the seed will + get from `random.randint(-65536, 65535)`. + + Returns: + ${out_comment} + + """ + helper = LayerHelper("random_crop", **locals()) dtype = helper.input_dtype() out = helper.create_tmp_variable(dtype) + if seed is None: + seed = random.randint(-65536, 65535) + if isinstance(seed, int): seed_value = seed seed = helper.create_tmp_variable(dtype="int64") From 7c00e164e5886bb430ff945f2de091a2d45ff811 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Thu, 31 May 2018 17:07:42 +0800 Subject: [PATCH 18/54] Add More comments --- .../fluid/layers/layer_function_generator.py | 13 ++++++--- python/paddle/fluid/layers/nn.py | 27 +++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 6026237d0b..fb5e454e94 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -230,15 +230,22 @@ def templatedoc(): and AddInput. The ${name} is Python snake style. i.e., xxx_xxx. * ${{name}_type}: The type of ${name}. - Returns: - Decorated funciton. + Decorated function. 
""" def __impl__(func): op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) tmpl = string.Template(func.__doc__) - args = {"comment": " ".join(op_proto.comment.split())} + + comment_lines = op_proto.comment.split("\n") + comment = "" + for line in comment_lines: + line = line.lstrip() + comment += line + comment += "\n" + + args = {"comment": comment} for each_input in op_proto.inputs: input_name = _convert_(each_input.name) args["{0}_comment".format(input_name)] = each_input.comment diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index acebeaebbb..970a186fac 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -799,7 +799,22 @@ def gru_unit(input, return updated_hidden, reset_hidden_pre, gate +@templatedoc() def linear_chain_crf(input, label, param_attr=None): + """ + Linear Chain CRF. + + ${comment} + + Args: + input(${emission_type}): ${emission_comment} + label(${label_type}): ${label_comment} + param_attr(ParamAttr): The attribute of the learnable parameter. + + Returns: + ${log_likelihood_comment} + + """ helper = LayerHelper('linear_chain_crf', **locals()) size = input.shape[1] transition = helper.create_parameter( @@ -825,7 +840,19 @@ def linear_chain_crf(input, label, param_attr=None): return log_likelihood +@templatedoc() def crf_decoding(input, param_attr, label=None): + """ + ${comment} + + Args: + input(${emission_type}): ${emission_comment} + param_attr(ParamAttr): The parameter attribute for training. + label(${label_type}): ${label_comment} + + Returns: + ${viterbi_path_comment} + """ helper = LayerHelper('crf_decoding', **locals()) transition = helper.get_parameter(param_attr.name) viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype()) From 018d411075d40070b1efbae0f86185a819d35586 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Thu, 31 May 2018 17:09:26 +0800 Subject: [PATCH 19/54] Remove unnecessary func name --- python/paddle/fluid/layers/nn.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 970a186fac..5e139a2653 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4023,8 +4023,6 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): @templatedoc() def random_crop(x, shape, seed=None): """ - **Random crop operator** - ${comment} Examples: From 7973d9b4b5b3ef032c13410401b8c368220cd21d Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 1 Jun 2018 10:09:31 +0800 Subject: [PATCH 20/54] bug fix --- python/paddle/fluid/trainer.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 3354d77ace..72168886fd 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -178,10 +178,11 @@ class Trainer(object): exe.run(self.startup_program) if self.checkpoint and self.checkpoint.load_serial: - exe = executor.Executor(place) - io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, - self.checkpoint.load_serial, - self.startup_program) + with self._prog_and_scope_guard(): + exe = executor.Executor(place) + io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, + self.checkpoint.load_serial, + self.startup_program) if not self.checkpoint.is_pserver: epoch_id, step_id = io.load_trainer_args( From c06f43bbb6aec4ae12d514ca92a77aed0d473882 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 4 Jun 2018 15:20:06 +0800 Subject: [PATCH 21/54] add 
annotation about _is_checkpoint_var --- python/paddle/fluid/io.py | 5 +++-- tools/codestyle/docstring_checker.pyc | Bin 12561 -> 12561 bytes 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 62e3046db6..75146fe326 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -639,13 +639,14 @@ def _is_checkpoint_var(var): var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ var.desc.type() == core.VarDesc.VarType.RAW: return False - + # @GRAD are named for gradient varibales, checkpoint will not save it. if "@GRAD" in var.name: return False - + # .trainer_ are named for distribute trian variables, checkpoint will not save it. if ".trainer_" in var.name: return False + # .block is named for distribute trian variables, checkpoint will not save it. if ".block" in var.name: return False diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc index 1f0255b763c8d154b7cc17d2d525148dfea3b42d..a27d3c9a8cccab8552d510578debb2df04eb53bb 100644 GIT binary patch delta 16 XcmbQ3G%<;t`7|BNbF9rm> delta 16 XcmbQ3G%<;t`7|BNbF*pSi From 08e5f0ae482c1e70dc74c4677e5cb699b38c433e Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 4 Jun 2018 16:10:11 +0800 Subject: [PATCH 22/54] rename need_load_checkpoint to get_latest_checkpoint_serial --- python/paddle/fluid/io.py | 4 ++-- python/paddle/fluid/trainer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 75146fe326..111907b575 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -25,7 +25,7 @@ __all__ = [ 'load_persistables', 'save_inference_model', 'load_inference_model', 'get_inference_program', 'save_checkpoint', 'load_checkpoint', 'clean_checkpoint', 'load_persist_vars_without_grad', - 'save_persist_vars_without_grad' + 'save_persist_vars_without_grad', 'get_latest_checkpoint_serial' ] @@ -503,7 +503,7 @@ def save_checkpoint(executor, _lru_delete(checkpoint_dir, max_num_checkpoints) -def need_load_checkpoint(checkpoint_dir): +def get_latest_checkpoint_serial(checkpoint_dir): """ If the directory have checkpoint files, it will return lastest checkpoint directory serial number diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 72168886fd..3c32ec1de8 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -146,7 +146,7 @@ class Trainer(object): "The checkpoint_config shoule be an instance of CheckpointConfig" ) else: - self.checkpoint.load_serial = io.need_load_checkpoint( + self.checkpoint.load_serial = io.get_latest_checkpoint_serial( self.checkpoint.checkpoint_dir) self.scope = core.Scope() From bfdcf18707c79f2cc29b0903cb9f4fab2e907490 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 4 Jun 2018 21:10:38 +0800 Subject: [PATCH 23/54] grammar optimized. 
--- python/paddle/fluid/io.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 111907b575..b5d96441bc 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -492,7 +492,7 @@ def save_checkpoint(executor, if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir) - serial = _get_lastest_checkpoint_dir(checkpoint_dir) + 1 + serial = _get_latest_checkpoint_dir(checkpoint_dir) + 1 cur_dir = _get_serial_dir(checkpoint_dir, serial) save_trainer_args(cur_dir, trainer_id, trainer_args) @@ -505,11 +505,11 @@ def save_checkpoint(executor, def get_latest_checkpoint_serial(checkpoint_dir): """ - If the directory have checkpoint files, it will return lastest checkpoint directory serial number + If the directory have checkpoint files, it will return latest checkpoint directory serial number :param checkpoint_dir """ - serial = _get_lastest_checkpoint_dir(checkpoint_dir) + serial = _get_latest_checkpoint_dir(checkpoint_dir) if serial < 0: return None return serial @@ -639,14 +639,14 @@ def _is_checkpoint_var(var): var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ var.desc.type() == core.VarDesc.VarType.RAW: return False - # @GRAD are named for gradient varibales, checkpoint will not save it. + # @GRAD are named for gradient variables, checkpoint will not save it. if "@GRAD" in var.name: return False - # .trainer_ are named for distribute trian variables, checkpoint will not save it. + # .trainer_ are named for distribute train variables, checkpoint will not save it. if ".trainer_" in var.name: return False - # .block is named for distribute trian variables, checkpoint will not save it. + # .block is named for distribute train variables, checkpoint will not save it. 
if ".block" in var.name: return False @@ -656,7 +656,6 @@ def _is_checkpoint_var(var): def _get_dir_serial(dirname): _, serial = dirname.split(CHECKPOINT_SEPARATOR) - serial_num = -1 try: serial_num = int(serial) except ValueError: @@ -723,7 +722,7 @@ def _write_success(dirname): f.write(now) -def _get_lastest_checkpoint_dir(checkpoint_dir): +def _get_latest_checkpoint_dir(checkpoint_dir): """ get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory From 9735f25011b04116d271861fde8df05def81c3ce Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 5 Jun 2018 14:47:13 +0800 Subject: [PATCH 24/54] optimized --- python/paddle/fluid/io.py | 44 +++++++++++++--------------------- python/paddle/fluid/trainer.py | 8 +++---- 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index b5d96441bc..5abadc73f7 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -492,7 +492,7 @@ def save_checkpoint(executor, if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir) - serial = _get_latest_checkpoint_dir(checkpoint_dir) + 1 + serial = get_latest_checkpoint_serial(checkpoint_dir) + 1 cur_dir = _get_serial_dir(checkpoint_dir, serial) save_trainer_args(cur_dir, trainer_id, trainer_args) @@ -503,18 +503,6 @@ def save_checkpoint(executor, _lru_delete(checkpoint_dir, max_num_checkpoints) -def get_latest_checkpoint_serial(checkpoint_dir): - """ - If the directory have checkpoint files, it will return latest checkpoint directory serial number - - :param checkpoint_dir - """ - serial = _get_latest_checkpoint_dir(checkpoint_dir) - if serial < 0: - return None - return serial - - def load_checkpoint(executor, checkpoint_dir, serial, main_program): """ Load checkpoint from a directory by executor, @@ -527,17 +515,16 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program): """ if checkpoint_dir is None: - raise ValueError( - "The values of 'checkpoint_dir' or 'serial' should not be None") + raise ValueError("The values of 'checkpoint_dir' should not be None") if serial is None or serial < 0: raise ValueError("The values of 'serial' should not be None or <0 ") if main_program is None: - raise ValueError("The values of 'main_program'should not be None") + raise ValueError('main_program should not be None.') cur_dir = _get_serial_dir(checkpoint_dir, serial) - load_persist_vars_without_grad(executor, cur_dir, main_program) + load_persist_vars_without_grad(executor, cur_dir, main_program, True) def clean_checkpoint(checkpoint_dir, delete_dir=False): @@ -557,18 +544,21 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): os.rmdir(checkpoint_dir) -def load_persist_vars_without_grad(executor, dirname, program, nest=True): +def load_persist_vars_without_grad(executor, + dirname, + program, + has_model_dir=False): """ load_persist_vars_without_grad will load variables from a directory by an executor, the variable named end with "@GRAD" will not be loaded. 
 
-    :param executor
-    :param dirname
-    :param program
-    :param nest
+    :param executor executor for load the value
+    :param dirname the checkpoint directory
+    :param program will load all variables in program
+    :param has_model_dir if has_model_dir is True, will load variables from sub directory named __model__
     """
-    if nest:
+    if has_model_dir:
         dirname = _get_model_dir(dirname)
 
     load_vars(
@@ -584,9 +574,9 @@ def save_persist_vars_without_grad(executor, dirname, program):
     save_persist_vars_without_grad will save variables to a directory by an executor,
     the variable named end with "@GRAD" will not be saved.
 
-    :param executor
-    :param dirname
-    :param program
+    :param executor executor for save the value
+    :param dirname the checkpoint directory
+    :param program will save all variables in program
     """
     cur_dir = _get_model_dir(dirname)
     save_vars(
@@ -722,7 +712,7 @@ def _write_success(dirname):
         f.write(now)
 
 
-def _get_latest_checkpoint_dir(checkpoint_dir):
+def get_latest_checkpoint_serial(checkpoint_dir):
     """
     get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory
 
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index 3c32ec1de8..fbdd28f53e 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -146,8 +146,9 @@ class Trainer(object):
                 "The checkpoint_config shoule be an instance of CheckpointConfig"
             )
         else:
-            self.checkpoint.load_serial = io.get_latest_checkpoint_serial(
+            serial = io.get_latest_checkpoint_serial(
                 self.checkpoint.checkpoint_dir)
+            self.checkpoint.load_serial = serial if serial >= 0 else None
 
         self.scope = core.Scope()
 
@@ -194,10 +195,7 @@ class Trainer(object):
         if param_path and os.path.isdir(param_path):
             # load params from param_path into scope
             io.load_persist_vars_without_grad(
-                exe,
-                dirname=param_path,
-                program=self.startup_program,
-                nest=False)
+                exe, dirname=param_path, program=self.startup_program)
 
     def _transpile_nccl2_dist(self):
         # PADDLE_TRAINER_IPS
From be16af3b04b3052e35e6d9157cec302274a629a4 Mon Sep 17 00:00:00 2001
From: tangwei12
Date: Tue, 5 Jun 2018 14:48:15 +0800
Subject: [PATCH 25/54] delete pyc

---
 tools/codestyle/docstring_checker.pyc | Bin 12561 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 tools/codestyle/docstring_checker.pyc

diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc
deleted file mode 100644
index a27d3c9a8cccab8552d510578debb2df04eb53bb..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12561
[base85-encoded binary payload omitted]
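The deleted `docstring_checker.pyc` above is a byte-compiled artifact that should never have been tracked. A removal patch like PATCH 25 is what a plain `git rm` produces; an illustrative reproduction (these exact commands are an assumption, not part of the series):

    git rm tools/codestyle/docstring_checker.pyc
    git commit -m "delete pyc"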
From eea5762e26a9a6ae2d9642830031028e5952af45 Mon Sep 17 00:00:00 2001
From: tangwei12
Date: Tue, 5 Jun 2018 17:04:17 +0800
Subject: [PATCH 26/54] add checkpoint unittest

---
 .../fluid/tests/unittests/test_checkpoint.py |  72 ++++++++++++++++++
 tools/codestyle/docstring_checker.pyc        | Bin 0 -> 12561 bytes
 2 files changed, 72 insertions(+)
 create mode 100644 python/paddle/fluid/tests/unittests/test_checkpoint.py
 create mode 100644 tools/codestyle/docstring_checker.pyc

diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py
new file mode 100644
index 0000000000..b8d82c59b4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import unittest
+
+
+class TestCheckpoint(unittest.TestCase):
+    def setUp(self):
+        self.dirname = "/tmp/ckpt"
+        self.max_num_checkpoints = 3
+        self.epoch_interval = 1
+        self.step_interval = 1
+        self.trainer_id = 0
+        self.chief = self.trainer_id == 0
+        self.place = fluid.CPUPlace()
+        self.epoch_id = 100
+        self.step_id = 20
+
+    def test_checkpoint(self):
+        self.save_checkpoint()
+        serial = fluid.io.get_latest_checkpoint_serial(self.dirname)
+        self.assertTrue(serial >= 0)
+        trainer_args = ["epoch_id", "step_id"]
+        epoch_id, step_id = fluid.io.load_trainer_args(
+            self.dirname, serial, self.trainer_id, trainer_args)
+        self.assertEqual(self.step_id, step_id)
+        self.assertEqual(self.epoch_id, epoch_id)
+
+        program = fluid.Program()
+        with fluid.program_guard(program):
+            exe = fluid.Executor(self.place)
+            fluid.io.load_checkpoint(exe, self.dirname, serial, program)
+
+        fluid.io.clean_checkpoint(self.dirname, delete_dir=True)
+
+    def save_checkpoint(self):
+        config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints,
+                                        self.epoch_interval, self.step_interval)
+
+        trainer_args = {}
+        trainer_args["epoch_id"] = self.epoch_id
+        trainer_args["step_id"] = self.step_id
+
+        program = fluid.Program()
+        with fluid.program_guard(program):
+            program.global_block().create_var(
+                name="scale_0",
+                persistable=True,
+                dtype="float32",
+                shape=[32, 32])
+
+        exe = fluid.Executor(self.place)
+        for i in xrange(10):
+            fluid.io.save_checkpoint(
+                exe, config.checkpoint_dir, self.trainer_id, self.chief,
+                trainer_args, program, config.max_num_checkpoints)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a27d3c9a8cccab8552d510578debb2df04eb53bb
GIT binary patch
literal 12561
[base85-encoded binary payload omitted]

literal 0
HcmV?d00001
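PATCH 26 pins down the public checkpoint API end to end. Stripped of the unittest scaffolding, the round trip it verifies looks like this sketch (paths and values are illustrative, and the positional `save_checkpoint` arguments match the signature at this point in the series, before a later patch removes `is_chief`):

    import paddle.fluid as fluid

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    program = fluid.Program()
    with fluid.program_guard(program):
        program.global_block().create_var(
            name="scale_0", persistable=True, dtype="float32", shape=[32, 32])

    # Each save creates checkpoint_dir/checkpoint_<serial>/ with a _SUCCESS
    # marker; only max_num_checkpoints serial directories are retained.
    trainer_args = {"epoch_id": 100, "step_id": 20}
    fluid.io.save_checkpoint(exe, "/tmp/ckpt", 0, True, trainer_args, program, 3)

    serial = fluid.io.get_latest_checkpoint_serial("/tmp/ckpt")  # -1 when nothing saved
    if serial >= 0:
        fluid.io.load_checkpoint(exe, "/tmp/ckpt", serial, program)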
From 951fa7441c7eff3596735ac55dda01288870aab6 Mon Sep 17 00:00:00 2001
From: tangwei12
Date: Tue, 5 Jun 2018 17:04:45 +0800
Subject: [PATCH 27/54] add checkpoint unittest

---
 tools/codestyle/docstring_checker.pyc | Bin 12561 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 tools/codestyle/docstring_checker.pyc

diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc
deleted file mode 100644
index a27d3c9a8cccab8552d510578debb2df04eb53bb..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12561
[base85-encoded binary payload omitted]
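The same `.pyc` sneaks back in with the unittest commit and has to be deleted again here, one patch later. A repository-level ignore rule would prevent the round trip; a minimal `.gitignore` sketch (an assumption for illustration only; this series does not actually add one):

    *.pyc
    __pycache__/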
From 3b5e3f9be4b97f15aac809b851cb328bbf424437 Mon Sep 17 00:00:00 2001
From: tangwei12
Date: Tue, 5 Jun 2018 18:05:06 +0800
Subject: [PATCH 28/54] update checkpoint unittest

---
 python/paddle/fluid/tests/unittests/test_checkpoint.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py
index b8d82c59b4..150e8822d5 100644
--- a/python/paddle/fluid/tests/unittests/test_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py
@@ -14,6 +14,7 @@
 
 import paddle.fluid as fluid
 import unittest
+import os
 
 
 class TestCheckpoint(unittest.TestCase):
@@ -35,8 +36,8 @@ class TestCheckpoint(unittest.TestCase):
         trainer_args = ["epoch_id", "step_id"]
         epoch_id, step_id = fluid.io.load_trainer_args(
             self.dirname, serial, self.trainer_id, trainer_args)
-        self.assertEqual(self.step_id, step_id)
-        self.assertEqual(self.epoch_id, epoch_id)
+        self.assertEqual(self.step_id, int(step_id))
+        self.assertEqual(self.epoch_id, int(epoch_id))
 
         program = fluid.Program()
         with fluid.program_guard(program):
@@ -44,6 +45,7 @@ class TestCheckpoint(unittest.TestCase):
             fluid.io.load_checkpoint(exe, self.dirname, serial, program)
 
         fluid.io.clean_checkpoint(self.dirname, delete_dir=True)
+        self.assertFalse(os.path.isdir(self.dirname))
 
     def save_checkpoint(self):
         config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints,
From 6db240d78b3b515a1b2d885e8cc6d8e0b2ffd638 Mon Sep 17 00:00:00 2001
From: tangwei12
Date: Tue, 5 Jun 2018 19:25:55 +0800
Subject: [PATCH 29/54] update trainer about epoch_id and step id

---
 python/paddle/fluid/trainer.py | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index fbdd28f53e..4ffc206458 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -188,7 +188,7 @@ class Trainer(object):
             if not
self.checkpoint.is_pserver: epoch_id, step_id = io.load_trainer_args( self.checkpoint.checkpoint_dir, self.checkpoint.load_serial, - self.trainer_id, ["epoch_id", "step_id"]) + self.trainer_id, self._get_checkpoint_load_args()) self.checkpoint.epoch_id = int(epoch_id) self.checkpoint.step_id = int(step_id) @@ -432,22 +432,33 @@ class Trainer(object): return io.clean_checkpoint(checkpoint_dir=self.checkpoint.checkpoint_dir) + def _get_checkpoint_load_args(self): + """ + epoch_id and step_id are runtime arguments, they are not variables, will load them independently. + """ + return ["epoch_id", "step_id"] + + def _get_checkpoint_save_args(self, epoch_id, step_id): + """ + epoch_id and step_id are runtime arguments, they are not variables, will save them independently. + """ + trainer_args = {} + trainer_args["epoch_id"] = epoch_id + trainer_args["step_id"] = step_id + return trainer_args + def _save_checkpoint(self, epoch_id, step_id): if not self.checkpoint: return if epoch_id % self.checkpoint.epoch_interval == 0 and step_id % self.checkpoint.step_interval == 0: - trainer_args = {} - trainer_args["epoch_id"] = epoch_id - trainer_args["step_id"] = step_id - exe = executor.Executor(self.place) io.save_checkpoint( executor=exe, checkpoint_dir=self.checkpoint.checkpoint_dir, trainer_id=self.trainer_id, is_chief=self.chief, - trainer_args=trainer_args, + trainer_args=self._get_checkpoint_save_args(epoch_id, step_id), main_program=self.train_program, max_num_checkpoints=self.checkpoint.max_num_checkpoints) From f28f41dbcdb0479d98682b94eb13db95112de424 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 5 Jun 2018 19:40:41 +0800 Subject: [PATCH 30/54] update io.py annotations and codes --- python/paddle/fluid/io.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 5abadc73f7..8fcc778709 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -483,11 +483,11 @@ def save_checkpoint(executor, :param main_program :param max_num_checkpoints """ - if checkpoint_dir is None: - raise ValueError("The values of 'checkpoint_dir' should not be None") + if checkpoint_dir.strip() is None: + raise ValueError("'checkpoint_dir' should not be None") - if trainer_args and not isinstance(trainer_args, dict): - raise TypeError("The type of 'trainer_args' should be dict") + if trainer_args: + assert isinstance(trainer_args, dict) if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir) @@ -514,11 +514,11 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program): :param main_program """ - if checkpoint_dir is None: - raise ValueError("The values of 'checkpoint_dir' should not be None") + if checkpoint_dir.strip() is None: + raise ValueError("'checkpoint_dir' should not be None") if serial is None or serial < 0: - raise ValueError("The values of 'serial' should not be None or <0 ") + raise ValueError("'serial' should not be None or <0 ") if main_program is None: raise ValueError('main_program should not be None.') @@ -536,8 +536,8 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): :param delete_dir """ - if checkpoint_dir is None: - raise ValueError("The values of 'checkpoint_dir' should not be None") + if checkpoint_dir.strip() is None: + raise ValueError("'checkpoint_dir' should not be None") _lru_delete(checkpoint_dir, max_num_checkpoints=0) if delete_dir and not os.listdir(checkpoint_dir): @@ -590,8 +590,8 @@ def save_persist_vars_without_grad(executor, dirname, 
program): def save_trainer_args(dirname, trainer_id, trainer_args): - if not isinstance(trainer_args, dict): - raise TypeError("The type of 'trainer_args' should be dict") + assert isinstance(trainer_args, dict) + cur_dir = _get_trainer_dir(dirname, trainer_id) for name, value in trainer_args.iteritems(): @@ -602,12 +602,11 @@ def save_trainer_args(dirname, trainer_id, trainer_args): def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): + assert isinstance(trainer_args, list) + cur_dir = _get_serial_dir(checkpoint_dir, serial) cur_dir = _get_trainer_dir(cur_dir, trainer_id) - if not isinstance(trainer_args, list): - raise TypeError("The type of 'trainer_args' should be list") - ret_values = [] for arg in trainer_args: From 53409a29d889903ec1414d72f0455fe4ef6588a6 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 5 Jun 2018 22:00:30 +0800 Subject: [PATCH 31/54] code optimized --- python/paddle/fluid/trainer.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 4ffc206458..9882d5cda0 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -141,14 +141,10 @@ class Trainer(object): self.chief = True self.checkpoint = checkpoint_config if self.checkpoint: - if not isinstance(self.checkpoint, CheckpointConfig): - raise TypeError( - "The checkpoint_config shoule be an instance of CheckpointConfig" - ) - else: - serial = io.get_latest_checkpoint_serial( - self.checkpoint.checkpoint_dir) - self.checkpoint.load_serial = serial if serial >= 0 else None + assert isinstance(self.checkpoint, CheckpointConfig) + serial = io.get_latest_checkpoint_serial( + self.checkpoint.checkpoint_dir) + self.checkpoint.load_serial = serial if serial >= 0 else None self.scope = core.Scope() @@ -385,8 +381,8 @@ class Trainer(object): else: metrics = exe.run(feed=data, fetch_list=[]) - event_handler(EndStepEvent(epoch_id, step_id, metrics)) self._save_checkpoint(epoch_id, step_id) + event_handler(EndStepEvent(epoch_id, step_id, metrics)) event_handler(EndEpochEvent(epoch_id)) self._clean_checkpoint() From 2f44585e831578b58b53ce5d4b6adcb0275530ce Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 6 Jun 2018 17:26:51 +0800 Subject: [PATCH 32/54] code optimized --- python/paddle/fluid/io.py | 42 +++++++------ .../fluid/tests/unittests/test_checkpoint.py | 3 +- python/paddle/fluid/trainer.py | 60 +++++++++---------- 3 files changed, 52 insertions(+), 53 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 8fcc778709..34c527b62f 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -476,14 +476,14 @@ def save_checkpoint(executor, to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most, The interval between two saved checkpoints must greater than save_interval_secs. 
- :param executor - :param checkpoint_dir - :param trainer_id - :param is_chief - :param main_program - :param max_num_checkpoints - """ - if checkpoint_dir.strip() is None: + :param executor executor for save the value + :param checkpoint_dir the checkpoint directory + :param trainer_id currect trainer id + :param is_chief if the trainer id equals 0, the is_chief will be true + :param main_program will save all variables in program + :param max_num_checkpoints will keep numbers of checkpoint serials not bigger than max_num_checkpoints + """ + if checkpoint_dir is None: raise ValueError("'checkpoint_dir' should not be None") if trainer_args: @@ -500,7 +500,7 @@ def save_checkpoint(executor, if is_chief: save_persist_vars_without_grad(executor, cur_dir, main_program) - _lru_delete(checkpoint_dir, max_num_checkpoints) + _scroll_delete(checkpoint_dir, max_num_checkpoints) def load_checkpoint(executor, checkpoint_dir, serial, main_program): @@ -508,13 +508,13 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program): Load checkpoint from a directory by executor, it will find the most recent saved checkpoint file and load it auto. - :param executor - :param checkpoint_dir - :param serial - :param main_program + :param executor executor for load the value + :param checkpoint_dir the checkpoint directory + :param serial the serial folder in checkpoint directory will be load + :param main_program will load all variables in program """ - if checkpoint_dir.strip() is None: + if checkpoint_dir is None: raise ValueError("'checkpoint_dir' should not be None") if serial is None or serial < 0: @@ -536,9 +536,9 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): :param delete_dir """ - if checkpoint_dir.strip() is None: + if checkpoint_dir is None: raise ValueError("'checkpoint_dir' should not be None") - _lru_delete(checkpoint_dir, max_num_checkpoints=0) + _scroll_delete(checkpoint_dir, max_num_checkpoints=0) if delete_dir and not os.listdir(checkpoint_dir): os.rmdir(checkpoint_dir) @@ -681,7 +681,7 @@ def _get_trainer_dir(dirname, trainer_id): return trainer_dir -def _lru_delete(dirname, max_num_checkpoints=3): +def _scroll_delete(dirname, max_num_checkpoints=3): dirs = os.listdir(dirname) serial_map = {} for serial in dirs: @@ -717,7 +717,7 @@ def get_latest_checkpoint_serial(checkpoint_dir): :param checkpoint_dir """ - if not checkpoint_dir.strip(): + if not checkpoint_dir: return -1 def has_success(checkpoint_dir, cur_dir): @@ -726,10 +726,8 @@ def get_latest_checkpoint_serial(checkpoint_dir): """ serial = _get_dir_serial(cur_dir) - if serial == -1: - return -1 - - if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)): + if serial == -1 or not os.path.isdir( + os.path.join(checkpoint_dir, cur_dir)): return -1 success_path = os.path.join( diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py index 150e8822d5..cf70dfd448 100644 --- a/python/paddle/fluid/tests/unittests/test_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py @@ -15,11 +15,12 @@ import paddle.fluid as fluid import unittest import os +import tempfile class TestCheckpoint(unittest.TestCase): def setUp(self): - self.dirname = "/tmp/ckpt" + self.dirname = tempfile.mktemp() self.max_num_checkpoints = 3 self.epoch_interval = 1 self.step_interval = 1 diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 9882d5cda0..e5cec4c76a 100644 --- a/python/paddle/fluid/trainer.py +++ 
b/python/paddle/fluid/trainer.py @@ -132,19 +132,18 @@ class Trainer(object): # 1. we need to generate a framework.Program by calling # program_func. Reference: fluid.program_guard in # test_word2vec.py - if not isinstance(optimizer, opt_module.Optimizer): - raise TypeError("The optimizer should be an instance of Optimizer") + assert isinstance(optimizer, opt_module.Optimizer) # config for checkpoint # only chief worker will save variables self.trainer_id = 0 self.chief = True - self.checkpoint = checkpoint_config - if self.checkpoint: - assert isinstance(self.checkpoint, CheckpointConfig) + self.checkpoint_cfg = checkpoint_config + if self.checkpoint_cfg: + assert isinstance(self.checkpoint_cfg, CheckpointConfig) serial = io.get_latest_checkpoint_serial( - self.checkpoint.checkpoint_dir) - self.checkpoint.load_serial = serial if serial >= 0 else None + self.checkpoint_cfg.checkpoint_dir) + self.checkpoint_cfg.load_serial = serial if serial >= 0 else None self.scope = core.Scope() @@ -174,19 +173,20 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if self.checkpoint and self.checkpoint.load_serial: + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial: with self._prog_and_scope_guard(): exe = executor.Executor(place) - io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, - self.checkpoint.load_serial, + io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir, + self.checkpoint_cfg.load_serial, self.startup_program) - if not self.checkpoint.is_pserver: + if not self.checkpoint_cfg.is_pserver: epoch_id, step_id = io.load_trainer_args( - self.checkpoint.checkpoint_dir, self.checkpoint.load_serial, - self.trainer_id, self._get_checkpoint_load_args()) - self.checkpoint.epoch_id = int(epoch_id) - self.checkpoint.step_id = int(step_id) + self.checkpoint_cfg.checkpoint_dir, + self.checkpoint_cfg.load_serial, self.trainer_id, + self._get_checkpoint_load_args()) + self.checkpoint_cfg.epoch_id = int(epoch_id) + self.checkpoint_cfg.step_id = int(step_id) if param_path and os.path.isdir(param_path): # load params from param_path into scope @@ -256,7 +256,7 @@ class Trainer(object): t.transpile( self.trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": - if self.checkpoint: + if self.checkpoint_cfg: self.is_pserver = True self.train_program = t.get_pserver_program(current_endpoint) @@ -351,10 +351,10 @@ class Trainer(object): self._train_by_any_executor(event_handler, exe, num_epochs, reader) def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): - if self.checkpoint: + if self.checkpoint_cfg: epochs = [ epoch_id for epoch_id in range(num_epochs) - if epoch_id >= self.checkpoint.epoch_id + if epoch_id >= self.checkpoint_cfg.epoch_id ] else: epochs = [epoch_id for epoch_id in range(num_epochs)] @@ -366,8 +366,8 @@ class Trainer(object): self._clean_checkpoint() return - if self.checkpoint and self.checkpoint.load_serial \ - and self.checkpoint.step_id >= step_id and self.checkpoint.epoch_id == epoch_id: + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \ + and self.checkpoint_cfg.step_id >= step_id and self.checkpoint_cfg.epoch_id == epoch_id: continue begin_event = BeginStepEvent(epoch_id, step_id) @@ -381,10 +381,12 @@ class Trainer(object): else: metrics = exe.run(feed=data, fetch_list=[]) - self._save_checkpoint(epoch_id, step_id) + if self.checkpoint_cfg: + self._save_checkpoint(epoch_id, step_id) event_handler(EndStepEvent(epoch_id, step_id, metrics)) 
event_handler(EndEpochEvent(epoch_id)) - self._clean_checkpoint() + if self.checkpoint_cfg: + self._clean_checkpoint() def _test_by_executor(self, reader, feed_order, fetch_list): with executor.scope_guard(self.scope): @@ -424,9 +426,8 @@ class Trainer(object): return self._get_parallel_executor() def _clean_checkpoint(self): - if not self.checkpoint: - return - io.clean_checkpoint(checkpoint_dir=self.checkpoint.checkpoint_dir) + assert self.checkpoint_cfg + io.clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir) def _get_checkpoint_load_args(self): """ @@ -444,19 +445,18 @@ class Trainer(object): return trainer_args def _save_checkpoint(self, epoch_id, step_id): - if not self.checkpoint: - return + assert self.checkpoint_cfg - if epoch_id % self.checkpoint.epoch_interval == 0 and step_id % self.checkpoint.step_interval == 0: + if epoch_id % self.checkpoint_cfg.epoch_interval == 0 and step_id % self.checkpoint_cfg.step_interval == 0: exe = executor.Executor(self.place) io.save_checkpoint( executor=exe, - checkpoint_dir=self.checkpoint.checkpoint_dir, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, trainer_id=self.trainer_id, is_chief=self.chief, trainer_args=self._get_checkpoint_save_args(epoch_id, step_id), main_program=self.train_program, - max_num_checkpoints=self.checkpoint.max_num_checkpoints) + max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints) def build_feed_var_list(program, feed_order): From 7fbddaa64a086d1cd9bf3a9811b2b153918ed84a Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 6 Jun 2018 20:41:21 +0800 Subject: [PATCH 33/54] bug fix --- python/paddle/fluid/trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 444162664d..5230ded7db 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -365,7 +365,8 @@ class Trainer(object): event_handler(BeginEpochEvent(epoch_id)) for step_id, data in enumerate(reader()): if self.__stop: - self._clean_checkpoint() + if self.checkpoint_cfg: + self._clean_checkpoint() return if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \ From e0895e49dc75e93809c232d578ed1b31d423ae16 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 6 Jun 2018 20:53:31 +0800 Subject: [PATCH 34/54] remove some seems unused codes. --- paddle/fluid/operators/detail/request_handler.h | 7 ------- .../fluid/operators/detail/request_handler_impl.cc | 5 ----- paddle/fluid/operators/listen_and_serv_op.cc | 12 ------------ 3 files changed, 24 deletions(-) diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/detail/request_handler.h index 4bc5e7f10e..d74206aaba 100644 --- a/paddle/fluid/operators/detail/request_handler.h +++ b/paddle/fluid/operators/detail/request_handler.h @@ -80,7 +80,6 @@ class RequestHandler { } framework::ProgramDesc* program() { return program_; } framework::Executor* executor() { return executor_; } - std::vector& sparse_vars() { return sparse_vars_; } // This function processes user's rpc request. // The implemention is in request_handler_impl. 
@@ -113,13 +112,7 @@ class RequestHandler { std::unordered_map>* grad_to_prepared_ctx_; - - // Record received sparse variables, so that - // we could reset those after execute optimize program - std::vector sparse_vars_; RPCServer* rpc_server_; - - std::mutex sparse_var_mutex_; }; } // namespace detail diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc index f16c06d52f..145ee53107 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.cc +++ b/paddle/fluid/operators/detail/request_handler_impl.cc @@ -63,11 +63,6 @@ bool RequestSendHandler::Handle(const std::string& varname, PADDLE_THROW("sync: Can not find server side var"); return false; } - - if (invar->IsType()) { - std::unique_lock lock(sparse_var_mutex_); - sparse_vars_.push_back(invar); - } } return true; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 66a0f87b46..0c9d2b5a74 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -108,9 +108,6 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, std::shared_ptr(nullptr)); rpc_service_->ResetBarrierCounter(); - // Record received sparse variables, so that - // we could reset those after execute optimize program - std::vector sparse_vars; while (true) { // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. @@ -146,15 +143,6 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, recv_scope); VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)"; - // Reset the received sparse variables, the sum operator would not - // sum the input sparse variables which rows is empty at the next - // mini-batch. - // TODO(Yancey1989): move the reset action into an operator, we couldn't - // have any hide logic in the operator. 
- for (framework::Variable *var : sparse_vars) { - var->GetMutable()->mutable_rows()->clear(); - } - rpc_service_->SetCond(detail::kRequestGet); rpc_service_->WaitBarrier(detail::kRequestGet); rpc_service_->ResetBarrierCounter(); From 9e026a93cff29f1d49fac900b3110968da8594cf Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 7 Jun 2018 16:59:53 +0800 Subject: [PATCH 35/54] remove chief --- python/paddle/fluid/io.py | 6 ++---- python/paddle/fluid/trainer.py | 5 +---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 34c527b62f..6323c9899e 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -466,7 +466,6 @@ CHECKPOINT_SEPARATOR = "_" def save_checkpoint(executor, checkpoint_dir, trainer_id, - is_chief=False, trainer_args=None, main_program=None, max_num_checkpoints=3): @@ -478,8 +477,7 @@ def save_checkpoint(executor, :param executor executor for save the value :param checkpoint_dir the checkpoint directory - :param trainer_id currect trainer id - :param is_chief if the trainer id equals 0, the is_chief will be true + :param trainer_id currect trainer id, if id is equal to 0, the trainer is chief :param main_program will save all variables in program :param max_num_checkpoints will keep numbers of checkpoint serials not bigger than max_num_checkpoints """ @@ -497,7 +495,7 @@ def save_checkpoint(executor, save_trainer_args(cur_dir, trainer_id, trainer_args) - if is_chief: + if trainer_id == 0: save_persist_vars_without_grad(executor, cur_dir, main_program) _scroll_delete(checkpoint_dir, max_num_checkpoints) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 5230ded7db..2737f1c70d 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -136,7 +136,6 @@ class Trainer(object): # config for checkpoint # only chief worker will save variables self.trainer_id = 0 - self.chief = True self.checkpoint_cfg = checkpoint_config if self.checkpoint_cfg: assert isinstance(self.checkpoint_cfg, CheckpointConfig) @@ -201,7 +200,6 @@ class Trainer(object): self.nccl_id_var = None else: self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) - self.chief = self.trainer_id == 0 port = os.getenv("PADDLE_PSERVER_PORT") worker_ips = os.getenv("PADDLE_TRAINER_IPS") worker_endpoints = [] @@ -250,7 +248,7 @@ class Trainer(object): # the unique trainer id, starting from 0, needed by trainer # only self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) - self.chief = self.trainer_id == 0 + # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") with self._prog_and_scope_guard(): @@ -456,7 +454,6 @@ class Trainer(object): executor=exe, checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, trainer_id=self.trainer_id, - is_chief=self.chief, trainer_args=self._get_checkpoint_save_args(epoch_id, step_id), main_program=self.train_program, max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints) From 4f46a98fa90b9ddbcc88531079531803820874d6 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 7 Jun 2018 19:06:16 +0800 Subject: [PATCH 36/54] stash --- paddle/fluid/operators/crop_op.cc | 19 +++++++++++++++++- paddle/fluid/operators/crop_op.h | 33 ++++++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 669b3bbe9d..b5b31c7ce0 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -60,13 
+60,19 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker {
              "The input used as reference for cropping, "
              "which is of the same dimensions as X.")
         .AsDispensable();
+    AddInput("Offsets",
+             "The input used to describe offsets in runtime, which is a "
+             "1-D vector whose size equals to the rank of input 'X'. The "
+             "elements data type must be int.")
+        .AsDispensable();
     AddOutput("Out",
               "The output of crop op, "
               "which is of the same dimensions as X.");
     AddAttr<std::vector<int>>("offsets",
                               "A list describing offsets to be cropped. "
                               "The size of offsets list should be the same as "
-                              "the dimension size of input X.");
+                              "the dimension size of input X.")
+        .SetDefault(std::vector<int>());
     AddAttr<std::vector<int>>("shape",
                               "A list describing the shape of output. "
                               "The size of shape list should be the same as "
@@ -77,6 +83,17 @@ Crop Operator.
 
 Crop input into output, as specified by offsets and shape.
 
+There are two ways to set the offsets:
+1. In runtime: Using the input 'Offsets', which is a Variable and can be
+               output of other operators. This way is suitable for
+               dynamic offsets.
+2. In network configuration: Using the attribute 'offsets', which will be
+                             set in Python configure script. This way is
+                             suitable for fixed offsets.
+You CANNOT use these two ways at the same time. An exception will be raised
+if input 'Offsets' is configured and meanwhile the attribute 'offsets' is
+not empty.
+
 There are two ways to set shape:
 1. reference input: crop input X into the same shape as reference input.
                     The dimension of reference input should
diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
index f05c2e2328..d8e9f086cc 100644
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -27,6 +27,32 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 using framework::Tensor;
 
+static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
+  std::vector<int> res;
+  int rank = ctx.Input<Tensor>("X")->dims().size();
+  if (ctx.HasInput("Offsets")) {
+    PADDLE_ENFORCE(ctx.Attr<std::vector<int>>("offsets").empty(),
+                   "Input 'Offsets' and attribute 'offsets' should not be used "
+                   "at the same time.");
+    const auto* offsets_tensor = ctx.Input<Tensor>("Offsets");
+    PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1);
+    PADDLE_ENFORCE_EQ(
+        rank, offsets_tensor->dims()[0],
+        "Offsets size should be equal to dimension size of input tensor.");
+    const int* offsets_data = offsets_tensor->data<int>();
+    res.resize(rank);
+    for (size_t i = 0; i < rank; ++i) {
+      res[i] = offsets_data[i];
+    }
+  } else {
+    res = ctx.Attr<std::vector<int>>("offsets");
+    PADDLE_ENFORCE_EQ(
+        rank, res.size(),
+        "Offsets size should be equal to dimension size of input tensor.");
+  }
+  return res;
+}
+
 template <typename DeviceContext, typename T>
 class CropKernel : public framework::OpKernel<T> {
  public:
@@ -37,10 +63,7 @@ class CropKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
     auto x_stride = framework::stride(x->dims());
     auto out_stride = framework::stride(out->dims());
-    auto offsets = context.Attr<std::vector<int>>("offsets");
-    PADDLE_ENFORCE_EQ(
-        x->dims().size(), static_cast<int64_t>(offsets.size()),
-        "Offsets size should be equal to dimension size of input tensor.");
+    auto offsets = GetOffsets(context);
     int64_t offset = 0;
     for (size_t i = 0; i < offsets.size(); ++i) {
       offset += (x_stride[i] * offsets[i]);
@@ -56,7 +79,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
   if (d_x != nullptr) {
     auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
     d_x->mutable_data<T>(context.GetPlace());
-    auto offsets = context.Attr<std::vector<int>>("offsets");
+    auto offsets = GetOffsets(context);
     Eigen::array<std::pair<int, int>, D>
paddings; for (size_t i = 0; i < D; ++i) { paddings[i].first = offsets[i]; From 9dee93384546f77d856a1d2906bd2e10320a0046 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Thu, 7 Jun 2018 19:20:33 +0800 Subject: [PATCH 37/54] Remove warning for rst file --- doc/v2/build_and_install/build_from_source_cn.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst index de7e9eb75c..6421c53082 100644 --- a/doc/v2/build_and_install/build_from_source_cn.rst +++ b/doc/v2/build_and_install/build_from_source_cn.rst @@ -106,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 学习 Docker 有多难? - 理解 Docker 并不难,大概花十分钟看一下 `这篇文章 `_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 + 理解 Docker 并不难,大概花十分钟看一下 `如何使用Docker `_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 - 我可以用 IDE 吗? @@ -123,7 +123,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 可以并行编译吗? - 是的。我们的 Docker image 运行一个 `Bash脚本 `_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 + 是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 `_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 - Docker 需要 sudo @@ -131,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 在 Windows/MacOS 上编译很慢 - Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `这个issue `_ 。 + Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 `_ 。 - 磁盘不够 - 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `这篇文章 `_ 来清理这些内容。 + 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `如何删除Docker Container `_ 来清理这些内容。 .. _compile_deps: @@ -195,7 +195,7 @@ BLAS PaddlePaddle支持 `MKL `_ 和 `OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, -还会下载MKL-DNN数学库,详细参考 `这里 `_ 。 +还会下载MKL-DNN数学库,详细参考 `mkldnn设计文档 `_ 。 如果关闭MKL,则会使用OpenBLAS作为BLAS库。 From 2dd66ef65e968ead7653402c0904c3fb49fb12ab Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Thu, 7 Jun 2018 19:28:40 +0800 Subject: [PATCH 38/54] Do not generate doc for op_role and op_attr_name --- python/paddle/fluid/framework.py | 7 +++++++ python/paddle/fluid/layers/layer_function_generator.py | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 33b5caa0ea..3d1dc82da7 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -361,6 +361,13 @@ class OpProtoHolder(object): raise ValueError("Operator \"%s\" has not been registered." 
% type) return self.op_proto_map[type] + @staticmethod + def generated_op_attr_names(): + return { + core.op_proto_and_checker_maker.kOpRoleAttrName(), + core.op_proto_and_checker_maker.kOpRoleVarAttrName() + } + class Operator(object): """ diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 295d1b7190..72cab81d41 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -75,7 +75,11 @@ def _generate_doc_string_(op_proto): buf.write(str(each_input.dispensable)) buf.write('\n') + skip_attrs = OpProtoHolder.generated_op_attr_names() + for each_attr in op_proto.attrs: + if each_attr.name in skip_attrs: + continue buf.write(' ') buf.write(each_attr.name) buf.write(' (') From 9c61409a18def0709dc362df00543eea624fc214 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 7 Jun 2018 20:25:33 +0800 Subject: [PATCH 39/54] Make crop op supporting taking offsets as one of its inputs --- paddle/fluid/operators/crop_op.cc | 16 +++++++++++++ paddle/fluid/operators/random_crop_op.cc | 1 - .../fluid/tests/unittests/test_crop_op.py | 23 ++++++++++++++++++- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index b5b31c7ce0..5b5a220cf9 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -48,6 +48,13 @@ class CropOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", y_dim); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } }; class CropOpMaker : public framework::OpProtoAndCheckerMaker { @@ -163,6 +170,15 @@ class CropOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("Out")) + ->type()), + ctx.device_context()); + } }; } // namespace operators diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index b14b559e31..d3a32b664b 100644 --- a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -20,7 +20,6 @@ class RandomCropOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py index 20cc3a643f..4016089c01 100644 --- a/python/paddle/fluid/tests/unittests/test_crop_op.py +++ b/python/paddle/fluid/tests/unittests/test_crop_op.py @@ -42,9 +42,9 @@ class TestCropOp(OpTest): def setUp(self): self.op_type = "crop" self.crop_by_input = False + self.offset_by_input = False self.attrs = {} self.initTestCase() - self.attrs['offsets'] = self.offsets if self.crop_by_input: self.inputs = { 'X': np.random.random(self.x_shape).astype("float32"), @@ -55,6 +55,10 @@ class TestCropOp(OpTest): self.inputs = { 'X': np.random.random(self.x_shape).astype("float32"), } + if self.offset_by_input: + self.inputs['Offsets'] = 
np.array(self.offsets).astype('int32') + else: + self.attrs['offsets'] = self.offsets self.outputs = { 'Out': crop(self.inputs['X'], self.offsets, self.crop_shape) } @@ -101,5 +105,22 @@ class TestCase4(TestCropOp): self.crop_by_input = True +class TestCase5(TestCropOp): + def initTestCase(self): + self.x_shape = (3, 4, 5) + self.crop_shape = [2, 2, 3] + self.offsets = [1, 0, 2] + self.offset_by_input = True + + +class TestCase6(TestCropOp): + def initTestCase(self): + self.x_shape = (10, 9, 14) + self.crop_shape = [3, 3, 5] + self.offsets = [3, 5, 4] + self.crop_by_input = True + self.offset_by_input = True + + if __name__ == '__main__': unittest.main() From e80c6b3c24eca373be3962c560e67be09fe6fe38 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 7 Jun 2018 22:02:26 +0800 Subject: [PATCH 40/54] Refine API doc string --- paddle/fluid/operators/activation_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index dd71c66a75..23327b77f8 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -24,12 +24,12 @@ namespace operators { : public ::paddle::framework::OpProtoAndCheckerMaker { \ public: \ void Make() override { \ - AddInput("X", "Input of " #OP_NAME "operator"); \ - AddOutput("Out", "Output of" #OP_NAME "operator"); \ + AddInput("X", "Input of " #OP_NAME " operator"); \ + AddOutput("Out", "Output of " #OP_NAME " operator"); \ AddAttr("use_mkldnn", \ "(bool, default false) Only used in mkldnn kernel") \ .SetDefault(false); \ - AddComment(#OP_COMMENT); \ + AddComment(OP_COMMENT); \ } \ } From 1239fce771a5d0907045fc285cb1966bdb61b180 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 8 Jun 2018 09:59:54 +0800 Subject: [PATCH 41/54] polish sparse update code --- .../fluid/operators/detail/request_handler_impl.cc | 3 +++ paddle/fluid/operators/detail/rpc_server.cc | 13 +++++++++++++ paddle/fluid/operators/detail/rpc_server.h | 6 ++++++ paddle/fluid/operators/listen_and_serv_op.cc | 1 + 4 files changed, 23 insertions(+) diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc index 145ee53107..b5ee3ab51e 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.cc +++ b/paddle/fluid/operators/detail/request_handler_impl.cc @@ -63,6 +63,9 @@ bool RequestSendHandler::Handle(const std::string& varname, PADDLE_THROW("sync: Can not find server side var"); return false; } + if (invar->IsType()) { + rpc_server_->RecordSparseVar(invar); + } } return true; diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/detail/rpc_server.cc index 448763372a..7feddbeca8 100644 --- a/paddle/fluid/operators/detail/rpc_server.cc +++ b/paddle/fluid/operators/detail/rpc_server.cc @@ -73,6 +73,19 @@ void RPCServer::ResetBarrierCounter() { t.second = 0; } } +void RPCServer::RecordSparseVar(framework::Variable* sparse_var) { + std::unique_lock lock(mutex_sparse_var_recorder_); + sparse_vars_.push_back(sparse_var); +} + +void RPCServer::ResetSparseVarsRecorder() { + VLOG(3) << "RPCServer reset sparse vars recorder."; + std::unique_lock lock(mutex_sparse_var_recorder_); + for (auto* var : sparse_vars_) { + var->GetMutable()->mutable_rows()->clear(); + } + sparse_vars_.clear(); +} void RPCServer::RegisterRPC(const std::string& rpc_name, RequestHandler* handler, int thread_num) { diff --git a/paddle/fluid/operators/detail/rpc_server.h 
From c7bbfb33ad816762f00e19f5076b1d6fed105b2d Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Fri, 8 Jun 2018 10:39:44 +0800
Subject: [PATCH 42/54] Fix a GPU bug

---
 paddle/fluid/operators/crop_op.h | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
index d8e9f086cc..91cfbbda73 100644
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -39,11 +39,16 @@ static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
     PADDLE_ENFORCE_EQ(
         rank, offsets_tensor->dims()[0],
         "Offsets size should be equal to dimension size of input tensor.");
-    const int* offsets_data = offsets_tensor->data<int>();
-    res.resize(rank);
-    for (size_t i = 0; i < rank; ++i) {
-      res[i] = offsets_data[i];
+    const int* offsets_data;
+    framework::Tensor cpu_tmp_tensor;
+    if (platform::is_cpu_place(offsets_tensor->place())) {
+      offsets_data = offsets_tensor->data<int>();
+    } else {
+      framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(),
+                                &cpu_tmp_tensor);
+      offsets_data = cpu_tmp_tensor.data<int>();
     }
+    res = std::vector<int>(offsets_data, offsets_data + rank);
   } else {
     res = ctx.Attr<std::vector<int>>("offsets");
     PADDLE_ENFORCE_EQ(

From d745840a6ef6f25cae38d9f9e361d6c6b2b96922 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Fri, 8 Jun 2018 17:16:59 +0800
Subject: [PATCH 43/54] fix a small compile error on Mac

---
 paddle/fluid/framework/details/fuse_vars_op_handle.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
index 32415c192f..018c9bff71 100644
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
@@ -42,7 +42,7 @@ void FuseVarsOpHandle::RunImpl() {
     out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
     s += numel;
   }
-  this->RunAndRecordEvent([this] {});
+  this->RunAndRecordEvent([] {});
 }
 
 std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }

From 145aaa4b491eb8b174650faa8d1f94754abf2945 Mon Sep 17 00:00:00 2001
From: Yan Chunwei
Date: Fri, 8 Jun 2018 17:17:20 +0800
Subject: [PATCH 44/54] loose threshold of TRT for CI in different model (#11305)

---
 paddle/fluid/inference/tensorrt/convert/ut_helper.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index 8613d5b1c1..236d169017 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -151,7 +151,8 @@ class TRTConvertValidation {
     // Compare two output
     ASSERT_FALSE(fluid_out.empty());
     for (size_t i = 0; i < fluid_out.size(); i++) {
-      EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6);
+      // Loose the threshold for CI in different machine model.
+      EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
     }
   }
 }
From 0fec9469f9f6519e6ee31015c6b57f7efbd9880c Mon Sep 17 00:00:00 2001
From: guochaorong
Date: Fri, 8 Jun 2018 17:54:47 +0800
Subject: [PATCH 45/54] fix some bugs introduced by unfreed memory

---
 paddle/fluid/operators/gather_test.cc             | 3 ++-
 paddle/fluid/operators/math/math_function_test.cc | 2 ++
 paddle/fluid/platform/device_tracer.cc            | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc
index 9c0561b016..f6b156eb30 100644
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
@@ -43,7 +43,8 @@ TEST(Gather, GatherData) {
   auto* cpu_place = new paddle::platform::CPUPlace();
   paddle::platform::CPUDeviceContext ctx(*cpu_place);
   paddle::operators::CPUGather<int>(ctx, *src, *index, output);
-
+  delete cpu_place;
+  cpu_place = NULL;
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc
index 3719a264e9..b545671b43 100644
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
@@ -77,6 +77,8 @@ TEST(math_function, gemm_trans_clbas) {
   paddle::platform::CPUDeviceContext context(*cpu_place);
   GetBlas<float>(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3,
                                input2_ptr + 3, 3, 1, input3_ptr + 1, 4);
+  delete cpu_place;
+  cpu_place = NULL;
 
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index c9e1063168..3870e69ba7 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -130,6 +130,8 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
   uint8_t *buf = reinterpret_cast<uint8_t *>(malloc(kBufSize + kAlignSize));
   *size = kBufSize;
   *buffer = ALIGN_BUFFER(buf, kAlignSize);
+  free(buf);
+  buf = NULL;
   *maxNumRecords = 0;
 }

From cde7db85b33a6025518e0367e20727daa8f4a6fb Mon Sep 17 00:00:00 2001
From: guochaorong
Date: Fri, 8 Jun 2018 19:01:13 +0800
Subject: [PATCH 46/54] fix bad code in python

---
 python/paddle/fluid/layers/metric.py                    | 4 ----
 .../fluid/tests/unittests/test_dynrnn_gradient_check.py | 3 ---
 2 files changed, 7 deletions(-)

diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py
index cab2eb5551..a1c64ce277 100644
--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
@@ -64,10 +64,6 @@ def auc(input, label, curve='ROC', num_thresholds=200):
     topk_indices = helper.create_tmp_variable(dtype="int64")
     topk_out, topk_indices = nn.topk(input, k=k)
     auc_out = helper.create_tmp_variable(dtype="float32")
-    if correct is None:
-        correct = helper.create_tmp_variable(dtype="int64")
-    if total is None:
-        total = helper.create_tmp_variable(dtype="int64")
     helper.append_op(
         type="accuracy",
         inputs={
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
index 2232939075..95af51f1b2 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
@@ -30,9 +30,6 @@ class Memory(object):
         assert val.dtype == self.ex.dtype
         self.cur = val
 
-    def ex(self):
-        return self.ex
-
     def next(self):
         self.ex = self.cur
         self.cur = None
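The Memory.ex() method deleted in PATCH 46 was unreachable: the class also assigns an instance attribute self.ex, and on attribute lookup the instance attribute shadows the identically named method. A stripped-down illustration (not the Paddle class):

class Memory(object):
    def __init__(self, value):
        self.ex = value   # instance attribute shadows the method below

    def ex(self):         # dead code: m.ex resolves to the attribute first
        return self.ex

m = Memory(3)
print(m.ex)               # prints 3, the attribute value
# m.ex() would raise TypeError: 'int' object is not callable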
From 56964946d4cca97f4341342d85159a2fd4b54496 Mon Sep 17 00:00:00 2001
From: Yancey1989
Date: Fri, 8 Jun 2018 19:41:48 +0800
Subject: [PATCH 47/54] polish sparse update logic

---
 .../fluid/operators/detail/request_handler_impl.cc | 12 ++++++++++--
 .../fluid/operators/detail/request_handler_impl.h  |  5 +++++
 paddle/fluid/operators/detail/rpc_server.cc        | 13 -------------
 paddle/fluid/operators/detail/rpc_server.h         |  5 -----
 paddle/fluid/operators/listen_and_serv_op.cc       |  4 +++-
 5 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc
index b5ee3ab51e..9473dce550 100644
--- a/paddle/fluid/operators/detail/request_handler_impl.cc
+++ b/paddle/fluid/operators/detail/request_handler_impl.cc
@@ -64,13 +64,21 @@ bool RequestSendHandler::Handle(const std::string& varname,
       PADDLE_THROW("sync: Can not find server side var");
       return false;
     }
     if (invar->IsType<framework::SelectedRows>()) {
-      rpc_server_->RecordSparseVar(invar);
+      std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
+      sparse_vars_.push_back(invar);
     }
   }
-
   return true;
 }
 
+void RequestSendHandler::ResetSparseVarRecorder() {
+  std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
+  for (auto* var : sparse_vars_) {
+    var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
+  }
+  sparse_vars_.clear();
+}
+
 bool RequestGetHandler::Handle(const std::string& varname,
                                framework::Scope* scope,
                                framework::Variable* invar,
diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/detail/request_handler_impl.h
index 8d0c62232b..443d951914 100644
--- a/paddle/fluid/operators/detail/request_handler_impl.h
+++ b/paddle/fluid/operators/detail/request_handler_impl.h
@@ -41,6 +41,11 @@ class RequestSendHandler final : public RequestHandler {
   virtual ~RequestSendHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar) override;
+  void ResetSparseVarRecorder();
+
+ private:
+  std::mutex mutex_sparse_vars_;
+  std::vector<framework::Variable*> sparse_vars_;
 };
 
 class RequestGetHandler final : public RequestHandler {
diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/detail/rpc_server.cc
index 7feddbeca8..448763372a 100644
--- a/paddle/fluid/operators/detail/rpc_server.cc
+++ b/paddle/fluid/operators/detail/rpc_server.cc
@@ -73,19 +73,6 @@ void RPCServer::ResetBarrierCounter() {
     t.second = 0;
   }
 }
-void RPCServer::RecordSparseVar(framework::Variable* sparse_var) {
-  std::unique_lock<std::mutex> lock(mutex_sparse_var_recorder_);
-  sparse_vars_.push_back(sparse_var);
-}
-
-void RPCServer::ResetSparseVarsRecorder() {
-  VLOG(3) << "RPCServer reset sparse vars recorder.";
-  std::unique_lock<std::mutex> lock(mutex_sparse_var_recorder_);
-  for (auto* var : sparse_vars_) {
-    var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
-  }
-  sparse_vars_.clear();
-}
 
 void RPCServer::RegisterRPC(const std::string& rpc_name,
                             RequestHandler* handler, int thread_num) {
diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/detail/rpc_server.h
index 94a21ef8d0..f809c13c72 100644
--- a/paddle/fluid/operators/detail/rpc_server.h
+++ b/paddle/fluid/operators/detail/rpc_server.h
@@ -62,8 +62,6 @@ class RPCServer {
   void IncreaseBatchBarrier(const std::string rpc_name);
 
   void ResetBarrierCounter();
-  void RecordSparseVar(framework::Variable* sparse_var);
-  void ResetSparseVarsRecorder();
 
  protected:
   virtual void ShutDownImpl() = 0;
@@ -77,9 +75,6 @@ class RPCServer {
   std::atomic<int> cur_cond_;
   std::condition_variable rpc_cond_;
 
-  std::vector<framework::Variable*> sparse_vars_;
-  std::mutex mutex_sparse_var_recorder_;
-
  protected:
   std::string bind_address_;
   std::atomic<bool> exit_flag_;
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index ee7b01a54c..66d31c8895 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -146,7 +146,9 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
     rpc_service_->SetCond(detail::kRequestGet);
     rpc_service_->WaitBarrier(detail::kRequestGet);
     rpc_service_->ResetBarrierCounter();
-    rpc_service_->ResetSparseVarsRecorder();
+    // reset received sparse vars to avoid reuse it in the next mini-batch
+    dynamic_cast<detail::RequestSendHandler*>(request_send_handler_.get())
+        ->ResetSparseVarRecorder();
   }  // while(true)
 }
From 5c8397a88fe6b062be0c0725bbd14a2c8d4fc2e9 Mon Sep 17 00:00:00 2001
From: tangwei12
Date: Fri, 8 Jun 2018 19:49:10 +0800
Subject: [PATCH 48/54] remove chief in test

---
 python/paddle/fluid/tests/unittests/test_checkpoint.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py
index cf70dfd448..e22400a045 100644
--- a/python/paddle/fluid/tests/unittests/test_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py
@@ -66,9 +66,9 @@ class TestCheckpoint(unittest.TestCase):
         exe = fluid.Executor(self.place)
 
         for i in xrange(10):
-            fluid.io.save_checkpoint(
-                exe, config.checkpoint_dir, self.trainer_id, self.chief,
-                trainer_args, program, config.max_num_checkpoints)
+            fluid.io.save_checkpoint(exe, config.checkpoint_dir,
+                                     self.trainer_id, trainer_args, program,
+                                     config.max_num_checkpoints)
 
 
 if __name__ == '__main__':

From 310598f99bf130c62fcd3ec9c64bf986136ddbe5 Mon Sep 17 00:00:00 2001
From: guochaorong <32069604+guochaorong@users.noreply.github.com>
Date: Fri, 8 Jun 2018 21:43:55 +0800
Subject: [PATCH 49/54] Update device_tracer.cc

---
 paddle/fluid/platform/device_tracer.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index b79768eba2..1a9be044e0 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -130,8 +130,6 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
   uint8_t *buf = reinterpret_cast<uint8_t *>(malloc(kBufSize + kAlignSize));
   *size = kBufSize;
   *buffer = ALIGN_BUFFER(buf, kAlignSize);
-  free(buf);
-  buf = NULL;
   *maxNumRecords = 0;
 }
From 52e2eb65b9c0d773abc28d520f318a8def3d6d11 Mon Sep 17 00:00:00 2001
From: Siddharth Goyal
Date: Fri, 8 Jun 2018 10:40:37 -0700
Subject: [PATCH 50/54] Fix function in fit-a-line with new API (#11020)

---
 .../tests/book/high-level-api/fit_a_line/test_fit_a_line.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
index b3117cf2e5..ad28c9eff5 100644
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -38,7 +38,7 @@ def inference_program():
     return y_predict
 
 
-def linear():
+def train_program():
     y = fluid.layers.data(name='y', shape=[1], dtype='float32')
     y_predict = inference_program()
 
@@ -104,7 +104,7 @@ def main(use_cuda):
     # Directory for saving the trained model
     params_dirname = "fit_a_line.inference.model"
 
-    train(use_cuda, linear, params_dirname)
+    train(use_cuda, train_program, params_dirname)
     infer(use_cuda, inference_program, params_dirname)

From 637827a5bc80d6e0a17466e44b087f91601539cb Mon Sep 17 00:00:00 2001
From: Jeff Wang
Date: Fri, 8 Jun 2018 15:05:25 -0700
Subject: [PATCH 51/54] Use for_test=True in the Fluid Trainer to clone the test program (#11323)

* Use for_test=True in the Fluid Trainer to clone the test program

* fix typo

* Should do the samething to the inferencer
---
 python/paddle/fluid/inferencer.py | 2 ++
 python/paddle/fluid/trainer.py    | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py
index 9f242cf29a..6baac00905 100644
--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
@@ -56,6 +56,8 @@ class Inferencer(object):
         else:
             self.exe = executor.Executor(self.place)
 
+        self.inference_program = self.inference_program.clone(for_test=True)
+
     def infer(self, inputs, return_numpy=True):
         """
         :param inputs: a map of {"input_name": input_var} that will be feed into the inference program
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index cdacb41986..ac313b237e 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -115,9 +115,9 @@ class Trainer(object):
             program_func_outs = train_func()
             self.train_func_outputs = program_func_outs if isinstance(
                 program_func_outs, list) else [program_func_outs]
-            self.test_program = self.train_program.clone()
+            self.test_program = self.train_program.clone(for_test=True)
 
-            # The fisrt element of program_func_outs is loss.
+            # The first element of program_func_outs is loss.
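The pattern PATCH 51 applies is worth spelling out: the evaluation program must be derived from the training program with clone(for_test=True), so that operators such as dropout and batch_norm switch to inference behavior, and the clone must happen before optimize ops are appended. A minimal sketch assuming this era's fluid API; the network itself is arbitrary:

import paddle.fluid as fluid

main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
    img = fluid.layers.data(name='img', shape=[784], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    hidden = fluid.layers.fc(input=img, size=128, act='relu')
    hidden = fluid.layers.dropout(hidden, dropout_prob=0.5)
    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    loss = fluid.layers.mean(cost)
    # Clone for evaluation before minimize() appends optimize ops.
    test_program = main.clone(for_test=True)
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)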
             loss = self.train_func_outputs[0]
 
             optimizer = optimizer_func()

From bf03a2094bce7c542dd64c3a29f445e04c68640b Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Sun, 10 Jun 2018 13:24:38 +0800
Subject: [PATCH 52/54] fix distribute_transpiler

---
 python/paddle/fluid/transpiler/distribute_transpiler.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 27992df462..c7ab300e0f 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -177,6 +177,7 @@ class DistributeTranspiler:
                 dtype=table_grad_var.dtype)
             for index in range(len(self.pserver_endpoints))
         ]
+        return param_list, grad_list
 
     def _init_splited_vars(self, slice_var_up):
         # update these mappings for further transpile:
@@ -199,8 +200,8 @@ class DistributeTranspiler:
                 grad_list.append(g)
                 param_grad_set.add(g.name)
 
-        self._update_dist_lookup_table_vars(param_list, grad_list,
-                                            self.params_grads)
+        param_list, grad_list = self._update_dist_lookup_table_vars(
+            param_list, grad_list, self.params_grads)
 
         if slice_var_up:
             # when we slice var up into blocks, we will slice the var according to

From 062d5a56b401162ebd0232e42135c87177ad68ec Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Mon, 11 Jun 2018 09:42:54 +0800
Subject: [PATCH 53/54] Add comments to a singleton. (#11333)

---
 paddle/fluid/framework/data_type.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
index b6b93cf422..60382faffb 100644
--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -28,6 +28,9 @@ struct DataTypeMap {
 };
 
 static DataTypeMap* InitDataTypeMap();
+// C++11 removes the need for manual locking. Concurrent execution shall wait if
+// a static local variable is already being initialized.
+// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
 static DataTypeMap& gDataTypeMap() {
   static DataTypeMap* g_data_type_map_ = InitDataTypeMap();
   return *g_data_type_map_;

From 59e10922b41f141ae25b5266275f10921a30d92a Mon Sep 17 00:00:00 2001
From: qingqing01
Date: Mon, 11 Jun 2018 10:07:54 +0800
Subject: [PATCH 54/54] Expose maxout Python API. (#11278)

* Expose maxout API.

* Fix code style.
---
 python/paddle/fluid/layers/ops.py                  | 1 +
 python/paddle/fluid/tests/unittests/test_layers.py | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 69cfde852d..3260f81e9e 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -73,6 +73,7 @@ __all__ = [
     'sum',
     'polygon_box_transform',
     'shape',
+    'maxout',
 ] + __activations__
 
 for _OP in set(__all__):
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 621a450fa6..8b0ebe3cf5 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -387,6 +387,14 @@ class TestBook(unittest.TestCase):
         self.assertIsNotNone(output)
         print(str(program))
 
+    def test_maxout(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name='x', shape=[8, 6, 6], dtype="float32")
+            output = layers.maxout(x=data, groups=2)
+            self.assertIsNotNone(output)
+            print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
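With PATCH 54 applied, maxout is callable from the layers namespace. A minimal usage sketch mirroring the new unit test: with groups=2, an element-wise max is taken over every 2 consecutive input channels, so the 8 input channels become 4 output channels:

import paddle.fluid as fluid

data = fluid.layers.data(name='x', shape=[8, 6, 6], dtype='float32')
out = fluid.layers.maxout(x=data, groups=2)  # output shape: [-1, 4, 6, 6]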