From 3909108cae29ee035785e7e2fa44f1c7c8bbd9ea Mon Sep 17 00:00:00 2001
From: zhhsplendid <zhhsplendid@gmail.com>
Date: Tue, 26 Mar 2019 01:59:25 +0000
Subject: [PATCH 01/19] Add SpectralNormGradOpDescMaker

Use SpectralNormGradOpDescMaker instead of DefaultGradOpDescMaker to avoid
registering useless variables to improve GPU usage.
test=develop
---
 paddle/fluid/operators/spectral_norm_op.cc | 27 +++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc
index 357d055756..04f659a465 100644
--- a/paddle/fluid/operators/spectral_norm_op.cc
+++ b/paddle/fluid/operators/spectral_norm_op.cc
@@ -10,6 +10,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/spectral_norm_op.h"
+
+#include <memory>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -156,6 +159,28 @@ class SpectralNormOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+class SpectralNormGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("spectral_norm_grad");
+
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("Weight", Input("Weight"));
+    op->SetInput("U", Input("U"));
+    op->SetInput("V", Input("V"));
+
+    op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight"));
+
+    op->SetAttrMap(Attrs());
+
+    return op;
+  }
+};
+
 class SpectralNormOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -185,7 +210,7 @@ class SpectralNormOpGrad : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(spectral_norm, ops::SpectralNormOp, ops::SpectralNormOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::SpectralNormGradOpDescMaker);
 REGISTER_OPERATOR(spectral_norm_grad, ops::SpectralNormOpGrad);
 REGISTER_OP_CPU_KERNEL(
     spectral_norm,

From 183bacebe3d822776abdaa93a7f1765dcc0ade54 Mon Sep 17 00:00:00 2001
From: Zhen Wang <wangzhen31@baidu.com>
Date: Wed, 27 Mar 2019 16:46:39 +0800
Subject: [PATCH 02/19] clean codes and fix some bugs. test=develop

---
 .../slim/quantization/quantization_pass.py   | 120 ++++++++++--------
 .../quantization/quantization_strategy.py    |  16 ++-
 .../slim/tests/quantization/compress.yaml    |   2 +
 .../slim/tests/test_quantization_pass.py     |   3 -
 python/paddle/fluid/framework.py             |  80 ++++--------
 5 files changed, 103 insertions(+), 118 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index ab3bd8bd18..3809e32794 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -26,6 +26,17 @@ __all__ = [
 ]
 
 
+def _init_var_node(var_node, value, scope, place):
+    assert isinstance(value,
+                      np.ndarray), 'The type of value should be numpy array.'
+    assert scope is not None, \
+        'The scope cannot be set None.'
+    assert place is not None, \
+        'The place cannot be set None.'
+    tensor = scope.var(var_node.name()).get_tensor()
+    tensor.set(value, place)
+
+
 class QuantizationTransformPass(object):
     def __init__(self,
                  scope=None,
@@ -88,14 +99,14 @@ class QuantizationTransformPass(object):
         assert activation_quantize_type != 'channel_wise_abs_max', "The activation quantization type does not support 'channel_wise_abs_max'."
         if activation_quantize_type not in quant_type:
             raise ValueError(
-                "Unknown activation_quantize_type : '%s'. It can only be ",
-                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
-                str(activation_quantize_type))
+                "Unknown activation_quantize_type : '%s'. It can only be "
+                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'." %
+                (str(activation_quantize_type)))
         if weight_quantize_type not in quant_type:
             raise ValueError(
-                "Unknown weight_quantize_type: '%s'. It can only be ",
-                "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
-                str(weight_quantize_type))
+                "Unknown weight_quantize_type: '%s'. It can only be "
+                "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'."
+                % (str(weight_quantize_type)))
 
         self._activation_quantize_type = activation_quantize_type
         self._weight_quantize_type = weight_quantize_type
@@ -121,8 +132,6 @@ class QuantizationTransformPass(object):
         """
         assert isinstance(graph,
                           IrGraph), 'graph must be the instance of IrGraph.'
-        #sequential_execution = core.get_pass('sequential_execution_pass')
-        #sequential_execution.apply(graph.graph)
         self._is_test = graph.is_test()
         # marked the variable which has been dequantized.
         dequantized_vars = collections.OrderedDict()
@@ -203,9 +212,12 @@ class QuantizationTransformPass(object):
                 var_type=core.VarDesc.VarType.LOD_TENSOR,
                 shape=[1],
                 var_dtype=core.VarDesc.VarType.INT64)
-            self._init_var_node(
-                global_step_in, np.zeros(
-                    [1], dtype='int64'))
+            _init_var_node(
+                global_step_in,
+                np.zeros(
+                    [1], dtype='int64'),
+                self._scope,
+                self._place)
             global_step_out = graph.create_var_node_from_desc(
                 global_step_in.var())
             # The attribute of `op_role` is needed by ParallelExecutor.
@@ -284,7 +296,12 @@ class QuantizationTransformPass(object):
             var_dtype=var_node.dtype())
         data_type = 'float64' if var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
-        self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type))
+        _init_var_node(
+            scale_in_node,
+            np.array(
+                [0.001], dtype=data_type),
+            self._scope,
+            self._place)
 
         scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
         inputs = {'X': var_node, 'InScale': scale_in_node}
@@ -299,9 +316,13 @@ class QuantizationTransformPass(object):
                 var_dtype=var_node.dtype())
             data_type = 'float64' if var_node.dtype(
             ) == core.VarDesc.VarType.FP64 else 'float32'
-            self._init_var_node(
-                scales_node, np.zeros(
-                    [self._window_size], dtype=data_type))
+            _init_var_node(
+                scales_node,
+                np.zeros(
+                    [self._window_size], dtype=data_type),
+                self._scope,
+                self._place)
+
             inputs['Iter'] = self._global_step
             outputs['OutScales'] = scales_node
         attrs = {
@@ -343,7 +364,12 @@ class QuantizationTransformPass(object):
             var_dtype=var_node.dtype())
         data_type = 'float64' if var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
-        self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type))
+        _init_var_node(
+            scale_in_node,
+            np.array(
+                [0.001], dtype=data_type),
+            self._scope,
+            self._place)
 
         scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
         ins = {'X': var_node, 'InScale': scale_in_node}
@@ -356,13 +382,23 @@ class QuantizationTransformPass(object):
                 shape=[1])
             data_type = 'float64' if var_node.dtype(
             ) == core.VarDesc.VarType.FP64 else 'float32'
-            self._init_var_node(scale_in_node, np.ones([1], dtype=data_type))
+            _init_var_node(
+                scale_in_node,
+                np.ones(
+                    [1], dtype=data_type),
+                self._scope,
+                self._place)
             accum_in_node = graph.create_persistable_node(
                 name=unique_name.generate('accum'),
                 var_type=core.VarDesc.VarType.LOD_TENSOR,
                 var_dtype=var_node.dtype(),
                 shape=[1])
-            self._init_var_node(accum_in_node, np.ones([1], dtype=data_type))
+            _init_var_node(
+                accum_in_node,
+                np.ones(
+                    [1], dtype=data_type),
+                self._scope,
+                self._place)
             state_out_node = graph.create_var_node_from_desc(state_in_node.var(
             ))
             accum_out_node = graph.create_var_node_from_desc(accum_in_node.var(
             ))
@@ -482,16 +518,6 @@ class QuantizationTransformPass(object):
         graph.link_to(dequant_op_node, dequant_var_node)
         return dequant_var_node
 
-    def _init_var_node(self, var_node, value):
-        assert isinstance(
-            value, np.ndarray), 'The type of value should be numpy array.'
-        assert self._scope is not None, \
-            'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
-        assert self._place is not None, \
-            'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
-        tensor = self._scope.var(var_node.name()).get_tensor()
-        tensor.set(value, self._place)
-
     def _quantized_var_name(self, var_name):
         """
         Return quantized variable name for the input `var_name`.
@@ -594,8 +620,8 @@ class QuantizationFreezePass(object):
                                                       self._weight_bits)
                     self._restore_var(input_arg_name, quantized_param_v)
                 else:
-                    scale_v = self._to_node(op_node.outputs,
-                                            op_node.output('OutScale')[0])
+                    scale_v = graph._find_node_by_name(
+                        op_node.outputs, op_node.output('OutScale')[0])
                     self._var_scale_map[input_arg_name] = scale_v
 
         ops = graph.all_op_nodes()
@@ -627,8 +653,8 @@ class QuantizationFreezePass(object):
         return graph
 
     def _remove_fake_quant_and_dequant_op(self, graph, op_node):
-        k = self._to_node(op_node.outputs, op_node.output('Out')[0])
-        v = self._to_node(op_node.inputs, op_node.input('X')[0])
+        k = graph._find_node_by_name(op_node.outputs, op_node.output('Out')[0])
+        v = graph._find_node_by_name(op_node.inputs, op_node.input('X')[0])
         if v.node not in self._op_input_rename_map:
             self._op_input_rename_map[k.node] = v
         else:
@@ -663,8 +689,8 @@ class QuantizationFreezePass(object):
             raise ValueError("Only support one output, but op %s has"
                              " more than one output." % (op_node.name()))
 
-        output_var_node = self._to_node(op_node.outputs,
-                                        op_node.output_arg_names()[0])
+        output_var_node = graph._find_node_by_name(
+            op_node.outputs, op_node.output_arg_names()[0])
         weight_scale_node = graph.create_persistable_node(
             name=unique_name.generate('channel_scale'),
             var_type=core.VarDesc.VarType.LOD_TENSOR,
@@ -672,7 +698,9 @@ class QuantizationFreezePass(object):
             var_dtype=output_var_node.dtype())
         data_type = 'float64' if output_var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
-        self._init_var_node(weight_scale_node, channel_scale.astype(data_type))
+        _init_var_node(weight_scale_node,
+                       channel_scale.astype(data_type), self._scope,
+                       self._place)
         dequant_var_node = graph.create_var_node(
             name=self._dequantized_var_name(output_var_node.name()),
             var_type=output_var_node.type(),
@@ -724,8 +752,8 @@ class QuantizationFreezePass(object):
             raise ValueError("Only support one output, but op %s has"
                              " more than one output." % (op_node.name()))
 
-        output_var_node = self._to_node(op_node.outputs,
-                                        op_node.output_arg_names()[0])
+        output_var_node = graph._find_node_by_name(
+            op_node.outputs, op_node.output_arg_names()[0])
         dequant_var_node = graph.create_var_node(
             name=self._dequantized_var_name(output_var_node.name()),
             var_type=output_var_node.type(),
@@ -746,24 +774,6 @@ class QuantizationFreezePass(object):
             self._op_output_rename_map[output_var_node.node] = dequant_var_node
         return dequant_var_node
 
-    def _init_var_node(self, var_node, value):
-        assert isinstance(
-            value, np.ndarray), 'The type of value should be numpy array.'
-        assert self._scope is not None, \
-            'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
-        assert self._place is not None, \
-            'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
-        tensor = self._scope.var(var_node.name()).get_tensor()
-        tensor.set(value, self._place)
-
-    def _to_node(self, nodes, node_name):
-        target_node = None
-        for n in nodes:
-            if n.name() == node_name:
-                target_node = n
-        assert target_node is not None, "Cannot find the target node in the giving set."
- return target_node - def _load_var(self, name): return np.array(self._scope.find_var(name).get_tensor()) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py index 6812b4c633..da3510de39 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py @@ -45,13 +45,14 @@ class QuantizationStrategy(Strategy): activation_bits=8, weight_bits=8, activation_quantize_type='abs_max', + weight_quantize_type='abs_max', save_in_nodes=None, save_out_nodes=None): """ Args: start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0 end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0 - float_model_save_path(str): The path to save model with float weights. + float_model_save_path(str): The path to save model with float weights. None means it doesn't save float model. defalut: None. mobile_model_save_path(str): The path to save model for paddle-mobile execution. None means it doesn't save mobile model. defalut: None. @@ -66,9 +67,11 @@ class QuantizationStrategy(Strategy): dynamically each step in both training and testing period. If use 'range_abs_max', a static quantization scale will be calculated during training and used in inference. - save_in_nodes(list<str>): A list of variable names used to prune graph + weight_quantize_type (str): quantization type for weights, support 'abs_max' and 'channel_wise_abs_max'. + The 'range_abs_max' usually is not used for weight, since weights are fixed once the model is well trained. + save_in_nodes(list<str>): A list of variable names used to prune graph for saving inference model. - save_out_nodes(list<str>): A list of variable names used to prune graph + save_out_nodes(list<str>): A list of variable names used to prune graph for saving inference model. 
""" @@ -81,6 +84,7 @@ class QuantizationStrategy(Strategy): self.activation_bits = activation_bits self.weight_bits = weight_bits self.activation_quantize_type = activation_quantize_type + self.weight_quantize_type = weight_quantize_type self.save_out_nodes = save_out_nodes self.save_in_nodes = save_in_nodes @@ -100,7 +104,8 @@ class QuantizationStrategy(Strategy): place=context.place, weight_bits=self.weight_bits, activation_bits=self.activation_bits, - activation_quantize_type=self.activation_quantize_type) + activation_quantize_type=self.activation_quantize_type, + weight_quantize_type=self.weight_quantize_type) transform_pass.apply(train_ir_graph) transform_pass.apply(test_ir_graph) @@ -134,7 +139,8 @@ class QuantizationStrategy(Strategy): scope=context.scope, place=context.place, weight_bits=self.weight_bits, - activation_bits=self.activation_bits) + activation_bits=self.activation_bits, + weight_quantize_type=self.weight_quantize_type) freeze_pass.apply(test_ir_graph) # for other strategies diff --git a/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml index f29eb53f88..a3a5a724fb 100644 --- a/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml +++ b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml @@ -35,6 +35,8 @@ strategies: start_epoch: 0 end_epoch: 0 float_model_save_path: './output/float' + mobile_model_save_path: './output/mobile' + int8_model_save_path: './output/int8' weight_bits: 8 activation_bits: 8 weight_quantize_type: 'abs_max' diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py index c7feca0b82..e896f8bb42 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py @@ -256,8 +256,6 @@ class TestQuantizationFreezePass(unittest.TestCase): place=place, activation_quantize_type=activation_quant_type, weight_quantize_type=weight_quant_type) - #transform_pass = QuantizationTransformPass( - # scope=scope, place=place, activation_quantize_type=activation_quant_type) transform_pass.apply(main_graph) transform_pass.apply(test_graph) dev_name = '_gpu_' if use_cuda else '_cpu_' @@ -315,7 +313,6 @@ class TestQuantizationFreezePass(unittest.TestCase): # Freeze graph for inference, but the weight of fc/conv is still float type. freeze_pass = QuantizationFreezePass( scope=scope, place=place, weight_quantize_type=weight_quant_type) - #freeze_pass = QuantizationFreezePass(scope=scope, place=place) freeze_pass.apply(test_graph) if not for_ci: marked_nodes = set() diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 5ac2b50a99..a209f389f3 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -104,14 +104,14 @@ def cuda_places(device_ids=None): :code:`FLAGS_selected_gpus=0,1,2`, the returned list would be [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)]. If :code:`FLAGS_selected_gpus` is not set, all visible - gpu places would be returned. + gpu places would be returned. If :code:`device_ids` is not None, it should be the device - ids of gpus. For example, if :code:`device_ids=[0,1,2]`, - the returned list would be + ids of gpus. For example, if :code:`device_ids=[0,1,2]`, + the returned list would be [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)]. 
- - Args: + + Args: device_ids (None|list(int)|tuple(int)): gpu device id list. Returns: @@ -133,11 +133,11 @@ def cuda_places(device_ids=None): def cpu_places(device_count=None): ''' Create a list of :code:`fluid.CPUPlace` objects. - + If :code:`device_count` is None, the device count would - be determined by environment variable :code:`CPU_NUM`. + be determined by environment variable :code:`CPU_NUM`. If :code:`CPU_NUM` is not set, the device count would - be determined by :code:`multiprocessing.cpu_count()`. + be determined by :code:`multiprocessing.cpu_count()`. Args: device_count (None|int): device number. @@ -155,9 +155,9 @@ def cuda_pinned_places(device_count=None): Create a list of :code:`fluid.CUDAPinnedPlace` objects. If :code:`device_count` is None, the device count would - be determined by environment variable :code:`CPU_NUM`. + be determined by environment variable :code:`CPU_NUM`. If :code:`CPU_NUM` is not set, the device count would - be determined by :code:`multiprocessing.cpu_count()`. + be determined by :code:`multiprocessing.cpu_count()`. Args: device_count (None|int): device number. @@ -2164,40 +2164,6 @@ class IrGraph(object): """ return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()} - def _find_var_node(self, key): - """ - Get a variable node by the `key` from this graph. The key - can be a node name or a node id. - - WARNS: - There are some nodes may have the same name. So, be - cautious about using this method when you find the - target var node by its name. - - Args: - key(str|int): The str type denotes that the target variable node's name. - And the int type denotes that the target variable node's id. - - Raises: - ValueError: If this graph doesn't have a variable with the giving name or id. - - Returns: - IrVarNode: the variable node with the giving name or id. - """ - target_var_node = None - var_nodes = self.all_var_nodes() - if isinstance(key, six.string_types): - for var_node in var_nodes: - if var_node.name() == key: - target_var_node = var_node - elif isinstance(key, int): - for var_node in var_nodes: - if var_node.id() == key: - target_var_node = var_node - if target_var_node is None: - raise ValueError("var_node %s not in this graph" % key) - return target_var_node - def create_persistable_node(self, name, var_type, shape, var_dtype): """ Create a persistable variable node in the graph. In IrGraph, @@ -2342,14 +2308,6 @@ class IrGraph(object): core.graph_safe_remove_nodes(self.graph, original_nodes) def resolve_hazard(self): - def _to_node(nodes, node_name): - target_node = None - for n in nodes: - if n.name() == node_name: - target_node = n - assert target_node is not None, "Cannot find the target node in the giving set." 
- return target_node - ordered_nodes = core.topology_sort(self.graph) var_nodes = dict() for node in ordered_nodes: @@ -2357,16 +2315,17 @@ class IrGraph(object): for each_var_name in node.op().input_arg_names(): if each_var_name not in var_nodes: var_nodes[each_var_name] = [ - _to_node(node.inputs, each_var_name) + self._find_node_by_name(node.inputs, each_var_name) ] for each_var_name in node.op().output_arg_names(): if each_var_name not in var_nodes: var_nodes[each_var_name] = [ - _to_node(node.outputs, each_var_name) + self._find_node_by_name(node.outputs, each_var_name) ] else: var_nodes[each_var_name].append( - _to_node(node.outputs, each_var_name)) + self._find_node_by_name(node.outputs, + each_var_name)) self.graph.resolve_hazard(var_nodes) def has_circle(self): @@ -2479,6 +2438,17 @@ class IrGraph(object): program = Program._construct_from_desc(desc) return program + def _find_node_by_name(self, nodes, node_name): + """ + Find a node in the giving nodes set by the name. + """ + target_node = None + for n in nodes: + if n.name() == node_name: + target_node = n + assert target_node is not None, "Cannot find the target node in the giving set." + return target_node + def _update_desc_attr(self, desc, name, val): """ Update the value of desc's attribute by attribute's name. From 6b854f3e1f4412b5726197bc336754f163148cd8 Mon Sep 17 00:00:00 2001 From: Zhen Wang <wangzhen31@baidu.com> Date: Wed, 27 Mar 2019 17:19:10 +0800 Subject: [PATCH 03/19] fix the save_in_nodes bug. --- .../fluid/contrib/slim/quantization/quantization_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py index da3510de39..aa50891121 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py @@ -158,7 +158,7 @@ class QuantizationStrategy(Strategy): ] if self.save_in_nodes == None: - in_vars = list(context.eval_graph.out_nodes.values()) + in_vars = list(context.eval_graph.in_nodes.values()) else: in_vars = self.save_in_nodes From 63651c1968ac7f5694e8bce3f23be465ad57a895 Mon Sep 17 00:00:00 2001 From: sneaxiy <sneaxiy@126.com> Date: Wed, 27 Mar 2019 10:47:29 +0000 Subject: [PATCH 04/19] fix grad desc maker test=develop --- .../framework/details/reference_count_pass.cc | 1 + paddle/fluid/operators/bpr_loss_op.cc | 20 +- .../operators/controlflow/CMakeLists.txt | 2 +- .../fluid/operators/controlflow/while_op.cc | 21 +- .../operators/controlflow/while_op_helper.cc | 291 ------------------ .../operators/controlflow/while_op_helper.h | 43 --- .../detection/roi_perspective_transform_op.cc | 21 +- .../gaussian_random_batch_size_like_op.cc | 10 +- paddle/fluid/operators/im2sequence_op.cc | 19 +- paddle/fluid/operators/interpolate_op.cc | 34 +- paddle/fluid/operators/l1_norm_op.cc | 19 +- paddle/fluid/operators/label_smooth_op.cc | 24 +- paddle/fluid/operators/linear_chain_crf_op.cc | 39 ++- paddle/fluid/operators/log_loss_op.cc | 20 +- paddle/fluid/operators/lstm_op.cc | 41 ++- paddle/fluid/operators/margin_rank_loss_op.cc | 22 +- paddle/fluid/operators/mean_op.cc | 8 +- paddle/fluid/operators/multiplex_op.cc | 34 +- paddle/fluid/operators/multiplex_op.cu | 11 +- paddle/fluid/operators/multiplex_op.h | 11 +- paddle/fluid/operators/pad_op.cc | 21 +- paddle/fluid/operators/psroi_pool_op.cc | 20 +- paddle/fluid/operators/rank_loss_op.cc | 20 ++ 
paddle/fluid/operators/recurrent_op.cc | 52 ++-- paddle/fluid/operators/roi_align_op.cc | 20 +- paddle/fluid/operators/roi_pool_op.cc | 21 +- paddle/fluid/operators/scatter_op.cc | 34 +- paddle/fluid/operators/shuffle_channel_op.cc | 20 +- 28 files changed, 473 insertions(+), 426 deletions(-) delete mode 100644 paddle/fluid/operators/controlflow/while_op_helper.cc delete mode 100644 paddle/fluid/operators/controlflow/while_op_helper.h diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 0c3d8d5cae..c218e55b70 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -335,6 +335,7 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( var_name); ref_cnts[i].emplace(var_name, result.size()); last_live_ops_of_vars[i].emplace(var_name, std::move(result)); + break; } // Seldomly, all preceding trying failed. diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index f349c51d8a..b2dbaecfcf 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/bpr_loss_op.h" +#include <memory> namespace paddle { namespace operators { @@ -127,6 +128,23 @@ neural networks>(https://arxiv.org/abs/1511.06939) )DOC"); } }; + +class BprLossGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("bpr_loss_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Label", Input("Label")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; } // namespace operators } // namespace paddle @@ -134,7 +152,7 @@ namespace ops = paddle::operators; using CPUCtx = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::BprLossGradDescMaker); REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp); REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel<CPUCtx, float>, ops::BprLossOpKernel<CPUCtx, double>); diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 7aa1c44eaa..4782e9d5ff 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -1,5 +1,5 @@ include(operators) register_operators(DEPS naive_executor) -cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator) +cc_library(loop_op_helper SRCS loop_op_helper.cc DEPS operator) file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index deb8ec3bb2..58fe354958 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -18,28 +18,21 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_type.h" -#include 
"paddle/fluid/operators/controlflow/while_op_helper.h" +#include "paddle/fluid/operators/controlflow/loop_op_helper.h" #include "paddle/fluid/operators/detail/safe_ref.h" namespace paddle { namespace operators { +static constexpr char kCondition[] = "Condition"; +static constexpr char kStepScopes[] = "StepScopes"; +static constexpr char kX[] = "X"; +static constexpr char kXGRAD[] = "X@GRAD"; +static constexpr char kOutputs[] = "Out"; + using StepScopeVar = std::vector<framework::Scope *>; using LoDTensor = framework::LoDTensor; -namespace { // NOLINT -static std::string GetSkipEagerDeletionVarsDebugString( - const std::vector<std::string> &vars) { - std::string str = "Skip " + std::to_string(vars.size()) + - " var(s) in eager deletion mode: "; - for (auto &var : vars) { - str.append(var); - str.push_back(' '); - } - return str; -} -} // NOLINT - class WhileOp : public framework::OperatorBase { public: WhileOp(const std::string &type, const framework::VariableNameMap &inputs, diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc deleted file mode 100644 index 2cbd94a061..0000000000 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ /dev/null @@ -1,291 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/controlflow/while_op_helper.h" -#include <string> -#include <unordered_set> -#include <utility> -#include "paddle/fluid/framework/program_desc.h" - -namespace paddle { -namespace operators { - -// OpVariant is a wrapper class of OpDesc and OperatorBase -// So that API would be the same. 
-class OpVariant { - struct InputsVisitor - : public boost::static_visitor<const framework::VariableNameMap *> { - template <typename OpType> - const framework::VariableNameMap *operator()(const OpType *op) const { - return &(op->Inputs()); - } - }; - - struct OutputsVisitor - : public boost::static_visitor<const framework::VariableNameMap *> { - template <typename OpType> - const framework::VariableNameMap *operator()(const OpType *op) const { - return &(op->Outputs()); - } - }; - - struct AttributeMapVisitor - : public boost::static_visitor<const framework::AttributeMap *> { - const framework::AttributeMap *operator()( - const framework::OpDesc *op) const { - return &(op->GetAttrMap()); - } - - const framework::AttributeMap *operator()( - const framework::OperatorBase *op) const { - return &(op->Attrs()); - } - }; - - struct RawPointerVisitor : public boost::static_visitor<const void *> { - template <typename OpType> - const void *operator()(const OpType *op) const { - return op; - } - }; - - public: - OpVariant(const framework::OperatorBase *op) : op_(op) {} // NOLINT - - OpVariant(const framework::OpDesc *op) : op_(op) {} // NOLINT - - const framework::VariableNameMap &Inputs() const { - return *boost::apply_visitor(InputsVisitor(), op_); - } - - const framework::VariableNameMap &Outputs() const { - return *boost::apply_visitor(OutputsVisitor(), op_); - } - - const framework::AttributeMap &Attrs() const { - return *boost::apply_visitor(AttributeMapVisitor(), op_); - } - - template <typename AttrType> - const AttrType &Attr(const std::string &name) const { - auto &attrs = Attrs(); - auto it = attrs.find(name); - PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name); - return boost::get<AttrType>(it->second); - } - - bool operator==(const OpVariant &other) const { - return RawPointer() == other.RawPointer(); - } - - const void *RawPointer() const { - return boost::apply_visitor(RawPointerVisitor(), op_); - } - - int which() const { return static_cast<int>(op_.which()); } - - struct Hasher { - size_t operator()(const OpVariant &op) const { - return reinterpret_cast<size_t>(op.RawPointer()); - } - }; - - private: - const boost::variant<const framework::OperatorBase *, - const framework::OpDesc *> - op_; -}; - -static std::string GetDebugString(const std::vector<std::string> &names) { - if (names.empty()) return ""; - std::string ret = names[0]; - for (size_t i = 1; i < names.size(); ++i) { - ret += (" " + names[i]); - } - return ret; -} - -// Set skip variables of while_op and while_grad_op -// These variables should be skipped when eager deletion enables. -// It is because: -// 1. while_grad_op needs some variables defined in while_op. -// 2. while_grad_op needs variables from the previous time step. -static void SetSkipVars(const OpVariant &op, std::vector<std::string> attr) { - auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs()); - VLOG(2) << "Prepare to skip " << attr.size() - << " var(s): " << GetDebugString(attr); - attrs[kSkipEagerDeletionVars] = std::move(attr); -} - -// Check whether the forward while_op and while_grad_op match -// The program may have many while_ops. 
-static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op, - const OpVariant &grad_op) { - return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) && - fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs); -} - -// Test whether the variable is skippable in forward while_op -// The variable is skippable in while_op when the variable used in while_grad -// is not from grad_block. -static bool IsSkippableVar(const std::string &name, - framework::BlockDesc *grad_block) { - return name != framework::kEmptyVarName && !grad_block->HasVar(name); -} - -static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op, - const OpVariant &bwd_op) { - auto *grad_block = bwd_op.Attr<framework::BlockDesc *>(kStepBlock); - - // Find all skippable variables in forward while_op - std::unordered_set<std::string> forward_skip_vars; - for (auto *op_desc : grad_block->AllOps()) { - for (auto &in_arg_name : op_desc->InputArgumentNames()) { - if (IsSkippableVar(in_arg_name, grad_block)) { - forward_skip_vars.insert(in_arg_name); - } - } - - for (auto &out_arg_name : op_desc->OutputArgumentNames()) { - if (IsSkippableVar(out_arg_name, grad_block)) { - forward_skip_vars.insert(out_arg_name); - } - } - } - - SetSkipVars(fwd_op, std::vector<std::string>(forward_skip_vars.begin(), - forward_skip_vars.end())); - - // Find all skippable variables in while_grad_op - // The skipped variables are those which would be used across time steps. - auto &fwd_input = fwd_op.Inputs().at(kX); - auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX)); - PADDLE_ENFORCE_EQ( - fwd_input.size(), in_grads.size(), - "Backward input gradient number does not match forward input number."); - - std::unordered_set<std::string> backward_skip_vars; - for (size_t i = 0; i < in_grads.size(); ++i) { - if (in_grads[i] == framework::kEmptyVarName) { - continue; - } - backward_skip_vars.insert(in_grads[i]); - backward_skip_vars.insert(framework::GradVarName(fwd_input[i])); - } - - SetSkipVars(bwd_op, std::vector<std::string>(backward_skip_vars.begin(), - backward_skip_vars.end())); -} - -// Find all while_ops and while_grad_ops in the graph or program -// The while_grad_op and while_op may located in different blocks -// So we should traverse all blocks in the program and find them out. 
-static void FindAllWhileAndWhileGradOp(std::vector<OpVariant> *while_ops, - std::vector<OpVariant> *while_grad_ops) { - PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size()); - - if (while_ops->empty()) return; - - const auto *program = - while_ops->front().Attr<framework::BlockDesc *>(kStepBlock)->Program(); - for (size_t i = 1; i < program->Size(); ++i) { - auto &block = program->Block(i); - for (size_t j = 0; j < block.OpSize(); ++j) { - auto *op = block.Op(j); - if (op->Type() == "while") { - while_ops->emplace_back(op); - } else if (op->Type() == "while_grad") { - while_grad_ops->emplace_back(op); - } - } - } - - PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(), - "There are extra while_grad ops in the graph or program"); -} - -static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl( - std::vector<OpVariant> *while_ops, std::vector<OpVariant> *while_grad_ops) { - FindAllWhileAndWhileGradOp(while_ops, while_grad_ops); - - VLOG(2) << "Found while op num: " << while_ops->size() - << ", while grad op num: " << while_grad_ops->size(); - - if (while_grad_ops->empty()) { - return; - } - - std::unordered_set<OpVariant, OpVariant::Hasher> while_op_set( - while_ops->begin(), while_ops->end()); - - for (auto &bwd_op : *while_grad_ops) { - const OpVariant *matched_fwd_op = nullptr; - for (auto &fwd_op : while_op_set) { - if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) { - PADDLE_ENFORCE(matched_fwd_op == nullptr, - "Found multiple matched while ops"); - matched_fwd_op = &fwd_op; - } - } - PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, - "Cannot find matched forward while op."); - ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op); - while_op_set.erase(*matched_fwd_op); - } -} - -void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( - int block_id, - const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops) { - // If block_id is not 0, returns - // This is because all while_ops and while_grad_ops in the whole program - // would be processed when block_id is 0 (i.e. when Executor::Run() or - // ParallelExecutor constructs). - - // What's more, all while_ops and while_grad_ops must be processed when - // block_id is zero. If not, while_op may run first and erase variables - // used in while_grad_op, and in this moment, while_grad_ops may be not - // constructed yet. - if (block_id != 0) return; - - std::vector<OpVariant> fwd_ops, bwd_ops; - for (auto &op : all_ops) { - if (op->Type() == "while") { - fwd_ops.emplace_back(op.get()); - } else if (op->Type() == "while_grad") { - bwd_ops.emplace_back(op.get()); - } - } - PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); -} - -void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( - const std::vector<framework::OperatorBase *> &while_ops, - const std::vector<framework::OperatorBase *> &while_grad_ops) { - std::vector<OpVariant> fwd_ops, bwd_ops; - fwd_ops.reserve(while_ops.size()); - for (auto *op : while_ops) { - fwd_ops.emplace_back(op); - } - - bwd_ops.reserve(while_grad_ops.size()); - for (auto *op : while_grad_ops) { - bwd_ops.emplace_back(op); - } - - PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h deleted file mode 100644 index 456ba8642b..0000000000 --- a/paddle/fluid/operators/controlflow/while_op_helper.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include <memory> -#include <string> -#include <vector> -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/variant.h" - -namespace paddle { -namespace operators { - -static constexpr char kStepBlock[] = "sub_block"; -static constexpr char kCondition[] = "Condition"; -static constexpr char kStepScopes[] = "StepScopes"; -static constexpr char kX[] = "X"; -static constexpr char kXGRAD[] = "X@GRAD"; -static constexpr char kOutputs[] = "Out"; -static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; - -void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( - int block_id, - const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops); - -void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( - const std::vector<framework::OperatorBase *> &while_ops, - const std::vector<framework::OperatorBase *> &while_grad_ops); - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index a97828e6fe..5b84221cfa 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include <algorithm> +#include <memory> #include <vector> #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -568,13 +569,31 @@ class ROIPerspectiveTransformOpMaker } }; +class ROIPerspectiveTransformGradDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("roi_perspective_transform_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(roi_perspective_transform, ops::ROIPerspectiveTransformOp, ops::ROIPerspectiveTransformOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::ROIPerspectiveTransformGradDescMaker); REGISTER_OPERATOR(roi_perspective_transform_grad, ops::ROIPerspectiveTransformGradOp); REGISTER_OP_CPU_KERNEL(roi_perspective_transform, diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc index 4a97428148..98ebe1fdf4 100644 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc @@ -65,11 +65,17 @@ by input arguments. } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + GaussianRandomBatchSizeLikeNoNeedBufferVarsInference, "Input"); + } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT( +REGISTER_OPERATOR( gaussian_random_batch_size_like, paddle::operators::GaussianRandomBatchSizeLikeOp, - paddle::operators::GaussianRandomBatchSizeLikeOpMaker); + paddle::operators::GaussianRandomBatchSizeLikeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::operators::GaussianRandomBatchSizeLikeNoNeedBufferVarsInference); + // Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 8efd43928a..44fd95edef 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/im2sequence_op.h" +#include <memory> #include <string> #include <vector> @@ -146,12 +147,28 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel { } }; +class Im2SequenceGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("im2sequence_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::Im2SequenceGradDescMaker); REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp); REGISTER_OP_CPU_KERNEL( im2sequence, diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 10d01af982..cfded65f0b 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -194,21 +194,43 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), - ctx.GetPlace()); + return framework::OpKernelType( + ctx.Input<Tensor>(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); } }; +class InterpolateGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType(ForwardOp().Type() + "_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(InterpolateGradNoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(bilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); -REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad); + ops::InterpolateGradDescMaker); +REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad, + ops::InterpolateGradNoNeedBufferVarsInference); REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); -REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad); + ops::InterpolateGradDescMaker); +REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad, + ops::InterpolateGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel<float>, ops::InterpolateKernel<double>, ops::InterpolateKernel<uint8_t>); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index bc115090ac..2696d0bef9 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and 
limitations under the License. */ #include "paddle/fluid/operators/l1_norm_op.h" +#include <memory> namespace paddle { namespace operators { @@ -62,12 +63,28 @@ $$Out = \sum{|X|}$$ } }; +class L1NormGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("l1_norm_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::L1NormGradDescMaker); REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp); REGISTER_OP_CPU_KERNEL( l1_norm, ops::L1NormKernel<paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index da59bd53bc..6d0af57318 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/label_smooth_op.h" +#include <memory> #include <string> namespace paddle { @@ -105,10 +106,23 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) shouldn't be null."); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); + } +}; + +class LabelSmoothGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("label_smooth_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; } }; @@ -117,7 +131,7 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::LabelSmoothGradDescMaker); REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp); REGISTER_OP_CPU_KERNEL( label_smooth, diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index e17b6cb598..fa09cb61e6 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/linear_chain_crf_op.h" +#include <memory> namespace paddle { namespace operators { @@ -250,14 +251,46 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { } }; +class LinearChainCRFGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("linear_chain_crf_grad"); + op->SetAttrMap(Attrs()); + + op->SetInput("Emission", Input("Emission")); + op->SetInput("Transition", Input("Transition")); + op->SetInput("Label", Input("Label")); + + op->SetInput("Alpha", Output("Alpha")); + op->SetInput("EmissionExps", Output("EmissionExps")); + op->SetInput("TransitionExps", Output("TransitionExps")); + + op->SetInput(framework::GradVarName("LogLikelihood"), + OutputGrad("LogLikelihood")); + + op->SetOutput(framework::GradVarName("Emission"), InputGrad("Emission")); + op->SetOutput(framework::GradVarName("Transition"), + InputGrad("Transition")); + + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + LinearChainCRFGradNoNeedBufferVarsInference, "Transition", "Emission"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp, - ops::LinearChainCRFOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); -REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp); + ops::LinearChainCRFOpMaker, ops::LinearChainCRFGradDescMaker); +REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp, + ops::LinearChainCRFGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( linear_chain_crf, ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, float>, diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index ef1fb83aa6..e8850a1e58 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/log_loss_op.h" +#include <memory> namespace paddle { namespace operators { @@ -100,12 +101,29 @@ class LogLossGradOp : public framework::OperatorWithKernel { } }; +class LogLossGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("log_loss_grad"); + op->SetInput("Predicted", Input("Predicted")); + op->SetInput("Labels", Input("Labels")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + op->SetOutput(framework::GradVarName("Predicted"), InputGrad("Predicted")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::LogLossGradDescMaker); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); REGISTER_OP_CPU_KERNEL( log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 4a199d681f..30c3945cbb 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -264,12 +264,51 @@ class LSTMGradOp : public framework::OperatorWithKernel { } }; +class LSTMGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("lstm_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("Input", Input("Input")); + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + + if (ForwardOp().Inputs().count("H0") > 0) { + op->SetInput("H0", Input("H0")); + op->SetOutput(framework::GradVarName("H0"), InputGrad("H0")); + } + + if (ForwardOp().Inputs().count("C0") > 0) { + op->SetInput("C0", Input("C0")); + op->SetOutput(framework::GradVarName("C0"), InputGrad("C0")); + } + + op->SetInput("Weight", Input("Weight")); + op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight")); + + op->SetInput("Bias", Input("Bias")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + + op->SetInput("Cell", Output("Cell")); + + op->SetInput("Hidden", Output("Hidden")); + op->SetInput(framework::GradVarName("Hidden"), OutputGrad("Hidden")); + + op->SetInput("BatchGate", Output("BatchGate")); + op->SetInput("BatchCellPreAct", Output("BatchCellPreAct")); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::LSTMGradOpDescMaker); REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp); REGISTER_OP_CPU_KERNEL( lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>, diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc index b643ba9d7f..b3d9733a97 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cc +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -94,8 +94,6 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { 
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("Activated"), @@ -106,13 +104,31 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel { } }; +class MarginRankLossGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("margin_rank_loss_grad"); + op->SetInput("Activated", Output("Activated")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Label", Input("Label")); + op->SetOutput(framework::GradVarName("X1"), InputGrad("X1")); + op->SetOutput(framework::GradVarName("X2"), InputGrad("X2")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp, ops::MarginRankLossOpMaker<float>, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::MarginRankLossGradDescMaker); REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp); REGISTER_OP_CPU_KERNEL( margin_rank_loss, diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 35b6d7b5e3..26d86afed0 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -61,7 +61,8 @@ class MeanGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto input_data_type = ctx.Input<Tensor>("X")->type(); + auto input_data_type = + ctx.Input<Tensor>(framework::GradVarName("Out"))->type(); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -81,13 +82,16 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(MeanGradNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType, ops::MeanGradMaker); -REGISTER_OPERATOR(mean_grad, ops::MeanGradOp); +REGISTER_OPERATOR(mean_grad, ops::MeanGradOp, + ops::MeanGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>, ops::MeanKernel<paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 1801f2915e..b3d0423b72 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/multiplex_op.h" +#include <vector> namespace paddle { namespace operators { @@ -111,28 +112,47 @@ class MultiplexGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(!ctx->Inputs("X").empty(), "Input(X) should not be null."); - PADDLE_ENFORCE(!ctx->Outputs(framework::GradVarName("X")).empty(), - "Output(X@Grad) should not be null."); + auto& dxs = ctx->Outputs(framework::GradVarName("X")); + PADDLE_ENFORCE(!dxs.empty(), "Output(X@Grad) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); - ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputsDim(framework::GradVarName("X"), + std::vector<framework::DDim>(dxs.size(), dout_dim)); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.MultiInput<Tensor>("X")[0]->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input<Tensor>(framework::GradVarName("Out"))->type(), + ctx.device_context()); + } +}; + +class MultiplexGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("multiplex_grad"); + op->SetInput("Ids", Input("Ids")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + op->SetAttrMap(Attrs()); + return op; } }; } // namespace operators } // namespace paddle + namespace ops = paddle::operators; REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, - paddle::framework::DefaultGradOpDescMaker<false>); + ops::MultiplexGradDescMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OP_CPU_KERNEL( multiplex, diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu index 2f8a602f3c..1ef54ecc73 100644 --- a/paddle/fluid/operators/multiplex_op.cu +++ b/paddle/fluid/operators/multiplex_op.cu @@ -53,20 +53,25 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> { public: void Compute(const framework::ExecutionContext& ctx) const { auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out")); - auto ins = ctx.MultiInput<Tensor>("X"); auto* ids = ctx.Input<Tensor>("Ids"); auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X")); + + size_t idx = -1UL; for (size_t i = 0; i < d_ins.size(); i++) { if (d_ins[i]) { d_ins[i]->mutable_data<T>(ctx.GetPlace()); auto t = framework::EigenVector<T>::Flatten(*d_ins[i]); t.device(*ctx.template device_context<Place>().eigen_device()) = t.constant(static_cast<T>(0)); + + idx = i; } } - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; + if (idx == -1UL) return; + + auto rows = d_ins[idx]->dims()[0]; + auto cols = d_ins[idx]->numel() / rows; // copy index to cpu Tensor index_t_cpu; TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h index 87de000971..44d6cc84a6 100644 --- a/paddle/fluid/operators/multiplex_op.h +++ 
b/paddle/fluid/operators/multiplex_op.h @@ -52,20 +52,25 @@ class MultiplexGradCPUKernel : public framework::OpKernel<T> { void Compute(const framework::ExecutionContext& ctx) const { auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out")); auto* ids = ctx.Input<framework::Tensor>("Ids"); - auto ins = ctx.MultiInput<framework::Tensor>("X"); auto d_ins = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X")); + + size_t idx = -1UL; for (size_t i = 0; i < d_ins.size(); i++) { if (d_ins[i]) { d_ins[i]->mutable_data<T>(ctx.GetPlace()); auto t = framework::EigenVector<T>::Flatten(*d_ins[i]); t.device(*ctx.template device_context<DeviceContext>().eigen_device()) = t.constant(static_cast<T>(0)); + + idx = i; } } - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; + if (idx == -1UL) return; + + auto rows = d_ins[idx]->dims()[0]; + auto cols = d_ins[idx]->numel() / rows; auto* index = ids->data<int32_t>(); platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index d4b631a6f5..c28106d312 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/pad_op.h" +#include <memory> namespace paddle { namespace operators { @@ -29,7 +30,7 @@ class PadOp : public framework::OperatorWithKernel { "Output(Out) of PadOp should not be null."); auto x_dim = ctx->GetInputDim("X"); - auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); + auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()), "Size of paddings should be equal to 2 * dimension size " "of input tensor."); @@ -99,13 +100,20 @@ class PadOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); - auto x_dims = ctx->GetInputDim("X"); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); + for (int i = 0; i < dout_dims.size(); ++i) { + dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]); + } + auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); + for (int i = 0; i < dout_dims.size(); ++i) { + dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]); + } + ctx->SetOutputDim(x_grad_name, dout_dims); } } }; @@ -117,7 +125,6 @@ class PadOpGradMaker : public framework::SingleGradOpDescMaker { protected: std::unique_ptr<framework::OpDesc> Apply() const override { auto* bind = new framework::OpDesc(); - bind->SetInput("X", Input("X")); bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index 78989582b7..dce9108eb1 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc 
+++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/psroi_pool_op.h" +#include <memory> namespace paddle { namespace operators { @@ -154,12 +155,29 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel { } }; +class PSROIPoolGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("psroi_pool_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::PSROIPoolGradDescMaker); REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); REGISTER_OP_CPU_KERNEL( psroi_pool, diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 313cf01541..45daa6b955 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/rank_loss_op.h" +#include <memory> #include <string> namespace paddle { @@ -116,6 +117,25 @@ class RankLossGradOp : public framework::OperatorWithKernel { } }; +class RankLossGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("rank_loss_grad"); + op->SetInput("Label", Input("Label")); + op->SetInput("Left", Input("Left")); + op->SetInput("Right", Input("Right")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("Left"), InputGrad("Left")); + op->SetOutput(framework::GradVarName("Right"), InputGrad("Right")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 2898a62ddb..45c87bb085 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -15,24 +15,24 @@ limitations under the License. 
*/ #include <vector> #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/controlflow/loop_op_helper.h" namespace paddle { namespace operators { -constexpr char kInputs[] = "inputs"; -constexpr char kInitialStates[] = "initial_states"; -constexpr char kParameters[] = "parameters"; -constexpr char kOutputs[] = "outputs"; -constexpr char kStepScopes[] = "step_scopes"; -constexpr char kExStates[] = "ex_states"; -constexpr char kStates[] = "states"; -constexpr char kStepBlock[] = "sub_block"; -constexpr char kReverse[] = "reverse"; -constexpr char kIsTrain[] = "is_train"; -#define GRAD_SUFFIX "@GRAD" -constexpr char kInputGrads[] = "inputs" GRAD_SUFFIX; -constexpr char kOutputGrads[] = "outputs" GRAD_SUFFIX; -constexpr char kParamGrads[] = "parameters" GRAD_SUFFIX; -constexpr char kInitStateGrads[] = "initial_states" GRAD_SUFFIX; + +using recurrent::kInputs; +using recurrent::kInitialStates; +using recurrent::kParameters; +using recurrent::kOutputs; +using recurrent::kStepScopes; +using recurrent::kExStates; +using recurrent::kStates; +using recurrent::kReverse; +using recurrent::kIsTrain; +using recurrent::kInputGrads; +using recurrent::kOutputGrads; +using recurrent::kParamGrads; +using recurrent::kInitStateGrads; using StepScopeVar = std::vector<framework::Scope *>; @@ -249,6 +249,9 @@ class RecurrentOp : public RecurrentBase { framework::Executor executor(place); auto *block = Attr<framework::BlockDesc *>(kStepBlock); + auto &keep_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars); + VLOG(2) << GetSkipEagerDeletionVarsDebugString(keep_vars); + auto *program = block->Program(); for (size_t i = 0; i < seq_len; ++i) { @@ -283,8 +286,7 @@ class RecurrentOp : public RecurrentBase { // Every inputs are linked now, execute! executor.Run(*program, &cur_scope, block->ID(), false /*create_local_scope*/, true /*create_vars*/, - std::vector<std::string>() /*skip_ref_cnt_vars*/, - true /*force_disable_gc*/); + keep_vars); // get device context from pool platform::DeviceContextPool &pool = @@ -341,6 +343,9 @@ class RecurrentGradOp : public RecurrentBase { auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *program = block->Program(); + auto &keep_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars); + + VLOG(2) << GetSkipEagerDeletionVarsDebugString(keep_vars); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); @@ -401,8 +406,7 @@ class RecurrentGradOp : public RecurrentBase { // Run step block with cur_scope executor.Run(*program, &cur_scope, block->ID(), false /*create_local_scope*/, true /*create_vars*/, - std::vector<std::string>() /*skip_ref_cnt_vars*/, - true /*force_disable_gc*/); + keep_vars); VLOG(5) << "executor.Run finished "; @@ -579,6 +583,10 @@ if reverse is True o o o o )DOC").SetDefault(false); AddAttr<bool>(kIsTrain, "").SetDefault(true); + AddAttr<std::vector<std::string>>(kSkipEagerDeletionVars, + "Skip vars that would " + "be used in backward ops") + .SetDefault(std::vector<std::string>()); AddComment(R"DOC( Static Length Recurrent Operator. 
@@ -614,7 +622,11 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { this->OutputGrad(output_param)); } } - grad->SetAttrMap(this->Attrs()); + + auto attrs = this->Attrs(); + attrs.insert({kSkipEagerDeletionVars, std::vector<std::string>()}); + grad->SetAttrMap(attrs); + grad->SetBlockAttr(kStepBlock, grad_block_[0]); return std::unique_ptr<framework::OpDesc>(grad); diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 6857b5ed9d..7bb10ce063 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/roi_align_op.h" +#include <memory> namespace paddle { namespace operators { @@ -147,12 +148,29 @@ Thus avoid the misaligned problem. } }; +class ROIAlignGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("roi_align_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::ROIAlignGradDescMaker); REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp); REGISTER_OP_CPU_KERNEL( roi_align, diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index e46d92d6fc..cfac7e09e1 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/roi_pool_op.h" +#include <memory> namespace paddle { namespace operators { @@ -158,12 +159,30 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn } }; +class ROIPoolGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("roi_pool_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput("Argmax", Output("Argmax")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::ROIPoolGradDescMaker); REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); REGISTER_OP_CPU_KERNEL( roi_pool, diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index ad418d51bc..1c26707500 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -63,14 +63,16 @@ class ScatterGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { ctx->SetOutputDim(framework::GradVarName("Updates"), ctx->GetInputDim("Updates")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input<Tensor>(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -95,12 +97,34 @@ $$ } }; +class ScatterGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("scatter_grad"); + op->SetInput("Ids", Input("Ids")); + op->SetInput("Updates", Input("Updates")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Updates"), InputGrad("Updates")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ScatterGradNoNeedBufferVarsInference, + "Updates"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); -REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp); + ops::ScatterGradDescMaker); +REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp, + ops::ScatterGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>); REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>); diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 9349912e09..26355e5861 100644 --- 
a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/shuffle_channel_op.h" +#include <memory> namespace paddle { namespace operators { @@ -91,13 +92,28 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { } }; +class ShuffleChannelGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); + op->SetType("shuffle_channel_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp, - ops::ShuffleChannelOpMaker, - paddle::framework::DefaultGradOpDescMaker<true>); + ops::ShuffleChannelOpMaker, ops::ShuffleChannelGradDescMaker); REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp); From c4c6205268d7863b714334dcdcdd31e1576e540d Mon Sep 17 00:00:00 2001 From: sneaxiy <sneaxiy@126.com> Date: Wed, 27 Mar 2019 21:18:48 +0800 Subject: [PATCH 05/19] fix gc bug test=develop --- paddle/fluid/framework/details/reference_count_pass.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 0c3d8d5cae..c218e55b70 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -335,6 +335,7 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( var_name); ref_cnts[i].emplace(var_name, result.size()); last_live_ops_of_vars[i].emplace(var_name, std::move(result)); + break; } // Seldomly, all preceding trying failed. 
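The one-line change in the patch above adds a break so that ReferenceCountPass stops at the first candidate op set that is successfully recorded for a variable; the comment that follows ("Seldomly, all preceding trying failed.") indicates the code after the loop only handles the case where every attempt failed. A minimal, standalone C++ sketch of that record-first-success-then-stop control flow (the names and data below are illustrative only, not the actual pass code):

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Hypothetical candidate op sets computed for one variable, in priority order.
  std::vector<std::set<std::string>> candidates = {
      {},                      // first attempt failed (empty result)
      {"relu_grad", "scale"},  // second attempt succeeded
      {"scale"}};              // lower-priority attempt, should never be reached

  std::map<std::string, size_t> ref_cnts;
  for (const auto& result : candidates) {
    if (result.empty()) continue;   // this attempt failed, try the next one
    ref_cnts["x"] = result.size();  // record the first successful result
    break;                          // stop searching; later candidates are ignored
  }
  std::cout << "reference count of x: " << ref_cnts["x"] << std::endl;  // prints 2
  return 0;
}

Without the break, the loop would keep evaluating the remaining, lower-priority candidates even after a result had already been recorded for the variable.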
From 4c8254e3bf426a044ef51d661193bd9a720dc204 Mon Sep 17 00:00:00 2001 From: sneaxiy <sneaxiy@126.com> Date: Wed, 27 Mar 2019 10:53:01 +0000 Subject: [PATCH 06/19] revert some loop op revision test=develop --- .../operators/controlflow/CMakeLists.txt | 2 +- .../fluid/operators/controlflow/while_op.cc | 21 +- .../operators/controlflow/while_op_helper.cc | 291 ++++++++++++++++++ .../operators/controlflow/while_op_helper.h | 43 +++ paddle/fluid/operators/interpolate_op.cc | 4 + paddle/fluid/operators/lstm_op.cc | 1 + paddle/fluid/operators/margin_rank_loss_op.cc | 1 + paddle/fluid/operators/mean_op.cc | 3 + paddle/fluid/operators/multiplex_op.cc | 1 + paddle/fluid/operators/recurrent_op.cc | 52 ++-- paddle/fluid/operators/scatter_op.cc | 1 + 11 files changed, 380 insertions(+), 40 deletions(-) create mode 100644 paddle/fluid/operators/controlflow/while_op_helper.cc create mode 100644 paddle/fluid/operators/controlflow/while_op_helper.h diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 4782e9d5ff..7aa1c44eaa 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -1,5 +1,5 @@ include(operators) register_operators(DEPS naive_executor) -cc_library(loop_op_helper SRCS loop_op_helper.cc DEPS operator) +cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator) file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index a07a732d88..b321920882 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -18,21 +18,28 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/controlflow/loop_op_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/detail/safe_ref.h" namespace paddle { namespace operators { -static constexpr char kCondition[] = "Condition"; -static constexpr char kStepScopes[] = "StepScopes"; -static constexpr char kX[] = "X"; -static constexpr char kXGRAD[] = "X@GRAD"; -static constexpr char kOutputs[] = "Out"; - using StepScopeVar = std::vector<framework::Scope *>; using LoDTensor = framework::LoDTensor; +namespace { // NOLINT +static std::string GetSkipEagerDeletionVarsDebugString( + const std::vector<std::string> &vars) { + std::string str = "Skip " + std::to_string(vars.size()) + + " var(s) in eager deletion mode: "; + for (auto &var : vars) { + str.append(var); + str.push_back(' '); + } + return str; +} +} // NOLINT + class WhileOp : public framework::OperatorBase { public: WhileOp(const std::string &type, const framework::VariableNameMap &inputs, diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc new file mode 100644 index 0000000000..2cbd94a061 --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -0,0 +1,291 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/controlflow/while_op_helper.h" +#include <string> +#include <unordered_set> +#include <utility> +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace operators { + +// OpVariant is a wrapper class of OpDesc and OperatorBase +// So that API would be the same. +class OpVariant { + struct InputsVisitor + : public boost::static_visitor<const framework::VariableNameMap *> { + template <typename OpType> + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Inputs()); + } + }; + + struct OutputsVisitor + : public boost::static_visitor<const framework::VariableNameMap *> { + template <typename OpType> + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Outputs()); + } + }; + + struct AttributeMapVisitor + : public boost::static_visitor<const framework::AttributeMap *> { + const framework::AttributeMap *operator()( + const framework::OpDesc *op) const { + return &(op->GetAttrMap()); + } + + const framework::AttributeMap *operator()( + const framework::OperatorBase *op) const { + return &(op->Attrs()); + } + }; + + struct RawPointerVisitor : public boost::static_visitor<const void *> { + template <typename OpType> + const void *operator()(const OpType *op) const { + return op; + } + }; + + public: + OpVariant(const framework::OperatorBase *op) : op_(op) {} // NOLINT + + OpVariant(const framework::OpDesc *op) : op_(op) {} // NOLINT + + const framework::VariableNameMap &Inputs() const { + return *boost::apply_visitor(InputsVisitor(), op_); + } + + const framework::VariableNameMap &Outputs() const { + return *boost::apply_visitor(OutputsVisitor(), op_); + } + + const framework::AttributeMap &Attrs() const { + return *boost::apply_visitor(AttributeMapVisitor(), op_); + } + + template <typename AttrType> + const AttrType &Attr(const std::string &name) const { + auto &attrs = Attrs(); + auto it = attrs.find(name); + PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name); + return boost::get<AttrType>(it->second); + } + + bool operator==(const OpVariant &other) const { + return RawPointer() == other.RawPointer(); + } + + const void *RawPointer() const { + return boost::apply_visitor(RawPointerVisitor(), op_); + } + + int which() const { return static_cast<int>(op_.which()); } + + struct Hasher { + size_t operator()(const OpVariant &op) const { + return reinterpret_cast<size_t>(op.RawPointer()); + } + }; + + private: + const boost::variant<const framework::OperatorBase *, + const framework::OpDesc *> + op_; +}; + +static std::string GetDebugString(const std::vector<std::string> &names) { + if (names.empty()) return ""; + std::string ret = names[0]; + for (size_t i = 1; i < names.size(); ++i) { + ret += (" " + names[i]); + } + return ret; +} + +// Set skip variables of while_op and while_grad_op +// These variables should be skipped when eager deletion enables. +// It is because: +// 1. while_grad_op needs some variables defined in while_op. +// 2. while_grad_op needs variables from the previous time step. 
+static void SetSkipVars(const OpVariant &op, std::vector<std::string> attr) { + auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs()); + VLOG(2) << "Prepare to skip " << attr.size() + << " var(s): " << GetDebugString(attr); + attrs[kSkipEagerDeletionVars] = std::move(attr); +} + +// Check whether the forward while_op and while_grad_op match +// The program may have many while_ops. +static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op, + const OpVariant &grad_op) { + return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) && + fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs); +} + +// Test whether the variable is skippable in forward while_op +// The variable is skippable in while_op when the variable used in while_grad +// is not from grad_block. +static bool IsSkippableVar(const std::string &name, + framework::BlockDesc *grad_block) { + return name != framework::kEmptyVarName && !grad_block->HasVar(name); +} + +static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op, + const OpVariant &bwd_op) { + auto *grad_block = bwd_op.Attr<framework::BlockDesc *>(kStepBlock); + + // Find all skippable variables in forward while_op + std::unordered_set<std::string> forward_skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + if (IsSkippableVar(in_arg_name, grad_block)) { + forward_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (IsSkippableVar(out_arg_name, grad_block)) { + forward_skip_vars.insert(out_arg_name); + } + } + } + + SetSkipVars(fwd_op, std::vector<std::string>(forward_skip_vars.begin(), + forward_skip_vars.end())); + + // Find all skippable variables in while_grad_op + // The skipped variables are those which would be used across time steps. + auto &fwd_input = fwd_op.Inputs().at(kX); + auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX)); + PADDLE_ENFORCE_EQ( + fwd_input.size(), in_grads.size(), + "Backward input gradient number does not match forward input number."); + + std::unordered_set<std::string> backward_skip_vars; + for (size_t i = 0; i < in_grads.size(); ++i) { + if (in_grads[i] == framework::kEmptyVarName) { + continue; + } + backward_skip_vars.insert(in_grads[i]); + backward_skip_vars.insert(framework::GradVarName(fwd_input[i])); + } + + SetSkipVars(bwd_op, std::vector<std::string>(backward_skip_vars.begin(), + backward_skip_vars.end())); +} + +// Find all while_ops and while_grad_ops in the graph or program +// The while_grad_op and while_op may located in different blocks +// So we should traverse all blocks in the program and find them out. 
+static void FindAllWhileAndWhileGradOp(std::vector<OpVariant> *while_ops, + std::vector<OpVariant> *while_grad_ops) { + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size()); + + if (while_ops->empty()) return; + + const auto *program = + while_ops->front().Attr<framework::BlockDesc *>(kStepBlock)->Program(); + for (size_t i = 1; i < program->Size(); ++i) { + auto &block = program->Block(i); + for (size_t j = 0; j < block.OpSize(); ++j) { + auto *op = block.Op(j); + if (op->Type() == "while") { + while_ops->emplace_back(op); + } else if (op->Type() == "while_grad") { + while_grad_ops->emplace_back(op); + } + } + } + + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(), + "There are extra while_grad ops in the graph or program"); +} + +static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl( + std::vector<OpVariant> *while_ops, std::vector<OpVariant> *while_grad_ops) { + FindAllWhileAndWhileGradOp(while_ops, while_grad_ops); + + VLOG(2) << "Found while op num: " << while_ops->size() + << ", while grad op num: " << while_grad_ops->size(); + + if (while_grad_ops->empty()) { + return; + } + + std::unordered_set<OpVariant, OpVariant::Hasher> while_op_set( + while_ops->begin(), while_ops->end()); + + for (auto &bwd_op : *while_grad_ops) { + const OpVariant *matched_fwd_op = nullptr; + for (auto &fwd_op : while_op_set) { + if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) { + PADDLE_ENFORCE(matched_fwd_op == nullptr, + "Found multiple matched while ops"); + matched_fwd_op = &fwd_op; + } + } + PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, + "Cannot find matched forward while op."); + ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op); + while_op_set.erase(*matched_fwd_op); + } +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops) { + // If block_id is not 0, returns + // This is because all while_ops and while_grad_ops in the whole program + // would be processed when block_id is 0 (i.e. when Executor::Run() or + // ParallelExecutor constructs). + + // What's more, all while_ops and while_grad_ops must be processed when + // block_id is zero. If not, while_op may run first and erase variables + // used in while_grad_op, and in this moment, while_grad_ops may be not + // constructed yet. + if (block_id != 0) return; + + std::vector<OpVariant> fwd_ops, bwd_ops; + for (auto &op : all_ops) { + if (op->Type() == "while") { + fwd_ops.emplace_back(op.get()); + } else if (op->Type() == "while_grad") { + bwd_ops.emplace_back(op.get()); + } + } + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector<framework::OperatorBase *> &while_ops, + const std::vector<framework::OperatorBase *> &while_grad_ops) { + std::vector<OpVariant> fwd_ops, bwd_ops; + fwd_ops.reserve(while_ops.size()); + for (auto *op : while_ops) { + fwd_ops.emplace_back(op); + } + + bwd_ops.reserve(while_grad_ops.size()); + for (auto *op : while_grad_ops) { + bwd_ops.emplace_back(op); + } + + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h new file mode 100644 index 0000000000..456ba8642b --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <memory> +#include <string> +#include <vector> +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace operators { + +static constexpr char kStepBlock[] = "sub_block"; +static constexpr char kCondition[] = "Condition"; +static constexpr char kStepScopes[] = "StepScopes"; +static constexpr char kX[] = "X"; +static constexpr char kXGRAD[] = "X@GRAD"; +static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops); + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector<framework::OperatorBase *> &while_ops, + const std::vector<framework::OperatorBase *> &while_grad_ops); + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index cfded65f0b..edee8c08d0 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -10,6 +10,7 @@ limitations under the License. */ #include "paddle/fluid/operators/interpolate_op.h" +#include <memory> #include <string> #include <vector> #include "paddle/fluid/framework/op_registry.h" @@ -209,6 +210,9 @@ class InterpolateGradDescMaker : public framework::SingleGradOpDescMaker { std::unique_ptr<framework::OpDesc> op(new framework::OpDesc()); op->SetType(ForwardOp().Type() + "_grad"); op->SetInput("X", Input("X")); + if (ForwardOp().Inputs().count("OutSize") > 0) { + op->SetInput("OutSize", Input("OutSize")); + } op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 30c3945cbb..52e4e8be28 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lstm_op.h" +#include <memory> #include <string> namespace paddle { diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc index b3d9733a97..fca3532551 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cc +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/margin_rank_loss_op.h" +#include <memory> namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 26d86afed0..2b2f845076 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/mean_op.h" +#include <memory> #include <string> +#include <unordered_map> + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index b3d0423b72..7cb213e899 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/multiplex_op.h" +#include <memory> #include <vector> namespace paddle { diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 45c87bb085..2898a62ddb 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -15,24 +15,24 @@ limitations under the License. */ #include <vector> #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/controlflow/loop_op_helper.h" namespace paddle { namespace operators { - -using recurrent::kInputs; -using recurrent::kInitialStates; -using recurrent::kParameters; -using recurrent::kOutputs; -using recurrent::kStepScopes; -using recurrent::kExStates; -using recurrent::kStates; -using recurrent::kReverse; -using recurrent::kIsTrain; -using recurrent::kInputGrads; -using recurrent::kOutputGrads; -using recurrent::kParamGrads; -using recurrent::kInitStateGrads; +constexpr char kInputs[] = "inputs"; +constexpr char kInitialStates[] = "initial_states"; +constexpr char kParameters[] = "parameters"; +constexpr char kOutputs[] = "outputs"; +constexpr char kStepScopes[] = "step_scopes"; +constexpr char kExStates[] = "ex_states"; +constexpr char kStates[] = "states"; +constexpr char kStepBlock[] = "sub_block"; +constexpr char kReverse[] = "reverse"; +constexpr char kIsTrain[] = "is_train"; +#define GRAD_SUFFIX "@GRAD" +constexpr char kInputGrads[] = "inputs" GRAD_SUFFIX; +constexpr char kOutputGrads[] = "outputs" GRAD_SUFFIX; +constexpr char kParamGrads[] = "parameters" GRAD_SUFFIX; +constexpr char kInitStateGrads[] = "initial_states" GRAD_SUFFIX; using StepScopeVar = std::vector<framework::Scope *>; @@ -249,9 +249,6 @@ class RecurrentOp : public RecurrentBase { framework::Executor executor(place); auto *block = Attr<framework::BlockDesc *>(kStepBlock); - auto &keep_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars); - VLOG(2) << GetSkipEagerDeletionVarsDebugString(keep_vars); - auto *program = block->Program(); for (size_t i = 0; i < seq_len; ++i) { @@ -286,7 +283,8 @@ class RecurrentOp : public RecurrentBase { // Every inputs are linked now, execute! 
executor.Run(*program, &cur_scope, block->ID(), false /*create_local_scope*/, true /*create_vars*/, - keep_vars); + std::vector<std::string>() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); // get device context from pool platform::DeviceContextPool &pool = @@ -343,9 +341,6 @@ class RecurrentGradOp : public RecurrentBase { auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *program = block->Program(); - auto &keep_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars); - - VLOG(2) << GetSkipEagerDeletionVarsDebugString(keep_vars); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); @@ -406,7 +401,8 @@ class RecurrentGradOp : public RecurrentBase { // Run step block with cur_scope executor.Run(*program, &cur_scope, block->ID(), false /*create_local_scope*/, true /*create_vars*/, - keep_vars); + std::vector<std::string>() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); VLOG(5) << "executor.Run finished "; @@ -583,10 +579,6 @@ if reverse is True o o o o )DOC").SetDefault(false); AddAttr<bool>(kIsTrain, "").SetDefault(true); - AddAttr<std::vector<std::string>>(kSkipEagerDeletionVars, - "Skip vars that would " - "be used in backward ops") - .SetDefault(std::vector<std::string>()); AddComment(R"DOC( Static Length Recurrent Operator. @@ -622,11 +614,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { this->OutputGrad(output_param)); } } - - auto attrs = this->Attrs(); - attrs.insert({kSkipEagerDeletionVars, std::vector<std::string>()}); - grad->SetAttrMap(attrs); - + grad->SetAttrMap(this->Attrs()); grad->SetBlockAttr(kStepBlock, grad_block_[0]); return std::unique_ptr<framework::OpDesc>(grad); diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 1c26707500..8e0e3bd605 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/scatter_op.h" +#include <memory> #include "paddle/fluid/framework/ddim.h" namespace paddle { From 09dfc7a2aa9296d820cead49a2e125aea5e72ae8 Mon Sep 17 00:00:00 2001 From: Wojciech Uss <wojciech.uss@intel.com> Date: Wed, 27 Mar 2019 20:13:01 +0100 Subject: [PATCH 07/19] C-API quantization core 2 (#16396) * C-API quantization core test=develop Co-authored-by: Sylwester Fraczek <sylwester.fraczek@intel.com> * Decouple Quantizer from AnalysisPredictor test=develop * fixes after review test=develop * renamed mkldnn quantize stuff test=develop * remove ifdef from header file test=develop --- paddle/fluid/inference/CMakeLists.txt | 14 +- paddle/fluid/inference/api/CMakeLists.txt | 12 +- paddle/fluid/inference/api/analysis_config.cc | 52 ++- .../fluid/inference/api/analysis_predictor.cc | 56 ++- .../fluid/inference/api/analysis_predictor.h | 13 + .../api/analysis_predictor_tester.cc | 241 ++++++++++ .../fluid/inference/api/mkldnn_quantizer.cc | 437 ++++++++++++++++++ paddle/fluid/inference/api/mkldnn_quantizer.h | 104 +++++ .../inference/api/mkldnn_quantizer_config.cc | 40 ++ .../inference/api/paddle_analysis_config.h | 18 + .../api/paddle_mkldnn_quantizer_config.h | 105 +++++ .../inference/api/paddle_pass_builder.cc | 4 +- .../fluid/inference/api/paddle_pass_builder.h | 22 +- 13 files changed, 1089 insertions(+), 29 deletions(-) create mode 100644 paddle/fluid/inference/api/mkldnn_quantizer.cc create mode 100644 paddle/fluid/inference/api/mkldnn_quantizer.h create mode 100644 paddle/fluid/inference/api/mkldnn_quantizer_config.cc create mode 100644 paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 4cd29486a8..5e0be5d445 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -37,18 +37,24 @@ endif(WIN32) add_subdirectory(api) +if(WITH_MKLDNN) + set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/api/mkldnn_quantizer.cc) + set(mkldnn_quantizer_cfg mkldnn_quantizer_config) +endif() + set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor) set(SHARED_INFERENCE_SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc + ${mkldnn_quantizer_src} ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc) if(WIN32) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array - analysis_config paddle_pass_builder) + analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder) else(WIN32) cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} - zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) + zero_copy_tensor reset_tensor_array analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder) endif(WIN32) if(NOT APPLE) @@ -61,11 +67,11 @@ endif() if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array - analysis_config paddle_pass_builder) + analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder) else(WIN32) cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array - analysis_config paddle_pass_builder) + analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder) endif() get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) 
target_link_libraries(paddle_fluid_shared ${os_dependency_modules}) diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 90f09505c0..882bb34683 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -33,13 +33,19 @@ endif() add_subdirectory(details) -cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder) +if(WITH_MKLDNN) + set(mkldnn_quantizer_src mkldnn_quantizer.cc) + set(mkldnn_quantizer_cfg mkldnn_quantizer_config) + cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder) +endif() + +cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder) cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor +cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS paddle_inference_api zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps}) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config - analysis_config paddle_pass_builder zero_copy_tensor + paddle_pass_builder zero_copy_tensor reset_tensor_array) cc_test(test_paddle_inference_api diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 7bfdada496..aee94e1234 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -108,6 +108,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); + // Quantization related. 
+ CP_MEMBER(use_mkldnn_quantizer_); + CP_MEMBER(mkldnn_quantizer_config_); CP_MEMBER(use_anakin_); CP_MEMBER(anakin_max_batchsize_); @@ -148,6 +151,26 @@ void AnalysisConfig::EnableMKLDNN() { Update(); } +void AnalysisConfig::EnableMkldnnQuantizer() { +#ifdef PADDLE_WITH_MKLDNN + if (!mkldnn_quantizer_config_) + mkldnn_quantizer_config_.reset(new MkldnnQuantizerConfig()); + use_mkldnn_quantizer_ = true; +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer"; + use_mkldnn_quantizer_ = false; +#endif + + Update(); +} + +std::shared_ptr<MkldnnQuantizerConfig> AnalysisConfig::mkldnn_quantizer_config() + const { + PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_, + "MkldnnQuantizer was not enabled yet."); + return mkldnn_quantizer_config_; +} + void AnalysisConfig::EnableTensorRtEngine( int workspace_size, int max_batch_size, int min_subgraph_size, AnalysisConfig::Precision precision_mode, bool use_static) { @@ -224,15 +247,27 @@ void AnalysisConfig::Update() { #endif } - if (enable_memory_optim_) { - auto analysis_passes = pass_builder()->AnalysisPasses(); - auto memory_opti_pass_name = "memory_optimize_pass"; - bool already_exists = - std::find(analysis_passes.begin(), analysis_passes.end(), - memory_opti_pass_name) != analysis_passes.end(); - if (!already_exists) { - pass_builder()->AppendAnalysisPass(memory_opti_pass_name); + // Quantization passes must come after all other optimization passes + if (use_mkldnn_quantizer_) { + if (!enable_ir_optim_) { + LOG(ERROR) << "EnableMkldnnQuantizer() only works when IR optimization " + "is enabled."; } +#ifdef PADDLE_WITH_MKLDNN + pass_builder()->EnableMkldnnQuantizer(); +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer"; + use_mkldnn_quantizer_ = false; +#endif + } + +#ifdef PADDLE_WITH_MKLDNN + // Do not optimize before quantization + if (enable_memory_optim_ && !use_mkldnn_quantizer_) { +#else + if (enable_memory_optim_) { +#endif + pass_builder()->AppendAnalysisPass("memory_optimize_pass"); } if (use_anakin_) { @@ -277,6 +312,7 @@ std::string AnalysisConfig::SerializeInfoCache() { for (auto &item : mkldnn_enabled_op_types_) ss << item; ss << ";"; + ss << use_mkldnn_quantizer_; ss << model_from_memory_; ss << enable_ir_optim_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 001e8e66d5..f726056154 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -18,6 +18,7 @@ #include <fstream> #include <memory> #include <string> +#include <utility> #include <vector> #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" @@ -35,8 +36,13 @@ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#endif + #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" @@ -341,10 +347,7 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs, return true; } -// NOTE All the members in AnalysisConfig should be copied to Argument. 
-void AnalysisPredictor::OptimizeInferenceProgram() { - status_program_optimized_ = true; - +void AnalysisPredictor::PrepareArgument() { argument_.SetUseGPU(config_.use_gpu()); argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); @@ -390,6 +393,16 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); } +#ifdef PADDLE_WITH_MKLDNN + if (config_.mkldnn_quantizer_enabled()) { + LOG(INFO) << "Quantization is enabled"; + argument_.SetQuantizeEnabledOpTypes( + config_.mkldnn_quantizer_config()->enabled_op_types()); + argument_.SetQuantizeExcludedOpIds( + config_.mkldnn_quantizer_config()->excluded_op_ids()); + } +#endif + auto passes = config_.pass_builder()->AllPasses(); if (!config_.ir_optim()) { passes.clear(); @@ -398,6 +411,13 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetIrAnalysisPasses(passes); argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); argument_.SetScopeNotOwned(scope_.get()); +} + +// NOTE All the members in AnalysisConfig should be copied to Argument. +void AnalysisPredictor::OptimizeInferenceProgram() { + status_program_optimized_ = true; + + PrepareArgument(); Analyzer().Run(&argument_); PADDLE_ENFORCE(argument_.scope_valid()); @@ -439,12 +459,31 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< } std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config)); - if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) { + auto predictor_p = dynamic_cast<AnalysisPredictor *>(predictor.get()); + + if (!predictor_p->Init(nullptr)) { + return nullptr; + } + + if (config.mkldnn_quantizer_enabled() && !predictor_p->MkldnnQuantize()) { return nullptr; } + return predictor; } +bool AnalysisPredictor::MkldnnQuantize() { +#if PADDLE_WITH_MKLDNN + if (!mkldnn_quantizer_) + mkldnn_quantizer_ = new AnalysisPredictor::MkldnnQuantizer( + *this, config_.mkldnn_quantizer_config()); + return mkldnn_quantizer_->Quantize(); +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer"; + return false; +#endif +} + void AnalysisPredictor::PrepareFeedFetch() { PADDLE_ENFORCE_NOT_NULL(sub_scope_); CreateFeedFetchVar(sub_scope_); @@ -703,6 +742,13 @@ AnalysisPredictor::~AnalysisPredictor() { scope_->DeleteScope(sub_scope_); } +#if PADDLE_WITH_MKLDNN + if (mkldnn_quantizer_) { + delete mkldnn_quantizer_; + mkldnn_quantizer_ = nullptr; + } +#endif + // TODO(Superjomn) deduce the directory path. std::string out_path = inference::analysis::GetMemoryCachePath( config_.model_dir(), config_.prog_file()); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 087bfbd002..e4c537f426 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -70,6 +70,7 @@ class AnalysisPredictor : public PaddlePredictor { void CreateFeedFetchVar(framework::Scope *scope); void PrepareFeedFetch(); + void PrepareArgument(); void OptimizeInferenceProgram(); Argument &analysis_argument() { return argument_; } @@ -83,6 +84,8 @@ class AnalysisPredictor : public PaddlePredictor { std::string GetSerializedProgram() const override; + bool MkldnnQuantize(); + protected: // For memory optimization. 
bool need_collect_var_shapes_for_memory_optim(); @@ -143,6 +146,16 @@ class AnalysisPredictor : public PaddlePredictor { std::vector<framework::OpDesc *> fetches_; std::map<size_t, std::string> idx2fetches_; +#if PADDLE_WITH_MKLDNN + // Helper class to perform quantization + class MkldnnQuantizer; + MkldnnQuantizer *mkldnn_quantizer_{nullptr}; + +#if PADDLE_WITH_TESTING + friend class MkldnnQuantizerTest; +#endif +#endif + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, wrong results and memory leak, so cache them. std::vector<framework::LoDTensor> feed_tensors_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 6696839b53..0429a287c7 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -17,9 +17,13 @@ #include <gtest/gtest.h> #include <thread> // NOLINT #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#endif DEFINE_string(dirname, "", "dirname to tests."); @@ -243,4 +247,241 @@ TEST(AnalysisPredictor, memory_optim) { inference::CompareResult(output, output1); } +#ifdef PADDLE_WITH_MKLDNN +class MkldnnQuantizerTest : public testing::Test { + public: + MkldnnQuantizerTest() { + AnalysisConfig config(FLAGS_dirname); + + predictor.reset(new AnalysisPredictor(config)); + auto* predictor_p = static_cast<AnalysisPredictor*>(predictor.get()); + + auto qconfig = std::make_shared<MkldnnQuantizerConfig>(); + + mkldnn_quantizer.reset( + new AnalysisPredictor::MkldnnQuantizer(*predictor_p, qconfig)); + } + + std::pair<std::vector<int>, float> Histogram( + const framework::LoDTensor& var_tensor, float min_val, float max_val, + int num_bins) const { + return mkldnn_quantizer->Histogram(var_tensor, min_val, max_val, num_bins); + } + + std::pair<bool, framework::LoDTensor> GetMaxScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetMaxScalingFactor(var_tensor, is_unsigned); + } + + std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned); + } + + std::pair<bool, framework::LoDTensor> GetKLScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetKLScalingFactor(var_tensor, is_unsigned); + } + + protected: + std::unique_ptr<PaddlePredictor> predictor; + std::unique_ptr<AnalysisPredictor::MkldnnQuantizer> mkldnn_quantizer; + float abs_error = 1e-6; + static const std::array<float, 10> non_negative_values; + static const std::array<float, 10> positive_and_negative_values; +}; + +const std::array<float, 10> MkldnnQuantizerTest::non_negative_values = { + 0.0158671, 0.026459, 0.0280772, 0.00962479, 0.0131628, + 0.016704, 0.00118407, 0.00765726, 0.0123213, 0.00944741}; +const std::array<float, 10> MkldnnQuantizerTest::positive_and_negative_values = + {-0.0482659, -0.0102493, -0.00794221, -0.00387115, -0.00674586, + -0.0495346, 0.0629528, -0.00531285, -0.0230353, 0.0269089}; + +TEST_F(MkldnnQuantizerTest, histogram_inverted_min_max) { + const auto& values = 
non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data<float>(platform::CPUPlace())); + + ASSERT_THROW(Histogram(var_tensor, max_val, min_val, 3), + platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, histogram_non_negative_to_3) { + // all non-negative values + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data<float>(platform::CPUPlace())); + + std::vector<int> histogram; + float bin_width; + + std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3); + + ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.f, abs_error) + << "Improperly calculated bin_width."; + + ASSERT_EQ(histogram[0], 4); + ASSERT_EQ(histogram[1], 4); + ASSERT_EQ(histogram[2], 2); +} + +TEST_F(MkldnnQuantizerTest, histogram_positive_and_negative_to_3) { + const auto& values = positive_and_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data<float>(platform::CPUPlace())); + + std::vector<int> histogram; + float bin_width; + + std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3); + + ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.0f, abs_error) + << "Improperly calculated bin_width."; + + ASSERT_EQ(histogram[0], 3); + ASSERT_EQ(histogram[1], 5); + ASSERT_EQ(histogram[2], 2); +} + +TEST_F(MkldnnQuantizerTest, histogram_zero_bins) { + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data<float>(platform::CPUPlace())); + + ASSERT_THROW(Histogram(var_tensor, min_val, max_val, 0), + platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, histogram_empty) { + // empty tensor + ASSERT_THROW(Histogram({}, -1, 1, 1), platform::EnforceNotMet); + + // zero tensor + framework::LoDTensor var_tensor; + var_tensor.Resize({0}); + ASSERT_TRUE(var_tensor.mutable_data<double>(platform::CPUPlace())); + + ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, kl_scaling_factor_signed) { + const auto& values = positive_and_negative_values; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data<float>(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, false); + + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / 0.0899106152344, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_signed) { + 
const auto& values = positive_and_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data<float>(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, false); + + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / max_val, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_unsigned) { + const auto& values = non_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data<float>(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / max_val, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_chwise_unsigned) { + const auto& values = non_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + int channels = 3; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(channels, 1, 1, values.size())); + for (int i = 0; i < channels; i++) + std::copy(begin(values), end(values), + var_tensor.mutable_data<float>(platform::CPUPlace()) + + i * values.size()); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxChScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), channels); + for (int i = 0; i < channels; i++) { + ASSERT_NEAR(lod_tensor.data<double>()[i], 1.0 / max_val, abs_error); + } +} + +TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) { + const auto& values = non_negative_values; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data<float>(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / 0.0252845321362, abs_error); +} +#endif + } // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc new file mode 100644 index 0000000000..de75e884f5 --- /dev/null +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -0,0 +1,437 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
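
The tests above pin down the expected behaviour of the helpers implemented in mkldnn_quantizer.cc below: a histogram over [min_val, max_val] with a fixed number of bins, a MAX rule that maps the largest absolute value to 1.0, and a KL rule checked against precomputed reference scales. A rough standalone sketch of the first two rules, assuming a plain std::vector<float> input (the names here are illustrative only, not the predictor code):

#include <algorithm>
#include <cmath>
#include <utility>
#include <vector>

// Histogram of `values` over [min_val, max_val], split into num_bins bins.
// Assumes num_bins > 0 and max_val > min_val, mirroring the enforce checks.
std::pair<std::vector<int>, float> SimpleHistogram(
    const std::vector<float>& values, float min_val, float max_val,
    size_t num_bins) {
  float bin_width = std::abs(max_val - min_val) / num_bins;
  std::vector<int> hist(num_bins, 0);
  for (float v : values) {
    size_t bin = std::min(num_bins - 1, static_cast<size_t>(std::floor(
                                            (v - min_val) / bin_width)));
    ++hist[bin];
  }
  return {hist, bin_width};
}

// MAX rule: scale chosen so the largest absolute value maps to 1.0.
// Assumes a non-empty tensor with at least one non-zero element.
float MaxScale(const std::vector<float>& values) {
  float max_abs = 0.0f;
  for (float v : values) max_abs = std::max(max_abs, std::abs(v));
  return 1.0f / max_abs;
}
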
+ +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#include <algorithm> +#include <map> +#include <numeric> +#include <unordered_map> +#include <utility> +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { + +using platform::CPUPlace; +using framework::LoDTensor; +using framework::ir::Graph; +using ConstEigenVectorArrayMap = + Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>; +using string::PrettyLogH1; + +bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { + PrettyLogH1("--- Calculating scales for quantization"); + using VariableNameMap = std::map<std::string, std::vector<std::string>>; + std::map<std::string, std::map<std::string, LoDTensor>> gathered_data; + for (const auto* op : predictor_.inference_program_->Block(0).AllOps()) { + if (op->HasAttr("use_quantizer") && + boost::get<bool>(op->GetAttr("use_quantizer"))) { + const VariableNameMap& connections_in = op->Inputs(); + const VariableNameMap& connections_out = op->Outputs(); + + auto glambda = [&](const VariableNameMap& connections, bool is_output) { + for (auto const& conn : connections) { + if (conn.second.size() == 0) continue; + auto& var_name = conn.second[0]; + + // skip if scale already computed + if (scales_.find(var_name) != scales_.end()) return; + + auto* var = predictor_.sub_scope_->FindVar(var_name); + PADDLE_ENFORCE(var, "%s is not in the scope", var_name); + PADDLE_ENFORCE(var->IsType<LoDTensor>(), + "Only support lod tensor now."); + LoDTensor* var_tensor = var->GetMutable<LoDTensor>(); + + // force unsigned type if already know it + bool is_unsigned = false; + if (is_output && op->Type() == "conv2d") { + // output of conv2d with relu must be unsigned + is_unsigned = op->HasAttr("fuse_relu") && + boost::get<bool>(op->GetAttr("fuse_relu")); + } else if (is_output && op->Type() == "pool2d") { + // output of pool2d with unsigned input must be unsigned + auto input_var_name = op->Input("X")[0]; + if (scales_.find(input_var_name) != scales_.end()) { + is_unsigned = scales_[input_var_name].first; + } + } + + CalculateSingleScale(op->Type(), conn.first, var_name, *var_tensor, + is_unsigned); + } + }; + + // handle outputs first so unsigned outputs could be inferred + glambda(connections_out, true /* is_output */); + glambda(connections_in, false /* is_output */); + } + } + + return true; +} + +void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale( + const std::string& op_type_name, const std::string& conn_name, + const std::string& var_name, const LoDTensor& var_tensor, + bool is_unsigned) { + auto rule = qconfig_->scale_algo(op_type_name, conn_name); + if (rule == ScaleAlgo::NONE) return; + + PADDLE_ENFORCE( + var_tensor.numel() > 0, + "MkldnnQuantizer: LoDTensor of variable %s for quantization of op " + "%s of connection %s should not be empty.", + var_name, op_type_name, conn_name); + + switch (rule) { + case ScaleAlgo::MAX: + scales_[var_name] = GetMaxScalingFactor(var_tensor, is_unsigned); + break; + case ScaleAlgo::MAX_CH: + scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned); + break; + case ScaleAlgo::KL: + 
scales_[var_name] = GetKLScalingFactor(var_tensor, is_unsigned); + break; + default: + throw std::runtime_error( + "MkldnnQuantizer: Unexpected ScaleAlgo specified."); + } +} + +std::vector<int> AnalysisPredictor::MkldnnQuantizer::ExpandQuantizedBins( + std::vector<int> quantized_bins, std::vector<int> reference_bins) const { + std::vector<int> expanded_quantized_bins(reference_bins.size(), 0); + int num_merged_bins = reference_bins.size() / quantized_bins.size(); + int j_start = 0; + int j_end = num_merged_bins; + for (size_t idx = 0; idx < quantized_bins.size(); idx++) { + int zero_count = + std::count(&reference_bins[j_start], &reference_bins[j_end], 0); + num_merged_bins = j_end - j_start; + int avg_bin_ele; + if (zero_count == num_merged_bins) { + avg_bin_ele = 0; + } else { + avg_bin_ele = quantized_bins[idx] / (num_merged_bins - zero_count + 0.0); + } + for (int idx1 = j_start; idx1 < j_end; idx1++) { + expanded_quantized_bins[idx1] = + (reference_bins[idx1] == 0) ? 0 : avg_bin_ele; + } + j_start += num_merged_bins; + j_end += num_merged_bins; + if ((idx + 1) == quantized_bins.size() - 1) { + j_end = reference_bins.size(); + } + } + return expanded_quantized_bins; +} + +std::pair<bool, LoDTensor> +AnalysisPredictor::MkldnnQuantizer::GetKLScalingFactor( + const LoDTensor& var_tensor, bool is_unsigned) const { + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(), + var_tensor.numel(), 1}; + int precision_hist_num_bins = 2048; + float max_val = eigen_tensor.maxCoeff(); + float min_val = eigen_tensor.minCoeff(); + bool is_positive = min_val >= 0.0f; + if (is_unsigned) + PADDLE_ENFORCE( + is_positive, + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val); + + int num_quantized_bins = 255; + + std::vector<int> hist; + float bin_width; + int starting_iter; + int ending_iter = precision_hist_num_bins - 1; + if (is_positive) { + std::tie(hist, bin_width) = + Histogram(var_tensor, min_val, max_val, precision_hist_num_bins); + starting_iter = static_cast<int>(ending_iter * 0.7); + } else { + float th = std::max(std::abs(max_val), std::abs(min_val)); + std::tie(hist, bin_width) = + Histogram(var_tensor, -th, th, precision_hist_num_bins); + starting_iter = 0; + if (std::abs(max_val) > std::abs(min_val)) { + while (starting_iter < ending_iter) { + if (hist[starting_iter] == 0) { + ++starting_iter; + continue; + } else { + break; + } + } + starting_iter += static_cast<int>((ending_iter - starting_iter) * 0.6); + } else { + while (ending_iter > 0) { + if (hist[ending_iter] == 0) { + --ending_iter; + continue; + } else { + break; + } + } + starting_iter = static_cast<int>(0.6 * ending_iter); + } + } + auto P_sum = eigen_tensor.size(); + int min_kl_divergence = 0; + int min_kl_index = 0; + bool kl_inited = false; + for (int i = starting_iter; i <= ending_iter; i++) { + std::vector<int> reference_distr_P(&hist[0], &hist[i]); + auto outliers_count = + std::accumulate(&hist[i], &hist[precision_hist_num_bins], 0); + if (reference_distr_P[i - 1] == 0) { + continue; + } + reference_distr_P[i - 1] += outliers_count; + auto reference_distr_bins = reference_distr_P; + std::vector<int> candidate_distr_Q(&hist[0], &hist[i]); + int num_merged_bins = i / num_quantized_bins; + std::vector<int> candidate_distr_Q_quantized(num_quantized_bins, 0); + int j_start = 0; + int j_end = num_merged_bins; + for (int idx = 0; idx < num_quantized_bins; idx++) { + candidate_distr_Q_quantized[idx] = std::accumulate( + &candidate_distr_Q[j_start], &candidate_distr_Q[j_end], 0); + j_start += 
num_merged_bins; + j_end += num_merged_bins; + if ((idx + 1) == num_quantized_bins - 1) { + j_end = i; + } + } + candidate_distr_Q = + ExpandQuantizedBins(candidate_distr_Q_quantized, reference_distr_bins); + int Q_sum = + std::accumulate(candidate_distr_Q.begin(), candidate_distr_Q.end(), 0); + auto kl_divergence = + SafeEntropy(reference_distr_P, P_sum, candidate_distr_Q, Q_sum); + if (!kl_inited) { + min_kl_divergence = kl_divergence; + min_kl_index = i; + kl_inited = true; + } else if (kl_divergence < min_kl_divergence) { + min_kl_divergence = kl_divergence; + min_kl_index = i; + } else { + } + } + if (min_kl_index == 0) { + while (starting_iter > 0) { + if (hist[starting_iter] == 0) { + starting_iter -= 1; + continue; + } else { + break; + } + } + min_kl_index = starting_iter; + } + + LoDTensor scale_tensor; + scale_tensor.Resize({1}); + auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace()); + + scale_ptr[0] = 1.0 / ((min_kl_index + 0.5) * bin_width); + + return std::make_pair(is_unsigned, scale_tensor); +} + +std::pair<bool, LoDTensor> +AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor( + const LoDTensor& var_tensor, bool is_unsigned) const { + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(), + var_tensor.numel(), 1}; + float max_abs = eigen_tensor.abs().maxCoeff(); + float min_val = eigen_tensor.minCoeff(); + if (is_unsigned) + PADDLE_ENFORCE( + min_val >= 0.0f, + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val); + + LoDTensor scale_tensor; + scale_tensor.Resize({1}); + auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace()); + scale_ptr[0] = 1.0 / max_abs; + + return std::make_pair(is_unsigned, scale_tensor); +} + +std::pair<bool, LoDTensor> +AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor( + const LoDTensor& var_tensor, bool is_unsigned) const { + PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty."); + + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(), + var_tensor.numel(), 1}; + float min_val = eigen_tensor.minCoeff(); + if (is_unsigned) + PADDLE_ENFORCE( + min_val >= 0.0f, + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val); + + int channels = var_tensor.dims()[0]; + LoDTensor scale_tensor; + scale_tensor.Resize({channels}); + auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace()); + + for (int i = 0; i < channels; ++i) { + const auto tensor = var_tensor.Slice(i, i + 1); + + ConstEigenVectorArrayMap eigen_tensor{tensor.data<float>(), tensor.numel(), + 1}; + float max_abs = eigen_tensor.abs().maxCoeff(); + scale_ptr[i] = 1.0 / max_abs; + } + + return std::make_pair(is_unsigned, scale_tensor); +} + +std::pair<std::vector<int>, float> +AnalysisPredictor::MkldnnQuantizer::Histogram( + const framework::LoDTensor& var_tensor, float min_val, float max_val, + size_t num_bins) const { + PADDLE_ENFORCE_GT(num_bins, 0, + "MkldnnQuantizer: To calculate Histogram, num_bins (" + + std::to_string(num_bins) + ") must be positive."); + PADDLE_ENFORCE_GT( + var_tensor.numel(), 0, + "MkldnnQuantizer: To calculate Histogram, the tensor must not be empty."); + PADDLE_ENFORCE(max_val >= min_val, + "MkldnnQuantizer: To calculate Histogram, max_val (" + + std::to_string(max_val) + + ") must be greater or equal" + "to min_val (" + + std::to_string(min_val) + ")."); + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(), + var_tensor.numel(), 1}; + auto bin_width = std::abs(max_val - min_val) / num_bins; + std::vector<int> hist(num_bins); + 
+ for (int i = 0; i < eigen_tensor.size(); i++) { + int bin = std::min( + num_bins - 1, + static_cast<size_t>(floor((eigen_tensor[i] - min_val) / bin_width))); + ++hist[bin]; + } + + return std::make_pair(std::move(hist), std::move(bin_width)); +} + +void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { + auto& arg = predictor_.argument_; + if (!arg.scope_valid()) arg.SetScope(new framework::Scope); + arg.SetMainProgramNotOwned(predictor_.inference_program_.get()); + auto graph = std::unique_ptr<Graph>(new Graph(arg.main_program())); + arg.SetMainGraph(graph.release()); + arg.main_graph().Set(framework::ir::kParamScopeAttr, + new framework::Scope*(arg.scope_ptr())); + + auto* builder = predictor_.config_.pass_builder(); + builder->SetPasses({ + "infer_clean_graph_pass", "cpu_quantize_pass", "cpu_quantize_squash_pass", + }); + if (predictor_.config_.ir_debug_) builder->TurnOnDebug(); + auto passes = builder->AllPasses(); + predictor_.argument_.SetIrAnalysisPasses(passes); + predictor_.argument_.SetAnalysisPasses( + {"ir_analysis_pass", "memory_optimize_pass", "ir_graph_to_program_pass"}); + predictor_.argument_.SetQuantVarScales(scales_); +} + +bool AnalysisPredictor::MkldnnQuantizer::Quantize() { + if (!RunWarmup()) return false; + if (!CalculateScales()) return false; + predictor_.PrepareScope(predictor_.scope_); + predictor_.CreateExecutor(); + if (!RunQuantizePasses()) return false; + predictor_.PrepareExecutor(); + predictor_.PrepareFeedFetch(); + return true; +} + +bool AnalysisPredictor::MkldnnQuantizer::RunQuantizePasses() const { + predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0, true, + predictor_.sub_scope_); + PrepareArgument(); + auto& arg = predictor_.argument_; + Analyzer().Run(&arg); + PADDLE_ENFORCE(arg.scope_valid()); + VLOG(5) << "to prepare executor"; + ARGUMENT_CHECK_FIELD((&arg), ir_analyzed_program); + predictor_.inference_program_.reset( + new framework::ProgramDesc(arg.ir_analyzed_program())); + LOG(INFO) << "== optimize 2 end =="; + predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0, + false, predictor_.sub_scope_); + return true; +} + +bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const { + VLOG(3) << "Predictor: run a quantization warmup iteration"; + auto warmup_data = qconfig_->warmup_data(); + PADDLE_ENFORCE_NOT_NULL(warmup_data, + "Warmup data cannot be NULL in the config."); + PrettyLogH1("--- Running warmup iteration for quantization"); + + // Run the inference program + std::vector<PaddleTensor> output_slots; + predictor_.Run(*warmup_data, &output_slots, qconfig_->warmup_batch_size()); + + return true; +} + +float AnalysisPredictor::MkldnnQuantizer::SafeEntropy( + std::vector<int> reference_distr_P, int P_sum, + std::vector<int> candidate_distr_Q, int Q_sum) const { + PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size()); + float tmp_sum1 = 0; + float tmp_sum2 = 0; + for (size_t idx = 0; idx < reference_distr_P.size(); idx++) { + int p_idx = reference_distr_P[idx]; + int q_idx = candidate_distr_Q[idx]; + if (p_idx == 0) { + tmp_sum1 += 0; + tmp_sum2 += 0; + } else { + PADDLE_ENFORCE(q_idx != 0, "MkldnnQuantizer: Fatal error!, idx = " + + std::to_string(idx) + + " qindex = 0! 
p_idx = " + + std::to_string(p_idx)); + } + tmp_sum1 += p_idx * (log(Q_sum * p_idx)); + tmp_sum2 += p_idx * (log(P_sum * q_idx)); + } + return (tmp_sum1 - tmp_sum2) / P_sum; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/mkldnn_quantizer.h new file mode 100644 index 0000000000..f4b0df5d74 --- /dev/null +++ b/paddle/fluid/inference/api/mkldnn_quantizer.h @@ -0,0 +1,104 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include <algorithm> +#include <map> +#include <memory> +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/string/printf.h" +#ifdef PADDLE_WITH_TESTING +#include <gtest/gtest.h> +#include <gtest/gtest_prod.h> +#endif + +namespace paddle { + +/* + * Map variable name to tensor of scaling factors scaling it to MAX=1.0. + * bool denotes whether quantization of the variable should be done to unsigned + * type. + */ +using VarQuantScale = + std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>; + +class AnalysisPredictor::MkldnnQuantizer { + public: + explicit MkldnnQuantizer( + AnalysisPredictor& predictor, // NOLINT + const std::shared_ptr<MkldnnQuantizerConfig>& qconfig) + : predictor_(predictor), qconfig_(qconfig) {} + + // Execute full quantization procedure. + bool Quantize(); + +#if PADDLE_WITH_TESTING + friend class MkldnnQuantizerTest; +#endif + + private: + // Run single warmup iteration + bool RunWarmup() const; + // Gather data from variables and calculate scales for them. + bool CalculateScales(); + // Calculate a scale for tensor based on ScaleAlgo rules. + void CalculateSingleScale(const std::string& op_name, + const std::string& conn_name, + const std::string& var_name, + const framework::LoDTensor& var_tensor, + bool is_unsigned); + void PrepareArgument() const; + bool RunQuantizePasses() const; + + std::vector<int> ExpandQuantizedBins(std::vector<int> quantized_bins, + std::vector<int> reference_bins) const; + + // Using the KL-divergence method get the most precise scaling factor. 
+ std::pair<bool, framework::LoDTensor> GetKLScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const; + + std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const; + + std::pair<bool, framework::LoDTensor> GetMaxScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const; + + // Returns histogram and bin width + std::pair<std::vector<int>, float> Histogram( + const framework::LoDTensor& var_tensor, float min_val, float max_val, + size_t num_bins = 2048) const; + + // Calculate the entropy. + float SafeEntropy(std::vector<int> reference_distr_P, int P_sum, + std::vector<int> candidate_distr_Q, int Q_sum) const; + + private: + AnalysisPredictor& predictor_; + const std::shared_ptr<MkldnnQuantizerConfig> qconfig_; + + // A map: variable name -> scale + VarQuantScale scales_; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc new file mode 100644 index 0000000000..f9ff542d86 --- /dev/null +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h" + +namespace paddle { + +MkldnnQuantizerConfig::MkldnnQuantizerConfig() { + // The default configuration of scale computing algorightms + rules_["conv2d"]["Input"] = ScaleAlgo::KL; + rules_["conv2d"]["Filter"] = ScaleAlgo::MAX_CH; + rules_["conv2d"]["Bias"] = ScaleAlgo::NONE; // do not compute scale + rules_["conv2d"]["ResidualData"] = ScaleAlgo::KL; + rules_["conv2d"]["Output"] = ScaleAlgo::KL; // do not compute scale + + rules_["pool2d"]["X"] = ScaleAlgo::KL; + rules_["pool2d"]["Out"] = ScaleAlgo::KL; // do not compute scale +} + +ScaleAlgo MkldnnQuantizerConfig::scale_algo( + const std::string& op_type_name, const std::string& conn_name) const { + if (rules_.find(op_type_name) != rules_.end()) { + auto op_rule = rules_.at(op_type_name); + if (op_rule.find(conn_name) != op_rule.end()) return op_rule.at(conn_name); + } + return default_scale_algo_; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 23df507aa6..2ad4add294 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -27,10 +27,14 @@ // the abstract path of this header file will be changed. #include "paddle_api.h" // NOLINT #include "paddle_pass_builder.h" // NOLINT +#ifdef PADDLE_WITH_MKLDNN +#include "paddle_mkldnn_quantizer_config.h" // NOLINT +#endif namespace paddle { class AnalysisPredictor; +struct MkldnnQuantizerConfig; // NOTE WIP, not stable yet. struct AnalysisConfig { @@ -186,6 +190,16 @@ struct AnalysisConfig { mkldnn_enabled_op_types_ = op_list; } + /** Turn on quantization. 
+ */ + void EnableMkldnnQuantizer(); + + /** A boolean state telling whether the quantization is enabled. + */ + bool mkldnn_quantizer_enabled() const { return use_mkldnn_quantizer_; } + + std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config() const; + /** Specify the memory buffer of program and parameter * @param prog_buffer the memory buffer of program. * @param prog_buffer_size the size of the data. @@ -271,10 +285,14 @@ struct AnalysisConfig { std::string serialized_info_cache_; mutable std::unique_ptr<PassStrategy> pass_builder_; + bool use_anakin_{false}; int anakin_max_batchsize_; std::map<std::string, std::vector<int>> anakin_max_input_shape_; std::map<std::string, std::string> engine_opt_info_; + + bool use_mkldnn_quantizer_{false}; + std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h new file mode 100644 index 0000000000..d46f842de7 --- /dev/null +++ b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include <cassert> +#include <map> +#include <memory> +#include <string> +#include <unordered_set> +#include <vector> + +#include "paddle_api.h" // NOLINT + +namespace paddle { + +// Algorithms for finding scale of quantized Tensors. +enum class ScaleAlgo { + NONE, // Do not compute scale + MAX, // Find scale based on the maximum absolute value + MAX_CH, // Find scale based on the maximum absolute value per channel + KL, // Find scale based on KL Divergence +}; + +struct MkldnnQuantizerConfig { + MkldnnQuantizerConfig(); + + /** Specify a quantization algorithm for a connection (input/output) of the + * operator type. + * @param op_type_name the operator's name. + * @param conn_name name of the connection (input/output) of the operator. + * @param algo the algorithm for computing scale. + */ + void SetScaleAlgo(std::string op_type_name, std::string conn_name, + ScaleAlgo algo) { + rules_[op_type_name][conn_name] = algo; + } + + /** Get the quantization algorithm for a connection (input/output) of the + * operator type. + * @param op_type_name the operator's name. + * @param conn_name name of the connection (input/output) of the operator. + * @return the algorithm for computing scale. + */ + ScaleAlgo scale_algo(const std::string& op_type_name, + const std::string& conn_name) const; + + /** Set the batch of data to be used for warm-up iteration. + * @param data batch of data. + */ + void SetWarmupData(std::shared_ptr<std::vector<PaddleTensor>> data) { + warmup_data_ = data; + } + + /** Get the batch of data used for warm-up iteration. + * @return batch of data. 
+ */ + std::shared_ptr<std::vector<PaddleTensor>> warmup_data() const { + return warmup_data_; + } + + void SetWarmupBatchSize(int batch_size) { warmup_bs_ = batch_size; } + + int warmup_batch_size() const { return warmup_bs_; } + + void SetEnabledOpTypes(std::unordered_set<std::string> op_list) { + enabled_op_types_ = op_list; + } + + const std::unordered_set<std::string>& enabled_op_types() const { + return enabled_op_types_; + } + + void SetExcludedOpIds(std::unordered_set<int> op_ids_list) { + excluded_op_ids_ = op_ids_list; + } + + const std::unordered_set<int>& excluded_op_ids() const { + return excluded_op_ids_; + } + + void SetDefaultScaleAlgo(ScaleAlgo algo) { default_scale_algo_ = algo; } + + ScaleAlgo default_scale_algo() const { return default_scale_algo_; } + + protected: + std::map<std::string, std::map<std::string, ScaleAlgo>> rules_; + std::unordered_set<std::string> enabled_op_types_; + std::unordered_set<int> excluded_op_ids_; + std::shared_ptr<std::vector<PaddleTensor>> warmup_data_; + int warmup_bs_{1}; + ScaleAlgo default_scale_algo_{ScaleAlgo::MAX}; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 35dd117671..8ec32b3a0b 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -107,8 +107,8 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { use_gpu_ = true; } -void GpuPassStrategy::EnableQuantizer() { - LOG(ERROR) << "GPU not support quantization yet"; +void GpuPassStrategy::EnableMkldnnQuantizer() { + LOG(ERROR) << "GPU not support MKL-DNN quantization"; } void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 65403e790e..de60185eb3 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -30,6 +30,10 @@ class PaddlePassBuilder { explicit PaddlePassBuilder(const std::vector<std::string> &passes) : passes_(passes) {} + void SetPasses(std::initializer_list<std::string> passes) { + passes_ = passes; + } + /** Append a pass to the end of the passes. */ void AppendPass(const std::string &pass_type); @@ -85,9 +89,9 @@ class PassStrategy : public PaddlePassBuilder { */ virtual void EnableMKLDNN() {} - /** Enable quantize optimization + /** Enable MKLDNN quantize optimization */ - virtual void EnableQuantizer() {} + virtual void EnableMkldnnQuantizer() {} bool use_gpu() const { return use_gpu_; } @@ -130,15 +134,19 @@ class CpuPassStrategy : public PassStrategy { #endif } - void EnableQuantizer() override { - if (!use_quantizer_) { + void EnableMkldnnQuantizer() override { +#ifdef PADDLE_WITH_MKLDNN + if (!use_mkldnn_quantizer_) { passes_.push_back("cpu_quantize_placement_pass"); } - use_quantizer_ = true; + use_mkldnn_quantizer_ = true; +#else + use_mkldnn_quantizer_ = false; +#endif } protected: - bool use_quantizer_{false}; + bool use_mkldnn_quantizer_{false}; }; /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode. 
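
Taken together, the pieces added in this patch give a configuration surface that could be exercised roughly as follows. This is a sketch only: the model path is a placeholder, the include path is the in-tree one, and it assumes mkldnn_quantizer_config() hands back the default-constructed MkldnnQuantizerConfig so its rules can be adjusted before the predictor is built.

#include <memory>
#include <vector>
#include "paddle/fluid/inference/api/paddle_analysis_config.h"

void ConfigureInt8Predictor() {
  paddle::AnalysisConfig config("/path/to/model_dir");  // placeholder path
  config.EnableMKLDNN();
  config.EnableMkldnnQuantizer();

  auto qconfig = config.mkldnn_quantizer_config();

  // One batch of representative inputs drives the warm-up run that gathers
  // the tensors used for scale calculation.
  auto warmup = std::make_shared<std::vector<paddle::PaddleTensor>>();
  // ... fill `warmup` with a single batch of PaddleTensor inputs ...
  qconfig->SetWarmupData(warmup);
  qconfig->SetWarmupBatchSize(1);

  // Override one of the built-in rules if desired.
  qconfig->SetScaleAlgo("conv2d", "Input", paddle::ScaleAlgo::MAX);

  // `config` is then handed to the usual predictor factory.
}

The defaults set in the MkldnnQuantizerConfig constructor (KL for conv2d/pool2d data, MAX_CH for filters, NONE for bias) apply wherever no explicit rule is given, with default_scale_algo_ (MAX) as the fallback for connections not listed at all.
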
@@ -153,7 +161,7 @@ class GpuPassStrategy : public PassStrategy { } void EnableMKLDNN() override; - void EnableQuantizer() override; + void EnableMkldnnQuantizer() override; virtual ~GpuPassStrategy() = default; }; From 2d8b7b3a766c2aa707a1f27d2901bd9b75d98f1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Gallus?= <Sand3r-@users.noreply.github.com> Date: Wed, 27 Mar 2019 21:42:53 +0100 Subject: [PATCH 08/19] Refine default MKL-DNN Pass order (#16490) * Refine default MKL-DNN Pass order test=develop * Add comment to default MKL-DNN Pass list test=develop --- paddle/fluid/inference/api/paddle_pass_builder.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index de60185eb3..48da8c156f 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -121,6 +121,8 @@ class CpuPassStrategy : public PassStrategy { for (auto &pass : std::vector<std::string>( {"depthwise_conv_mkldnn_pass", // + "conv_bn_fuse_pass", // Execute BN passes again to + "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order "conv_bias_mkldnn_fuse_pass", // "conv3d_bias_mkldnn_fuse_pass", // "conv_relu_mkldnn_fuse_pass", // From b1d2605152e70acc1ba3d82dd693dcc47d128390 Mon Sep 17 00:00:00 2001 From: baojun <32073718+baojun-nervana@users.noreply.github.com> Date: Wed, 27 Mar 2019 18:05:01 -0700 Subject: [PATCH 09/19] fix compile issue test=develop (#16447) --- cmake/external/ngraph.cmake | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index e7fb69dbbc..23998b497e 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -57,20 +57,25 @@ SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) ExternalProject_Add( ${NGRAPH_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} - DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT} - GIT_REPOSITORY ${NGRAPH_GIT_REPO} - GIT_TAG ${NGRAPH_GIT_TAG} - PREFIX ${NGRAPH_SOURCES_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR} - CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE - CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE - CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE - CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE - CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} - CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR} - CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib + DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT} + GIT_REPOSITORY ${NGRAPH_GIT_REPO} + GIT_TAG ${NGRAPH_GIT_TAG} + PREFIX ${NGRAPH_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_GENERATOR ${CMAKE_GENERATOR} + CMAKE_GENERATOR_PLATFORM ${CMAKE_GENERATOR_PLATFORM} + CMAKE_GENERATOR_TOOLSET ${CMAKE_GENERATOR_TOOLSET} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR} + CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} + CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR} + CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib ) add_dependencies(ngraph ${NGRAPH_PROJECT}) From eb83abeac3c0146b921ce72d06fef2551ab3e8d8 Mon 
Sep 17 00:00:00 2001 From: gongweibao <weibao.gong@gmail.com> Date: Thu, 28 Mar 2019 09:23:47 +0800 Subject: [PATCH 10/19] Add DGC(Deep Gradient Compression) interface. (#15841) --- CMakeLists.txt | 6 + cmake/external/dgc.cmake | 42 +++ cmake/inference_lib.cmake | 9 + cmake/operators.cmake | 2 +- paddle/fluid/API.spec | 5 + paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../framework/details/all_reduce_deps_pass.cc | 7 +- .../framework/details/all_reduce_op_handle.cc | 200 ++++++++++++- .../framework/details/all_reduce_op_handle.h | 16 +- .../details/multi_devices_graph_pass.cc | 33 ++- .../details/multi_devices_graph_pass.h | 5 +- paddle/fluid/framework/details/var_handle.cc | 3 +- paddle/fluid/framework/op_desc.cc | 6 + paddle/fluid/framework/op_desc.h | 1 + paddle/fluid/framework/operator.cc | 5 +- paddle/fluid/inference/CMakeLists.txt | 5 + paddle/fluid/operators/CMakeLists.txt | 8 +- paddle/fluid/operators/clip_by_norm_op.cc | 61 +--- paddle/fluid/operators/clip_by_norm_op.h | 54 ++++ paddle/fluid/operators/dgc_clip_by_norm_op.cc | 67 +++++ paddle/fluid/operators/dgc_clip_by_norm_op.cu | 20 ++ paddle/fluid/operators/dgc_clip_by_norm_op.h | 46 +++ paddle/fluid/operators/dgc_op.cc | 138 +++++++++ paddle/fluid/operators/dgc_op.cu | 20 ++ paddle/fluid/operators/dgc_op.h | 132 +++++++++ paddle/fluid/platform/CMakeLists.txt | 6 +- paddle/fluid/platform/assert.h | 14 +- paddle/fluid/platform/device_context.cc | 15 +- paddle/fluid/platform/init.cc | 18 ++ paddle/fluid/platform/init.h | 2 + paddle/fluid/pybind/protobuf.cc | 1 + paddle/fluid/pybind/pybind.cc | 1 + python/paddle/fluid/framework.py | 16 ++ python/paddle/fluid/optimizer.py | 272 +++++++++++++++++- python/paddle/fluid/parallel_executor.py | 6 + .../fluid/tests/unittests/CMakeLists.txt | 8 +- .../fluid/tests/unittests/dist_mnist.py | 8 +- .../fluid/tests/unittests/dist_se_resnext.py | 20 +- .../fluid/tests/unittests/test_dgc_op.py | 138 +++++++++ .../fluid/tests/unittests/test_dist_base.py | 16 +- .../fluid/tests/unittests/test_dist_mnist.py | 14 + .../tests/unittests/test_dist_se_resnext.py | 15 + 42 files changed, 1363 insertions(+), 100 deletions(-) create mode 100644 cmake/external/dgc.cmake create mode 100644 paddle/fluid/operators/dgc_clip_by_norm_op.cc create mode 100644 paddle/fluid/operators/dgc_clip_by_norm_op.cu create mode 100644 paddle/fluid/operators/dgc_clip_by_norm_op.h create mode 100644 paddle/fluid/operators/dgc_op.cc create mode 100644 paddle/fluid/operators/dgc_op.cu create mode 100644 paddle/fluid/operators/dgc_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_dgc_op.py diff --git a/CMakeLists.txt b/CMakeLists.txt index a38e32b73d..9ad69738eb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -193,6 +193,12 @@ if(WITH_GPU) include(tensorrt) include(anakin_subgraph) endif() + +if(WITH_GPU AND NOT WIN32) + message(STATUS "add dgc lib.") + include(external/dgc) +endif() + if(WITH_MKL OR WITH_MKLML) include(external/anakin) elseif() diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake new file mode 100644 index 0000000000..199ca88b47 --- /dev/null +++ b/cmake/external/dgc.cmake @@ -0,0 +1,42 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc") +SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc") +SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE) +SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE) +INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR}) + +ExternalProject_Add( + extern_dgc + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/PaddlePaddle/Fleet" + GIT_TAG "2d04dc3800cdd0601f1b65d547dabcc60b0cf9dc" + SOURCE_DIR "${DGC_SOURCES_DIR}" + CONFIGURE_COMMAND "" + BUILD_COMMAND cd collective && make -j + INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/ ${DGC_INCLUDE_DIR}/dgc + && cp ${DGC_SOURCES_DIR}/collective/build/lib/libdgc.a ${DGC_LIBRARIES} + && cp ${DGC_SOURCES_DIR}/collective/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/ + BUILD_IN_SOURCE 1 +) + +ADD_LIBRARY(dgc SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES}) +ADD_DEPENDENCIES(dgc extern_dgc) + +LIST(APPEND external_project_dependencies dgc) + diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index a7dce4dfdb..b7c32f80db 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -131,6 +131,15 @@ elseif (NOT CBLAS_FOUND OR WIN32) ) endif () +if (WITH_GPU AND NOT WIN32) + set(dgc_dir "${FLUID_INSTALL_DIR}/third_party/install/dgc") + copy(dgc_lib + SRCS ${DGC_INSTALL_DIR}/lib ${DGC_INSTALL_DIR}/include + DSTS ${dgc_dir} ${dgc_dir} + DEPS dgc) +endif() + + if (WITH_MKLDNN) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") copy(mkldnn_lib diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 34c6cbd73d..c17e718f42 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -110,7 +110,7 @@ function(op_library TARGET) # Define operators that don't need pybind here. 
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" -"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op") +"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 851308a0f6..e6f5cb7473 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -483,6 +483,11 @@ paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['sel paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.DGCMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'rampup_begin_step', 'rampup_step', 'sparsity', 'use_nesterov', 'local_grad_clip_norm', 'num_trainers', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1, [0.999], False, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DGCMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.DGCMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1a79bd7d10ae54ca763ec81bca36ba24')) paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 77e94e998c..046ec6978a 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -23,7 +23,7 @@ endif() if(WITH_GPU) nv_library(all_reduce_op_handle SRCS 
all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory - dynload_cuda variable_visitor) + dynload_cuda variable_visitor dgc) nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda variable_visitor) if(WITH_DISTRIBUTE) diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc index c084410864..98a74d630c 100644 --- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc @@ -86,7 +86,8 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl( } } - VLOG(10) << "dist_ops size:" << dist_ops.size() << std::endl; + VLOG(10) << "dist_ops size:" << dist_ops.size() + << ", outputs size:" << vars.size() << ", ops size:" << ops.size(); std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1, OpHandleBase* op2) { @@ -99,6 +100,10 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl( auto l_it = vars.find(i0->name()); auto r_it = vars.find(i1->name()); + PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(), + "can't find var's name %s and %s in opdesc", i0->name(), + i1->name()); + if (l_it->second < r_it->second) return true; if (l_it->second == r_it->second) { diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index fdaff08e53..6e477cd297 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -16,6 +16,13 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/framework/operator.h" + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "dgc/dgc.h" +#endif + +#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" // asynchronous nccl allreduce or synchronous issue: @@ -33,11 +40,14 @@ namespace details { AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes, const std::vector<platform::Place> &places, - const platform::NCCLContextMap *ctxs) + const platform::NCCLContextMap *ctxs, + bool is_encoded, int nranks) : OpHandleBase(node), local_scopes_(local_scopes), places_(places), - nccl_ctxs_(ctxs) { + nccl_ctxs_(ctxs), + is_encoded_(is_encoded), + nranks_(nranks) { if (nccl_ctxs_) { for (auto &p : places_) { this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p)); @@ -51,7 +61,185 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} #endif +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +void AllReduceOpHandle::RunImplEncoded() { + platform::RecordEvent record_event(Name()); + + WaitInputVarGenerated(); + + auto in_var_handles = DynamicCast<VarHandle>(this->Inputs()); + auto out_var_handles = DynamicCast<VarHandle>(this->Outputs()); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + + std::vector<const LoDTensor *> ins; + std::vector<LoDTensor *> outs; + int k = -1; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &local_scope = + local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope 
*>(); + auto original_name = + paddle::framework::GradOriginalVarName(in_var_handles[i]->name()); + auto encode_var_name = original_name + g_dgc_encoded; + auto *in_var = local_scope->FindVar(encode_var_name); + PADDLE_ENFORCE_NOT_NULL(in_var); + auto &in = in_var->Get<LoDTensor>(); + ins.emplace_back(&in); + + auto *out = local_scope->FindVar(out_var_handles[i]->name()) + ->GetMutable<LoDTensor>(); + outs.emplace_back(out); + + if (k < 0) { + k = GetKValue(in_var_handles[i]->name()); + } + } + + PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place())); + PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place())); + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + + int dtype = -1; + size_t in_numel = 0; + size_t out_numel = 0; + PADDLE_ENFORCE(nranks_ > 1); + std::vector<std::function<void()>> all_reduce_calls; + + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &place = places_[i]; + auto &in = *ins[i]; + void *in_tensor_buf = const_cast<void *>(in.data<void>()); + + auto &out = *outs[i]; + float *out_tensor_buf = out.data<float>(); + + dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype; + in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel; + PADDLE_ENFORCE(in_numel % 2 == 0); + PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k)); + out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel; + + int dev_id = boost::get<platform::CUDAPlace>(place).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(place, stream); + int encode_size = 2 * k * sizeof(int); + // dgc use ncclAllGather to get all the encoded data + // so the buffer need nranks. 
+ int buf_size = nranks_ * encode_size; + auto tmp_ious_data = allocator.Allocate(buf_size); + void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr()); + + VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel + << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size + << ", k:" << k << ", place:" << place << ", dtype:" << dtype; + + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce( + in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm, + stream)); + }); + } + + this->RunAndRecordEvent([&] { + if (all_reduce_calls.size() == 1UL) { + // Do not use NCCLGroup when manage NCCL by per thread per device + all_reduce_calls[0](); + } else { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + } + }); + + if (FLAGS_sync_nccl_allreduce) { + for (auto &p : places_) { + int dev_id = boost::get<platform::CUDAPlace>(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + cudaError_t e_sync = cudaStreamSynchronize(stream); + if (e_sync != 0) { + LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync); + } + + cudaError_t e_get = cudaGetLastError(); + if (e_get != 0) { + LOG(FATAL) << "cudaGetLastError " << cudaGetErrorString(e_get) + << " errno:" << e_get; + } + } + } +} + +int AllReduceOpHandle::GetKValue(const std::string &grad_name) { + auto original_name = paddle::framework::GradOriginalVarName(grad_name); + auto var_name = original_name + g_dgc_k; + PADDLE_ENFORCE(local_scopes_.size() > 0); + + auto *scope = local_scopes_[0]; + auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>(); + auto var = local_scope->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var); + auto tensor = var->Get<LoDTensor>().data<float>(); + return *tensor; +} +#endif + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +bool AllReduceOpHandle::IsEncoded() { + if (!is_encoded_) { + return false; + } + auto counter_name = g_dgc_counter_name; + auto step_name = g_dgc_rampup_begin_step; + PADDLE_ENFORCE(local_scopes_.size() > 0); + + auto *scope = local_scopes_[0]; + auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>(); + auto count_var = local_scope->FindVar(counter_name); + auto step_var = local_scope->FindVar(step_name); + if (count_var == nullptr || step_var == nullptr) { + PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name, + step_var); + } + + float count = *count_var->Get<LoDTensor>().data<float>(); + float step = *step_var->Get<LoDTensor>().data<float>(); + if (static_cast<int>(count) < static_cast<int>(step)) { + VLOG(10) << "in all_reduce currentstep:" << count + << " < rampup_begin_step:" << step + << " so not use sparse all reduce"; + return false; + } + + return true; +} +#else +bool AllReduceOpHandle::IsEncoded() { return false; } +#endif + void AllReduceOpHandle::RunImpl() { + if (!IsEncoded()) { + RunImplNormal(); + return; + } + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + RunImplEncoded(); +#else + PADDLE_THROW("Not compiled with CUDA"); +#endif +} + +void AllReduceOpHandle::RunImplNormal() { platform::RecordEvent record_event(Name()); WaitInputVarGenerated(); @@ -72,6 +260,8 @@ void AllReduceOpHandle::RunImpl() { auto &lod_tensor = local_scope.FindVar(in_var_handles[i]->name())->Get<LoDTensor>(); lod_tensors.emplace_back(&lod_tensor); + VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name() + << ", out_name:" << out_var_handles[i]->name(); 
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(), "The name of input and output should be equal."); } @@ -99,13 +289,17 @@ void AllReduceOpHandle::RunImpl() { auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; + + VLOG(10) << "before all reduce buffer:" << buffer << ", numel:" << numel + << ", dev_id:" << dev_id << ", dtype:" << dtype + << ", place:" << p; + all_reduce_calls.emplace_back([=] { PADDLE_ENFORCE(platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum, comm, stream)); }); } - this->RunAndRecordEvent([&] { if (all_reduce_calls.size() == 1UL) { // Do not use NCCLGroup when manage NCCL by per thread per device diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index b449796fca..ca75186f6c 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -28,11 +28,19 @@ namespace paddle { namespace framework { namespace details { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +constexpr char g_dgc_counter_name[] = "__g_dgc_counter__"; +constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__"; +constexpr char g_dgc_encoded[] = "__dgc_encoded__"; +constexpr char g_dgc_k[] = "__dgc_k__"; +#endif + struct AllReduceOpHandle : public OpHandleBase { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes, const std::vector<platform::Place> &places, - const platform::NCCLContextMap *ctxs); + const platform::NCCLContextMap *ctxs, + bool is_encoded = false, int nranks = -1); #else AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes, const std::vector<platform::Place> &places); @@ -50,8 +58,14 @@ struct AllReduceOpHandle : public OpHandleBase { std::vector<Scope *> local_scopes_; std::vector<platform::Place> places_; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + void RunImplEncoded(); const platform::NCCLContextMap *nccl_ctxs_; + bool is_encoded_{false}; + int nranks_{-1}; + int GetKValue(const std::string &grad_name); #endif + void RunImplNormal(); + bool IsEncoded(); }; } // namespace details diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 253cf5b4a8..8c61684c9c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -32,6 +32,7 @@ #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/math/math_function.h" namespace paddle { namespace framework { @@ -209,7 +210,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( for (size_t i = 0; i < backward_vars.size(); i += 2) { auto &p_name = backward_vars[i]; auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name + << " op_type " << node->Op()->Type(); if (NeedCollectiveForGrad(g_name, sorted_ops)) { InsertCollectiveOp(&result, p_name, g_name); } @@ -414,8 +416,9 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, CreateOpHandleIOs(result, node, dev_id); } -void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( - ir::Graph *result, const std::string &og) 
const { +void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, + const std::string &og, + bool is_encoded) const { OpHandleBase *op_handle = nullptr; auto append_allreduce_op = [&]( @@ -424,7 +427,9 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), - scopes, places, nccl_ctxs_)); + scopes, places, nccl_ctxs_, is_encoded, + static_cast<int>(strategy_.trainers_endpoints_.size()) * + places_.size())); #else result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -446,12 +451,15 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad); + VLOG(10) << "all_reduce_op_handle add input " << prev_grad->DebugString(); auto var = new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), vars.size(), i, og, places_[i]); vars.emplace_back(var); op_handle->AddOutput(var); + VLOG(10) << "all_reduce_op_handle add output " << og + << ", handle:" << var->DebugString(); } } @@ -941,6 +949,17 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, return op_dev_id; } +bool DistSSAGraphBuilder::IsEncoded(const std::string &p_name) const { + auto u_name = p_name + "__dgc_u__"; + auto it = all_vars_.find(u_name); + if (it == all_vars_.end()) { + VLOG(10) << "can't find u_name, so it's not encoded:" << u_name; + return false; + } + + return true; +} + void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, const std::string &p_name, const std::string &g_name) const { @@ -956,7 +975,11 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, CreateReduceOp(result, g_name, 0); CreateBroadcastOp(result, g_name, 0); } else { - CreateAllReduceOp(result, g_name); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + CreateAllReduceOp(result, g_name, IsEncoded(p_name)); +#else + PADDLE_ENFORCE(false, "Compiled withoud cuda!"); +#endif } break; default: diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 0ee3a06062..8bfd7b9bf8 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -75,7 +75,8 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { bool IsSparseGradient(const std::string &og) const; - void CreateAllReduceOp(ir::Graph *result, const std::string &og) const; + void CreateAllReduceOp(ir::Graph *result, const std::string &og, + bool is_encoded = false) const; void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const; @@ -171,6 +172,8 @@ class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder { mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_; mutable bool need_broadcast_var_{false}; + + bool IsEncoded(const std::string &p_name) const; }; std::unordered_set<std::string> &MultiDevSSAGraphBuilder(); diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc index 30da029ca2..95d62e6641 100644 --- a/paddle/fluid/framework/details/var_handle.cc +++ b/paddle/fluid/framework/details/var_handle.cc @@ -24,7 +24,8 @@ VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); } std::string VarHandle::DebugString() 
const { std::stringstream ss; - ss << name_ << ":" << place_; + ss << "name:" << name_ << ", place:" << place_ << ", version:" << version_ + << ", scope_idx:" << scope_idx_; return ss.str(); } diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 8f9c6cb5e9..353db43521 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -373,6 +373,11 @@ std::vector<std::string> OpDesc::AttrNames() const { return retv; } +void OpDesc::RemoveAttr(const std::string &name) { + attrs_.erase(name); + need_update_ = true; +} + void OpDesc::SetAttr(const std::string &name, const Attribute &v) { // NOTICE(minqiyang): pybind11 will take the empty list in python as // the std::vector<int> type in C++; so we have to change the attr's type @@ -644,6 +649,7 @@ void OpDesc::CheckAttrs() { // not by users. return; } + VLOG(10) << "begin to check attribute of " << Type(); checker->Check(&attrs_); } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index d7352c5ee5..dedaf24364 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -72,6 +72,7 @@ class OpDesc { std::vector<std::string> AttrNames() const; void SetAttr(const std::string &name, const Attribute &v); + void RemoveAttr(const std::string &name); void SetBlockAttr(const std::string &name, BlockDesc *block); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index eef84d17a4..b0ac73f9f5 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1110,8 +1110,9 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( proto::VarType::Type tmp = t->type(); PADDLE_ENFORCE( tmp == data_type || data_type == dafault_data_type, - "DataType of Paddle Op %s must be the same. Get (%d) != (%d)", - Type(), DataTypeToString(data_type), DataTypeToString(tmp)); + "DataType of Paddle Op %s %s must be the same. 
Get (%d) != (%d)", + Type(), input.first, DataTypeToString(data_type), + DataTypeToString(tmp)); data_type = tmp; } } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 5e0be5d445..fb433ff2a2 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -49,6 +49,11 @@ set(SHARED_INFERENCE_SRCS ${mkldnn_quantizer_src} ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc) +# FIXME(gongwb): hidden libdgc.a +if(WITH_GPU AND NOT WIN32) + set(fluid_modules ${fluid_modules} dgc) +endif() + if(WIN32) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index afac8e4d2a..e52e83673f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -48,7 +48,7 @@ if (WITH_DISTRIBUTE) SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch) endif() -register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op sync_batch_norm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) +register_operators(EXCLUDES py_func_op warpctc_op dgc_op conv_fusion_op sync_batch_norm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) if (WITH_GPU) # warpctc_op needs cudnn 7 above @@ -72,6 +72,12 @@ endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) +if (WITH_GPU AND NOT WIN32) + op_library(dgc_op DEPS dgc) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(dgc);\n") + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dgc) +endif() + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc index eae86a373b..5720b295ec 100644 --- a/paddle/fluid/operators/clip_by_norm_op.cc +++ b/paddle/fluid/operators/clip_by_norm_op.cc @@ -14,69 +14,10 @@ limitations under the License. */ #include "paddle/fluid/operators/clip_by_norm_op.h" -namespace paddle { -namespace operators { - -class ClipByNormOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ClipByNormOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ClipByNormOp should not be null."); - auto max_norm = ctx->Attrs().Get<float>("max_norm"); - PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0."); - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor) The input of clip_by_norm op." - "The number of dimensions must be between [1, 9]."); - AddOutput("Out", - "(Tensor) The output of clip_by_norm op with shape as input(X)"); - AddAttr<float>("max_norm", "(float) The maximum norm value."); - AddComment(R"DOC( -ClipByNorm Operator. 
- -This operator limits the L2 norm of the input $X$ within $max\_norm$. -If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be -the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will -be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as -shown in the following formula: - -$$ -Out = \\frac{max\\_norm * X}{norm(X)}, -$$ - -where $norm(X)$ represents the L2 norm of $X$. - -Examples: - .. code-block:: python - - data = fluid.layer.data( - name='data', shape=[2, 4, 6], dtype='float32') - reshaped = fluid.layers.clip_by_norm( - x=data, max_norm=0.5) - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, ops::ClipByNormOpMaker); + REGISTER_OP_CPU_KERNEL( clip_by_norm, ops::ClipByNormKernel<paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 49e734ce96..d8baa4b8b2 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -83,5 +83,59 @@ class ClipByNormKernel : public framework::OpKernel<T> { } }; +class ClipByNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ClipByNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ClipByNormOp should not be null."); + auto max_norm = ctx->Attrs().Get<float>("max_norm"); + PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0."); + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor) The input of clip_by_norm op." + "The number of dimensions must be between [1, 9]."); + AddOutput("Out", + "(Tensor) The output of clip_by_norm op with shape as input(X)"); + AddAttr<float>("max_norm", "(float) The maximum norm value."); + AddComment(R"DOC( +ClipByNorm Operator. + +This operator limits the L2 norm of the input $X$ within $max\_norm$. +If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be +the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will +be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as +shown in the following formula: + +$$ +Out = \\frac{max\\_norm * X}{norm(X)}, +$$ + +where $norm(X)$ represents the L2 norm of $X$. + +Examples: + .. code-block:: python + + data = fluid.layer.data( + name='data', shape=[2, 4, 6], dtype='float32') + reshaped = fluid.layers.clip_by_norm( + x=data, max_norm=0.5) + +)DOC"); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cc b/paddle/fluid/operators/dgc_clip_by_norm_op.cc new file mode 100644 index 0000000000..6ebad4de3c --- /dev/null +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <string> + +#include "paddle/fluid/operators/dgc_clip_by_norm_op.h" + +namespace paddle { +namespace operators { + +class DGCClipByNormOp : public ClipByNormOp { + public: + using ClipByNormOp::ClipByNormOp; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("current_step"), + "current_step should be set."); + + return ClipByNormOp::InferShape(ctx); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "current_step") { + VLOG(10) << "var_name:" << var_name << " need not to transform"; + return expected_kernel_type; + } + + return framework::OperatorWithKernel::GetKernelTypeForVar( + var_name, tensor, expected_kernel_type); + } +}; + +class DGCClipByNormOpMaker : public ClipByNormOpMaker { + public: + void Make() override { + AddInput("current_step", "(Tensor) Current step."); + AddAttr<float>("rampup_begin_step", + "(float, -1.0)" + "The period when begin k_select.") + .SetDefault(-1.0); + + return ClipByNormOpMaker::Make(); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(dgc_clip_by_norm, ops::DGCClipByNormOp, + ops::DGCClipByNormOpMaker); + +REGISTER_OP_CPU_KERNEL( + dgc_clip_by_norm, + ops::DGCClipByNormKernel<paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cu b/paddle/fluid/operators/dgc_clip_by_norm_op.cu new file mode 100644 index 0000000000..e7f564b7ab --- /dev/null +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/dgc_clip_by_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + dgc_clip_by_norm, + ops::DGCClipByNormKernel<paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h new file mode 100644 index 0000000000..bd22d16f7a --- /dev/null +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/clip_by_norm_op.h" + +namespace paddle { +namespace operators { + +template <typename DeviceContext, typename T> +class DGCClipByNormKernel : public ClipByNormKernel<DeviceContext, T> { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto rampup_begin_step = context.Attr<float>("rampup_begin_step"); + if (static_cast<int>(rampup_begin_step) >= 0) { + auto current_step_tensor = + context.Input<framework::Tensor>("current_step"); + auto* current_step = current_step_tensor->data<T>(); + + if (static_cast<int>(*current_step) < + static_cast<int>(rampup_begin_step)) { + VLOG(10) << "current_step:" << *current_step + << " < rampup_begin_step:" << rampup_begin_step + << " so does't use dgc_clip_by_norm"; + return; + } + } + + return ClipByNormKernel<DeviceContext, T>::Compute(context); + }; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/dgc_op.cc b/paddle/fluid/operators/dgc_op.cc new file mode 100644 index 0000000000..ccdeea2d0a --- /dev/null +++ b/paddle/fluid/operators/dgc_op.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/dgc_op.h" +#include <string> +#include <vector> +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class DGCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("U"), "Input(U) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasInput("current_step"), + "Input(current_step) of DGCop should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("U_out"), + "Output(U_out) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("V_out"), + "Output(V_out) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("k"), + "Output(k) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("EncodeGrad"), + "Output(EncodeGrad) of DGCop should not be null."); + } + + protected: + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "current_step" || var_name == "rampup_step" || + var_name == "k") { + VLOG(10) << "var_name:" << var_name << " need not to transform"; + return expected_kernel_type; + } + + return framework::OperatorWithKernel::GetKernelTypeForVar( + var_name, tensor, expected_kernel_type); + } +}; + +class DGCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("U", "(Tensor) Middle tensor of DGC"); + AddInput("V", "(Tensor) Middle tensor of DGC"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("current_step", "(Tensor) Current step."); + + AddOutput("U_out", + "(Tensor) " + "Output encoded gradient"); + AddOutput("V_out", + "(Tensor) " + "Output encoded gradient"); + AddOutput("EncodeGrad", + "(Tensor) " + "Output encoded gradient"); + AddOutput("Grad_out", + "(Tensor) " + "Output grad gradient"); + AddOutput("k", + "(Tensor) " + "Output top-k value"); + + AddAttr<float>("m", + "(float, 0.9) " + "The momentum of learning rate.") + .SetDefault(0.9); + + AddAttr<bool>("use_nesterov", + "(bool, true)" + "The momentum of learning rate.") + .SetDefault(true); + + AddAttr<std::vector<float>>("sparsity", + "(vecotr, float)" + "The period sparsity of k_select."); + + AddAttr<float>("rampup_begin_step", + "(float, 0.0)" + "The period when begin k_select.") + .SetDefault(0.0); + + AddAttr<float>("rampup_step", + "(float, 0.0)" + "The period when begin k_select."); + + AddComment(R"DOC( + Original paper is https://arxiv.org/abs/1712.01887 + + DGC reduce the communication bandwidth by sending only the important gradients (sparse update):\ + only gradients larger than a threshold are transmitted. + + To avoid losing information, DGC accumulate the rest of the gradients locally. + + Eventually, these gradients become large enough to be transmitted. + + Thus, DGC send the large gradients immediately but eventually send all of the gradients over time. + + To ensure no loss of accuracy, DGC employs momentum correc-tionandlocal gradient clipping on top of the gradient sparsification to maintain model performance. + + DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication. 
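The sparsity and rampup_step attributes above define a per-step schedule: the current step indexes into the sparsity list until the ramp-up window ends, after which 0.999 is used, and the number of elements actually transmitted is numel * (1 - sparsity). A short Python restatement of that schedule, mirroring get_period_sparcity in dgc_op.h (the 0.25 ratio at step 0 is what test_dgc_op.py asserts for k):

.. code-block:: python

    def period_sparsity(sparsity, cur_step, rampup_steps):
        # same indexing rule as get_period_sparcity() in dgc_op.h
        idx = int(cur_step * len(sparsity) / rampup_steps)
        return 0.999 if idx >= len(sparsity) else sparsity[idx]

    sparsity = [0.75, 0.9375, 0.984375, 0.996, 0.999]
    numel = 102400  # g_array_size in test_dgc_op.py
    for step in range(0, 12, 2):
        s = period_sparsity(sparsity, float(step), rampup_steps=10.0)
        k = int(numel * (1.0 - s))  # elements transmitted at this step
        print(step, s, k)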
+ + This optimizer will do two things: + + 1. Compress the gradient by get TopK import value from tensor \ + and use it for allreduce to reduce network bandwidth. + + 2. Call momentum to optimize on the cost. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(dgc, ops::DGCOp, ops::DGCOpMaker); diff --git a/paddle/fluid/operators/dgc_op.cu b/paddle/fluid/operators/dgc_op.cu new file mode 100644 index 0000000000..0f0bf441a7 --- /dev/null +++ b/paddle/fluid/operators/dgc_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/dgc_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + dgc, ops::DGCOpKernel<paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h new file mode 100644 index 0000000000..8d1683bdb2 --- /dev/null +++ b/paddle/fluid/operators/dgc_op.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include <vector> +#include "dgc/dgc.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" + +namespace paddle { +namespace operators { + +inline float get_period_sparcity(const std::vector<float>& sparsity, + float cur_step, float rampup_steps) { + PADDLE_ENFORCE(static_cast<int>(cur_step) >= 0); + + size_t idx = static_cast<int>(cur_step * sparsity.size() / rampup_steps); + if (idx >= sparsity.size()) { + return 0.999; + } + + PADDLE_ENFORCE(idx < sparsity.size()); + return sparsity[idx]; +} + +template <typename DeviceContext, typename T> +class DGCOpKernel : public framework::OpKernel<T> { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto u = ctx.Input<framework::Tensor>("U"); + auto v = ctx.Input<framework::Tensor>("V"); + auto g = ctx.Input<framework::Tensor>("Grad"); + + // attrs + float m = ctx.Attr<float>("m"); + bool use_nesterov = ctx.Attr<bool>("use_nesterov"); + auto sparsity = ctx.Attr<std::vector<float>>("sparsity"); + auto rampup_begin_step = ctx.Attr<float>("rampup_begin_step"); + auto rampup_step = ctx.Attr<float>("rampup_step"); + + // current step + auto current_step_tensor = ctx.Input<framework::Tensor>("current_step"); + const float* current_step = current_step_tensor->data<float>(); + + if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) { + VLOG(10) << "current_step:" << *current_step + << " < rampup_begin_step:" << rampup_begin_step + << " so does't use dgc"; + return; + } + + float ratio = + 1 - get_period_sparcity(sparsity, static_cast<float>(*current_step), + rampup_step); + PADDLE_ENFORCE(ratio > 0.0 && ratio < 1.0); + int k = static_cast<int>(g->numel() * ratio); + + VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov + << ", rampup_begin_step:" << rampup_begin_step + << ", rampup_step:" << rampup_step + << ", current_step:" << *current_step << ", ratio:" << ratio + << ", k:" << k; + + auto k_out = ctx.Output<framework::Tensor>("k"); + T* k_out_data = k_out->data<T>(); + *k_out_data = k; + + auto u_out = ctx.Output<framework::Tensor>("U_out"); + auto v_out = ctx.Output<framework::Tensor>("V_out"); + auto encode_grad_out = ctx.Output<framework::Tensor>("EncodeGrad"); + + // FIXME(gongwb): use cublas. 
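The Eigen expressions that follow implement the DGC local momentum correction, after which k_select moves the k largest entries of V into EncodeGrad and zeroes them locally (test_dgc_op.py checks that the selected positions of U_out and V_out come back as zero). A NumPy restatement of one step, assuming selection by magnitude; the actual k_select kernel lives in the dgc library and is not reproduced here:

.. code-block:: python

    import numpy as np

    def dgc_local_step(u, v, g, m, k, use_nesterov=True):
        if use_nesterov:
            u = m * (u + g)          # accumulate with Nesterov-style momentum
            v = v + u + g            # local gradient accumulation
        else:
            u = m * u + g
            v = v + u
        idx = np.argpartition(np.abs(v), -k)[-k:]   # top-k by magnitude (assumption)
        encoded = (idx.astype(np.int32), v[idx].copy())
        u[idx] = 0.0                 # selected entries are cleared locally
        v[idx] = 0.0
        return u, v, encoded

    u, v = np.zeros(8), np.zeros(8)
    g = np.array([0.05, 0.9, -0.02, 0.4, 0.01, -0.8, 0.03, 0.1])
    u, v, (idx, vals) = dgc_local_step(u, v, g, m=0.9, k=2)
    print(idx, vals)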
+ auto u_out_e = framework::EigenVector<T>::Flatten(*u_out); + auto u_e = framework::EigenVector<T>::Flatten(*u); + auto g_e = framework::EigenVector<T>::Flatten(*g); + auto& dev_ctx = ctx.template device_context<DeviceContext>(); + auto& eigen_ctx = *dev_ctx.eigen_device(); + if (use_nesterov) { + // u = m * (u + g) + u_out_e.device(eigen_ctx) = m * (u_e + g_e); + + // v = u + v + g + ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>( + ctx, u, v, 0, AddFunctor<T>(), v_out); + + ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>( + ctx, g, v, 0, AddFunctor<T>(), v_out); + } else { + // u = m * u + g + u_out_e.device(eigen_ctx) = m * u_e + g_e; + + // v = u + v + ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>( + ctx, u, v, 0, AddFunctor<T>(), v_out); + } + + T* v_out_data = v_out->mutable_data<T>(ctx.GetPlace()); + T* u_out_data = u_out->mutable_data<T>(ctx.GetPlace()); + T* encode_grad_out_data = encode_grad_out->mutable_data<T>( + framework::DDim{2 * k}, ctx.GetPlace()); + + int buf_size = paddle::communication::dgc::get_buffer_size(k); + auto& allocator = platform::DeviceTemporaryAllocator::Instance().Get( + ctx.GetPlace(), dev_ctx.stream()); + auto tmp_ious_data = allocator.Allocate(buf_size); + void* buf = reinterpret_cast<void*>(tmp_ious_data->ptr()); + + if (!paddle::communication::dgc::k_select( + static_cast<void*>(encode_grad_out_data), k, v_out_data, + static_cast<int>(v_out->numel()), buf, dev_ctx.stream(), + u_out_data)) { + LOG(FATAL) << "v_out numel:" << v_out->numel(); + } + + auto grad_out = ctx.Output<framework::Tensor>("Grad_out"); + math::SetConstant<DeviceContext, T> tset; + tset(dev_ctx, grad_out, static_cast<T>(0)); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 9220d35707..c3db59563f 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -46,8 +46,9 @@ cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper) IF(WITH_GPU) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) + set(dgc_deps dgc) ELSE() - set(GPU_CTX_DEPS) + set(dgc_deps) ENDIF() IF(WITH_MKLDNN) @@ -68,7 +69,8 @@ ENDIF() # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} - place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} temp_allocator) + place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} + temp_allocator ${dgc_deps}) if(WIN32) if(WITH_GPU AND NOT WITH_DSO) diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h index 2e8fa7c1b8..497c7b3c87 100644 --- a/paddle/fluid/platform/assert.h +++ b/paddle/fluid/platform/assert.h @@ -37,13 +37,13 @@ limitations under the License. 
*/ } \ } while (0) -#define PADDLE_ASSERT_MSG_CODE(e, m, c) \ - do { \ - if (!(e)) { \ - printf("%s:%d Assertion `%s` failed (%s %d).\n", __FILE__, __LINE__, \ - TOSTRING(e), m, c); \ - asm("trap;"); \ - } \ +#define PADDLE_ASSERT_MSG_CODE(e, m, c) \ + do { \ + if (!(e)) { \ + printf("%s:%d Assertion `%s` failed (%s %ld).\n", __FILE__, __LINE__, \ + TOSTRING(e), m, c); \ + asm("trap;"); \ + } \ } while (0) #else #include <assert.h> diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 48002a7620..61386bdf05 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#include "glog/logging.h" + namespace paddle { namespace platform { @@ -324,8 +326,17 @@ void CUDADeviceContext::Wait() const { auto& allocator = DeviceTemporaryAllocator::Instance().Get<CUDADeviceContext>(*this); allocator.Release([this]() { - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - PADDLE_ENFORCE(cudaGetLastError()); + cudaError_t e_sync = cudaStreamSynchronize(stream_); + if (e_sync != 0) { + LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync) + << " errno:" << e_sync; + } + + cudaError_t e_get = cudaGetLastError(); + if (e_get != 0) { + LOG(FATAL) << "cudaGetLastError " << cudaGetErrorString(e_get) + << " errno:" << e_get; + } }); } diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index d53a4029e1..407d1b1299 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -31,6 +31,10 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "dgc/dgc.h" +#endif + DEFINE_int32(paddle_num_threads, 1, "Number of threads for each paddle instance."); DEFINE_int32(multiple_of_cupti_buffer_size, 1, @@ -43,6 +47,10 @@ namespace framework { std::once_flag gflags_init_flag; std::once_flag p2p_init_flag; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +std::once_flag dgc_init_flag; +#endif + void InitGflags(std::vector<std::string> argv) { std::call_once(gflags_init_flag, [&]() { FLAGS_logtostderr = true; @@ -203,5 +211,15 @@ void InitGLOG(const std::string &prog_name) { #endif } +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +void InitDGC() { + std::call_once(dgc_init_flag, []() { + PADDLE_ENFORCE(paddle::communication::dgc::dynloadNcclLib()); + }); +} +#else +void InitDGC() {} +#endif + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h index 0e30594672..01d66f57dc 100644 --- a/paddle/fluid/platform/init.h +++ b/paddle/fluid/platform/init.h @@ -30,5 +30,7 @@ void InitDevices(bool init_p2p); void InitDevices(bool init_p2p, const std::vector<int> devices); +void InitDGC(); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 7b5e417504..31b5dd5d7c 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -222,6 +222,7 @@ void BindOpDesc(pybind11::module *m) { .def("attr_type", &pd::OpDesc::GetAttrType) .def("attr_names", &pd::OpDesc::AttrNames) .def("_set_attr", &pd::OpDesc::SetAttr) + .def("remove_attr", &pd::OpDesc::RemoveAttr) .def("attr", &pd::OpDesc::GetAttr) .def("set_block_attr", &pd::OpDesc::SetBlockAttr) .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr) diff 
--git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index dca40edf0b..3b0939ef82 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -933,6 +933,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); + m.def("init_dgc", framework::InitDGC); m.def("init_devices", [](bool init_p2p) { framework::InitDevices(init_p2p); }); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 85e1916a3a..4a5301b436 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1202,6 +1202,9 @@ class Operator(object): """ self._update_desc_attr(name, val) + def _remove_attr(self, name): + self.desc.remove_attr(name) + def _update_desc_attr(self, name, val): """ Update the value of desc's attribute by attribute's name. @@ -2725,6 +2728,10 @@ class Program(object): self._trainers_endpoints = [] # the distributed lookup table names self._distributed_lookup_table = None + + # use Deep gradient comrepssion or not + self._enable_dgc = False + # @deprecated(the python memory optimize transpiler is deprecated) # whether the program is optimized by memory_optimize_transpiler self.__is_mem_optimized = False @@ -2775,6 +2782,15 @@ class Program(object): def set_op_role_var(self, var_name): self._op_role_var = [var_name] + @contextlib.contextmanager + def _backward_role_guard(self): + tmp_role = self._current_role + + OpRole = core.op_proto_and_checker_maker.OpRole + self._current_role = OpRole.Backward + yield + self._current_role = tmp_role + @signature_safe_contextmanager def _optimized_guard(self, param_and_grads): """ diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index c0deb5eacc..e21f303a3e 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -17,7 +17,7 @@ from __future__ import print_function from collections import defaultdict from .wrapped_decorator import signature_safe_contextmanager -from paddle.fluid.framework import Program, Variable, name_scope, default_main_program +from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table from . import framework @@ -31,13 +31,17 @@ from .layer_helper import LayerHelper from .layers import ops from .regularizer import append_regularization_ops from .imperative import base as imperative_base +from paddle.fluid import core +from paddle.fluid.layers import tensor +from functools import reduce +import copy __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', 'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer', 'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'LarsMomentum', - 'LarsMomentumOptimizer' + 'LarsMomentumOptimizer', 'DGCMomentumOptimizer' ] @@ -294,6 +298,9 @@ class Optimizer(object): outputs={"ParamOut": param_and_grad[0]}) return new_param_grads, (table_param, table_grad), sgd_op + def _append_dgc_ops(self, param_and_grad): + pass + def backward(self, loss, startup_program=None, @@ -415,6 +422,9 @@ class Optimizer(object): with program_guard(program, startup_program): params_grads = self.backward(loss, startup_program, parameter_list, no_grad_set) + # Note: since we can't use all_reduce_op now, + # dgc_op should be the last op of one grad. 
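The _append_dgc_ops hook defined above is deliberately a no-op in the base Optimizer; only DGCMomentumOptimizer overrides it, so the call that follows can run unconditionally between backward() and apply_gradients(). A simplified stand-in for that dispatch (not the actual fluid classes):

.. code-block:: python

    class BaseOptimizer(object):
        def backward(self, loss):
            return [("w", "w@GRAD")]          # placeholder param/grad pairs

        def apply_gradients(self, params_grads):
            return ["momentum_op"]            # placeholder optimize ops

        def _append_dgc_ops(self, params_grads):
            pass                              # default: nothing extra to insert

        def minimize(self, loss):
            params_grads = self.backward(loss)
            self._append_dgc_ops(params_grads)   # dgc ops become the last ops of each grad
            return self.apply_gradients(params_grads), params_grads

    class DGCMomentum(BaseOptimizer):
        def _append_dgc_ops(self, params_grads):
            for _, grad in params_grads:
                print("append dgc op after", grad)

    DGCMomentum().minimize(loss=None)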
+ self._append_dgc_ops(params_grads) optimize_ops = self.apply_gradients(params_grads) return optimize_ops, params_grads @@ -552,6 +562,264 @@ class MomentumOptimizer(Optimizer): return momentum_op +class DGCMomentumOptimizer(MomentumOptimizer): + """ + + Original paper is https://arxiv.org/abs/1712.01887 + + DGC reduce the communication bandwidth by sending only the important gradients (sparse update):\ + only gradients larger than a threshold are transmitted. + + To avoid losing information, DGC accumulate the rest of the gradients locally. + + Eventually, these gradients become large enough to be transmitted. + + Thus, DGC send the large gradients immediately but eventually send all of the gradients over time. + + To ensure no loss of accuracy, DGC employs momentum correc-tionandlocal gradient clipping on top of the gradient sparsification to maintain model performance. + + DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication. + + This optimizer will do two things: + + 1. Compress the gradient by get TopK import value from tensor \ + and use it for allreduce to reduce network bandwidth. + + 2. Call momentum to optimize on the cost. + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + momentum (float): Momentum factor. + rampup_begin_step (int): The begining step from which gradient compression is implemented. + rampup_step (int): How long it use the sparsity periods. Default is 1. + for example: If the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 5, \ + it will use 0.75 at 0 step, and 0.9375 at 1 step, and so on. And when reach sparsity array ends, \ + it will use 0.999 then and after. + sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity). + use_nesterov (bool): Enables Nesterov momentum. True means use nesterov. + local_grad_clip_norm (float): Clip norm value if needed. + num_trainers: The number of training node. + regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. + + Examples: + .. 
code-block:: python + + optimizer = fluid.optimizer.DGCMomentumOptimizer( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + rampup_begin_step=1252, + regularization=fluid.regularizer.L2Decay(1e-4)) + optimizer.minimize(cost) + + """ + + def __init__(self, + learning_rate, + momentum, + rampup_begin_step, + rampup_step=1, + sparsity=[0.999], + use_nesterov=False, + local_grad_clip_norm=None, + num_trainers=None, + regularization=None, + name=None): + self._sparsity = sparsity + self._rampup_step = rampup_step + self._rampup_step_var = None + + self._rampup_begin_step = rampup_begin_step + self._rampup_begin_step_var = None + + self._global_step_var = None + self._local_grad_clip_norm = None + self._clip_norm = None + + if local_grad_clip_norm is not None: + assert isinstance(num_trainers, int) + assert isinstance(local_grad_clip_norm, float) + assert num_trainers > 0 + + self._local_grad_clip_norm = local_grad_clip_norm + self._num_trainers = num_trainers + self._clip_norm = local_grad_clip_norm / (num_trainers * + num_trainers) + + super(DGCMomentumOptimizer, self).__init__( + learning_rate, momentum, use_nesterov, regularization, name) + + core.init_dgc() + + def _add_auto_increment_var(self, counter_name, begin, step=1): + helper = LayerHelper('global_step_counter') + counter, is_new_var = helper.create_or_get_global_variable( + name=counter_name, dtype='float32', shape=[1], persistable=True) + if is_new_var: + helper.set_variable_initializer( + counter, + initializer=Constant( + value=float(begin - 1), force_cpu=True)) + helper.main_program.global_block()._prepend_op( + type='increment', + inputs={'X': [counter]}, + outputs={'Out': [counter]}, + attrs={'step': float(step)}, + stop_gradient=True) + counter.stop_gradient = True + + return counter + + def _append_dgc_ops(self, param_and_grads): + start_program = default_startup_program() + main_program = default_main_program() + main_program._enable_dgc = True + + # step counter + self._global_step_var = self._add_auto_increment_var( + counter_name='__g_dgc_counter__', begin=0) + + # rampup begin step var for all_reduce_op_handle + self._rampup_begin_step_var = tensor.create_global_var( + shape=[1], + dtype=core.VarDesc.VarType.FP32, + persistable=True, + name='__g_rampup_begin_step__', + value=self._rampup_begin_step * 1.0, + force_cpu=True) + + for param_var, grad_var in param_and_grads: + var_numel = reduce(lambda x, y: x * y, param_var.shape) + if var_numel < 16384 or \ + param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ + grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ + param_var.dtype != core.VarDesc.VarType.FP32 : + continue + + u_var = tensor.create_global_var( + shape=param_var.shape, + dtype=param_var.dtype, + persistable=True, + name=param_var.name + "__dgc_u__", + value=0.0) + v_var = tensor.create_global_var( + shape=param_var.shape, + dtype=param_var.dtype, + persistable=True, + name=param_var.name + "__dgc_v__", + value=0.0) + + k_var = tensor.create_global_var( + shape=[1], + dtype=param_var.dtype, + persistable=True, + name=param_var.name + "__dgc_k__", + value=0.0, + force_cpu=True) + + encoded_var = tensor.create_global_var( + shape=[1], + dtype=param_var.dtype, + persistable=True, + name=param_var.name + "__dgc_encoded__", + value=0.0, + force_cpu=False) + + # del back oprolevarname + op_maker = core.op_proto_and_checker_maker + backward = core.op_proto_and_checker_maker.OpRole.Backward + for op in main_program.global_block().ops: + if not 
self._is_the_backward_op(op): + continue + + var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()] + if param_var.name not in var_attr: + continue + + var_attr.remove(param_var.name) + var_attr.remove(grad_var.name) + if len(var_attr) > 1: + op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr) + else: + op._remove_attr(op_maker.kOpRoleVarAttrName()) + + clip_var = grad_var + if self._local_grad_clip_norm is not None: + clip_var = self._append_clip_norm(grad_var, self._clip_norm) + self._dgc_op(param_var, clip_var, grad_var, u_var, v_var, k_var, + encoded_var) + + def _is_the_backward_op(self, op): + op_maker = core.op_proto_and_checker_maker + backward = core.op_proto_and_checker_maker.OpRole.Backward + if op_maker.kOpRoleVarAttrName() in op.attr_names and \ + int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(backward): + return True + return False + + def _clip_by_norm(self, x, max_norm, name=None): + args = {'x': x, 'max_norm': max_norm, 'name': name} + + helper = LayerHelper("dgc_clip_by_norm_op", **args) + + if name is None: + name = unique_name.generate(".".join([helper.name, 'tmp'])) + + out = helper.create_variable( + type=x.type, name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="clip_by_norm", + inputs={"X": x, + "current_step": self._global_step_var}, + attrs={ + "max_norm": max_norm, + "rampup_begin_step": float(self._rampup_begin_step) + }, + outputs={"Out": out}) + return out + + def _append_clip_norm(self, grad_var, clip_norm): + with grad_var.block.program._backward_role_guard(): + return self._clip_by_norm( + x=grad_var, max_norm=clip_norm, name=grad_var.name + "@DGC") + + def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var, + encoded_var): + block = framework.default_main_program().global_block() + op_maker = core.op_proto_and_checker_maker + dgc_op = block.append_op( + type="dgc", + inputs={ + "U": u_var, + "V": v_var, + "Grad": clip_var, + "current_step": self._global_step_var + }, + outputs={ + "U_out": u_var, + "V_out": v_var, + "EncodeGrad": encoded_var, + "k": k_var, + "Grad_out": grad_var + }, + attrs={ + "m": self._momentum, + "sparsity": self._sparsity, + "use_nesterov": self._use_nesterov, + "rampup_begin_step": float(self._rampup_begin_step), + "rampup_step": float(self._rampup_step) + }, + stop_gradient=True) + + backward = op_maker.OpRole.Backward + dgc_op._set_attr(op_maker.kOpRoleAttrName(), backward) + dgc_op._set_attr(op_maker.kOpRoleVarAttrName(), + [param_var.name, grad_var.name]) + + class LarsMomentumOptimizer(Optimizer): """ Momentum optimizer with LARS support diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 6702fc808b..6b88e7a99f 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -103,6 +103,12 @@ class ParallelExecutor(object): ) if use_cuda else framework.cpu_places() self._scope = scope if scope is not None else executor.global_scope() + if main_program is not None and main_program._enable_dgc: + assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce + assert num_trainers * len( + self._places) > 1, "dgc is not useful for single card training" + assert use_cuda + main_program = main_program if main_program is not None \ else framework.default_main_program() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index cefa2b4919..d139feac6f 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ 
b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -70,6 +70,7 @@ list(REMOVE_ITEM TEST_OPS test_dist_transpiler) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) +list(REMOVE_ITEM TEST_OPS test_dgc_op) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl) list(REMOVE_ITEM TEST_OPS test_dist_transformer) list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) @@ -97,6 +98,7 @@ if(WITH_DISTRIBUTE) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) + py_test_modules(test_dgc_op MODULES test_dgc_op) set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl) set_tests_properties(test_dist_se_resnext_nccl PROPERTIES TIMEOUT 1000) @@ -107,16 +109,20 @@ if(WITH_DISTRIBUTE) endif(NOT APPLE) # py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) endif() + py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) + if(NOT WIN32) -py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL) + py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL) endif() + if(NOT APPLE) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) endif() + if(CMAKE_BUILD_TYPE STREQUAL "Debug") # change the timeout from 600 to 2200, because in debug mode, this test need more time. 
set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 2200) diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 1c45a10a9d..c598260e13 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -73,7 +73,7 @@ def cnn_model(data): class TestDistMnist2x2(TestDistRunnerBase): - def get_model(self, batch_size=2): + def get_model(self, batch_size=2, use_dgc=False): # Input data images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) label = fluid.layers.data(name='label', shape=[1], dtype='int64') @@ -93,7 +93,11 @@ class TestDistMnist2x2(TestDistRunnerBase): # TODO(typhoonzero): fix distributed adam optimizer # opt = fluid.optimizer.AdamOptimizer( # learning_rate=0.001, beta1=0.9, beta2=0.999) - opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9) + if not use_dgc: + opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9) + else: + opt = fluid.optimizer.DGCMomentumOptimizer( + learning_rate=self.lr, momentum=0.9, rampup_begin_step=0) # Reader train_reader = paddle.batch( diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index c3d84dba0a..a2fd61e238 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -210,7 +210,7 @@ class SE_ResNeXt(): class DistSeResneXt2x2(TestDistRunnerBase): - def get_model(self, batch_size=2): + def get_model(self, batch_size=2, use_dgc=False): # Input data image = fluid.layers.data( name="data", shape=[3, 224, 224], dtype='float32') @@ -237,11 +237,19 @@ class DistSeResneXt2x2(TestDistRunnerBase): base_lr = 0.1 lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] - optimizer = fluid.optimizer.Momentum( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr), - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) + if not use_dgc: + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + else: + optimizer = fluid.optimizer.DGCMomentumOptimizer( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + rampup_begin_step=0, + regularization=fluid.regularizer.L2Decay(1e-4)) optimizer.minimize(avg_cost) # Reader diff --git a/python/paddle/fluid/tests/unittests/test_dgc_op.py b/python/paddle/fluid/tests/unittests/test_dgc_op.py new file mode 100644 index 0000000000..04766dd858 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dgc_op.py @@ -0,0 +1,138 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest + +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid + +g_array_size = 102400 + + +class TestDGCOp(unittest.TestCase): + def setup(self, place, array_size=g_array_size): + size = array_size + np.random.seed(5) # fix seed + + self.scope = fluid.global_scope() + self.place = place + print("place:", place) + + # numpy data + # inputs: U, V, Grad, current_step + self.u_name = "U" + self.u = np.random.random(size).astype("float32") + + self.v_name = "V" + self.v = np.random.random(size).astype("float32") + + self.grad_name = "Grad" + self.grad = np.random.random(size).astype("float32") + + self.current_step_name = "current_step" + self.current_step = np.full((1), 0.0).astype("float32") + + # output: U_out, V_out, EncodeGrad, GradLocal_out + self.encode_grad_name = "EncodeGrad" + self.k_name = "k" + self.k = np.full((1), 0.0).astype("float32") + + # scope data + self.u_tensor = self.scope.var(self.u_name).get_tensor() + self.u_tensor.set(self.u, place) + + self.v_tensor = self.scope.var(self.v_name).get_tensor() + self.v_tensor.set(self.v, place) + + self.grad_tensor = self.scope.var(self.grad_name).get_tensor() + self.grad_tensor.set(self.grad, place) + + self.encode_grad_tensor = self.scope.var( + self.encode_grad_name).get_tensor() + + self.current_step_tensor = self.scope.var( + self.current_step_name).get_tensor() + self.current_step_tensor.set(self.current_step, core.CPUPlace()) + + self.k_tensor = self.scope.var(self.k_name).get_tensor() + self.k_tensor.set(self.k, core.CPUPlace()) + + def check(self, actual_t, expect_t, place, out_name, atol=1e-5): + self.assertTrue( + np.allclose( + actual_t, expect_t, atol=atol), + "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + + str(expect_t) + "\n" + "But Got" + str(actual_t)) + + def test_run_and_check(self): + self.setup(place=core.CUDAPlace(0)) + kwargs = { + # inputs + 'U': self.u_name, + 'V': self.v_name, + 'Grad': self.grad_name, + 'current_step': self.current_step_name, + + # outputs + 'U_out': self.u_name, + 'V_out': self.v_name, + 'EncodeGrad': self.encode_grad_name, + 'Grad_out': self.grad_name, + 'k': self.k_name, + + # attrs + 'm': 0.9, + 'sparsity': [0.75, 0.9375, 0.984375, 0.996, 0.999], + 'use_nesterov': True, + 'rampup_begin_step': float(0.0), + 'rampup_step': float(10.0), + } + + dgc_op = Operator('dgc', **kwargs) + + #atol = 1e-6 + dgc_op.run(self.scope, self.place) + + u_out = np.array(self.u_tensor) + v_out = np.array(self.v_tensor) + grad_out = np.array(self.grad_tensor) + encode_grad_out = np.array(self.encode_grad_tensor) + k = int(np.array(self.k_tensor)[0]) + + print("u_out:", u_out[0:20]) + print("v_out:", v_out[0:20]) + print("encode_grad_out:", encode_grad_out) + print("k_out:", k) + + self.assertEqual(k, int(g_array_size * 0.25)) + + index = encode_grad_out[0:k].view(dtype=np.int32) + value = encode_grad_out[k:2 * k] + + acl = 1e-7 + + for i in range(0, k): + self.assertAlmostEqual(u_out[index[i]], 0.0) + self.assertAlmostEqual(v_out[index[i]], 0.0) + + a_min = np.amin(value) + dangling = [x for x in v_out if x > a_min] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 969f5cb63c..9c0efe6d90 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -36,7 +36,8 @@ class 
TestDistRunnerBase(object): def get_model(self, batch_size=DEFAULT_BATCH_SIZE, lr=0.1, - single_device=False): + single_device=False, + use_dgc=False): raise NotImplementedError( "get_model should be implemented by child classes.") @@ -82,6 +83,9 @@ class TestDistRunnerBase(object): if args.nccl2_reduce_layer_local_run: test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=args.batch_size, single_device=True) + elif args.use_dgc: + test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ + self.get_model(batch_size=args.batch_size, use_dgc=args.use_dgc) else: test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=args.batch_size) @@ -200,6 +204,7 @@ def runtime_main(test_class): parser.add_argument('--sync_mode', action='store_true') parser.add_argument('--mem_opt', action='store_true') parser.add_argument('--use_cuda', action='store_true') + parser.add_argument('--use_dgc', action='store_true') parser.add_argument('--use_reduce', action='store_true') parser.add_argument('--dc_asgd', action='store_true') parser.add_argument( @@ -235,6 +240,7 @@ class TestDistBase(unittest.TestCase): def _after_setup_config(self): if self._enforce_place == "CPU": self.__use_cuda = False + self._use_dgc = False elif self._enforce_place == "GPU": self.__use_cuda = True else: @@ -242,6 +248,10 @@ class TestDistBase(unittest.TestCase): self.__use_cuda = True else: self.__use_cuda = False + self._use_dgc = False + + if self._use_reduce: + assert not self._use_dgc def setUp(self): self._trainers = 2 @@ -264,6 +274,7 @@ class TestDistBase(unittest.TestCase): # test, reduce check this argument everywhere. self._nccl2_reduce_layer = False self._lr = 0.001 + self._use_dgc = False self._setup_config() self._after_setup_config() @@ -506,6 +517,9 @@ class TestDistBase(unittest.TestCase): env0 = {'CPU_NUM': '1'} env1 = {'CPU_NUM': '1'} + if self._use_dgc: + tr0_cmd += " --use_dgc" + tr1_cmd += " --use_dgc" if self._mp_mode: env0 = {"FLAGS_selected_gpus": "0"} env1 = {"FLAGS_selected_gpus": "1"} diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 030860ec79..b9d2f6db39 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -39,6 +39,20 @@ class TestDistMnistNCCL2(TestDistBase): self.check_with_place("dist_mnist.py", delta=1e-5) +class TestDistMnistNCCL2DGC(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + self._use_dgc = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_mnist.py", delta=1e-5) + + class TestDistMnist2x2Lars(TestDistBase): def _setup_config(self): self._sync_mode = True diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index 28602d3251..4e9ca01f43 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -60,5 +60,20 @@ class TestDistSeResneXt2x2Async(TestDistBase): self.check_with_place("dist_se_resnext.py", delta=100) +class TestDistSeResnetNCCL2DGC(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + 
self._nccl2_mode = True + self._use_dgc = True + + @skip_ci + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_se_resnext.py", delta=30) + + if __name__ == "__main__": unittest.main() From 174d0d0b90a610807d6f82927aad4def227ee643 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Thu, 28 Mar 2019 08:52:08 +0800 Subject: [PATCH 11/19] Revert "Fix allocator bug" add include headers to fix travis-ci test=develop --- paddle/fluid/framework/operator.h | 3 + paddle/fluid/memory/allocation/CMakeLists.txt | 23 +++-- .../memory/allocation/aligned_allocator.h | 2 - paddle/fluid/memory/allocation/allocator.cc | 14 +-- paddle/fluid/memory/allocation/allocator.h | 72 ++------------- .../memory/allocation/allocator_facade.cc | 48 ++++------ .../memory/allocation/allocator_strategy.cc | 14 +-- .../memory/allocation/best_fit_allocator.cc | 2 +- .../memory/allocation/best_fit_allocator.h | 2 +- .../memory/allocation/buffered_allocator.cc | 22 +++-- .../memory/allocation/buffered_allocator.h | 6 +- .../allocation/buffered_allocator_test.cc | 3 +- .../fluid/memory/allocation/cpu_allocator.cc | 28 +++--- .../fluid/memory/allocation/cpu_allocator.h | 10 +- .../fluid/memory/allocation/cuda_allocator.cc | 10 +- .../fluid/memory/allocation/cuda_allocator.h | 9 +- .../memory/allocation/legacy_allocator.cc | 52 +++++------ .../memory/allocation/legacy_allocator.h | 2 +- .../memory/allocation/locked_allocator.cc | 19 ++-- .../memory/allocation/locked_allocator.h | 6 +- .../naive_best_fit_allocator_facade_test.cc | 91 ------------------- .../memory/allocation/pinned_allocator.cc | 9 +- .../memory/allocation/pinned_allocator.h | 8 +- .../memory/allocation/retry_allocator.cc | 18 +++- .../fluid/memory/allocation/retry_allocator.h | 23 +++-- .../memory/allocation/zero_size_allocator.cc | 11 +-- .../memory/allocation/zero_size_allocator.h | 7 +- paddle/fluid/platform/temporary_allocator.cc | 27 ++++-- paddle/fluid/platform/temporary_allocator.h | 14 ++- paddle/fluid/pybind/pybind.cc | 1 - paddle/fluid/string/printf.h | 6 +- 31 files changed, 224 insertions(+), 338 deletions(-) delete mode 100644 paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 6d8ba430bd..a02e53dcf7 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -365,6 +365,9 @@ class ExecutionContext { auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>( allocation_ptr, deleter); + PADDLE_ENFORCE( + dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); PADDLE_ENFORCE_GE(allocation_ptr->size(), framework::product(dim) * sizeof(T)); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 0f6014ae8a..ac77c3d2a5 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -4,7 +4,6 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler) -cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator) cc_test(buffered_allocator_test SRCS 
buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator) if (WITH_GPU) @@ -38,20 +37,30 @@ else () set(AllocatorFacadeDeps) endif() -list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator) - cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator) +cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator) cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator) -cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) +cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS + ${AllocatorFacadeDeps} + cpu_allocator + locked_allocator + best_fit_allocator + aligned_allocator + auto_increment_allocator + zero_size_allocator + conditional_allocator + retry_allocator + buffered_allocator + allocator_strategy + legacy_allocator + ) nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade) cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator) -cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade) - cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade) cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade) diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h index b536d4276e..064acd06e7 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.h +++ b/paddle/fluid/memory/allocation/aligned_allocator.h @@ -94,8 +94,6 @@ class AlignedAllocator : public ThinAlignedAllocator { underlying_allocator_->Allocate(size + kAlignment, attr); return new AlignedAllocation<kAlignment>(std::move(raw_allocation), size); } - - void FreeImpl(Allocation* allocation) override { delete allocation; } }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc index 5a5253d911..8fb8a5fb89 100644 --- a/paddle/fluid/memory/allocation/allocator.cc +++ b/paddle/fluid/memory/allocation/allocator.cc @@ -27,24 +27,16 @@ bool Allocator::IsAllocThreadSafe() const { return false; } AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) { auto ptr = AllocateImpl(size, attr); - ptr->RegisterDecoratedAllocator(this); + ptr->set_allocator(this); return AllocationPtr(ptr); } -void Allocator::FreeImpl(Allocation* allocation) { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); -} - -void Allocator::Free(Allocation* allocation) { - allocation->PopDecoratedAllocator(); - FreeImpl(allocation); -} +void Allocator::Free(Allocation* allocation) { delete allocation; } const char* BadAlloc::what() const noexcept { return msg_.c_str(); } void AllocationDeleter::operator()(Allocation* allocation) const { - Allocator* allocator = allocation->TopDecoratedAllocator(); + auto* allocator = allocation->allocator(); allocator->Free(allocation); } diff --git 
a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 33b816b908..3465278935 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -46,56 +46,13 @@ class Allocator; // NOTE: this is the base class of Allocation. Each allocator can use its own // allocation object. // NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0 - -/** - * Allocation is returned by Allocator::Allocate() method. - * - * An allocator may be decorated by another allocator. For example, we can - * decorate - * a RetryAllocator to any allocator to perform allocation retry when first - * allocation request fails. - * - * Explanations of Allocator design is as follows: - * - * Suppose we have an allocator which is decorated by several allocators: - * - * A(1) <- A(2) <- A(3) <- ... <- A(n) - * - * , and the public allocator is A(1). - * - * The allocation process would be: - * - * A(n).Allocate() -> ... -> A(2).Allocate() -> A(1).Allocate() - * - * , and the free process would be: - * - * A(1).Free() -> A(2).Free() -> ... -> A(n).Free() - * - * Therefore, we should record the allocator chain when allocating, so - * that we can free the allocation in the reverse order of allocator chain. - * The field `decorated_allocators_` is used to record this chain. - * - * Another example is that we want to add additional fields in Allocation, - * e.g., something what is done in AlignedAllocator, etc. - * In this case, we should declare a derived class of Allocation, which - * contains an underlying Allocation allocated by the underlying allocator. - * Therefore, `decorated_allocators_` of the new Allocation object would - * be a new chain, differing from the underlying Allocation object. - */ class Allocation { public: Allocation(void* ptr, size_t size, platform::Place place) - : ptr_(ptr), size_(size), place_(place) { - // NOTE(zjl): Since decorated_allocators_ is usually a small vector - // We reserve a small buffer to it to prevent frequent heap allocation - // Not quite sure whether we need something like gtl vector. - decorated_allocators_.reserve(8); - } + : allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {} Allocation(const Allocation& o) = delete; Allocation& operator=(const Allocation& o) = delete; - Allocation(Allocation&& o) = delete; - Allocation& operator=(Allocation&& o) = delete; // Returns the holding pointer. // NOTE: For performance consideration, it is better not to make this method @@ -117,31 +74,17 @@ class Allocation { const platform::Place& place() const { return place_; } - virtual ~Allocation(); - - private: - const std::vector<Allocator*>& DecoratedAllocators() const { - return decorated_allocators_; - } - - inline void RegisterDecoratedAllocator(Allocator* allocator) { - decorated_allocators_.push_back(allocator); - } + Allocator* allocator() { return allocator_; } - inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); } + void set_allocator(Allocator* allocator) { allocator_ = allocator; } - inline Allocator* TopDecoratedAllocator() { - return decorated_allocators_.back(); - } + virtual ~Allocation(); private: + Allocator* allocator_; void* ptr_; size_t size_; platform::Place place_; - std::vector<Allocator*> decorated_allocators_; - - friend class Allocator; - friend class AllocationDeleter; }; using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>; @@ -191,12 +134,9 @@ class Allocator { // True if the `Allocate` is thread safe. 
virtual bool IsAllocThreadSafe() const; - // This function should not be called outside - void Free(Allocation* allocation); - protected: + virtual void Free(Allocation* allocation); virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0; - virtual void FreeImpl(Allocation* allocation); private: friend class AllocationDeleter; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 09328aded5..a3b73e3ba3 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -49,17 +49,6 @@ namespace paddle { namespace memory { namespace allocation { -static inline std::shared_ptr<Allocator> WrapRetryAllocator( - std::shared_ptr<Allocator> allocator, int64_t retry_time) { - if (retry_time > 0) { - auto* retry_allocator = - new RetryAllocator(std::move(allocator), retry_time); - allocator.reset(retry_allocator); - } - - return allocator; -} - // TODO(yy): Dirty code here. This class should be configurable in runtime. class CPUManagedAllocator : public Allocator { public: @@ -123,10 +112,14 @@ class ChunkedAllocator : public Allocator { std::shared_ptr<Allocator> CreateAllocatorWithChunk() { chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_)); auto* allocation = chunks_.back().get(); - std::shared_ptr<Allocator> allocator(new LockedAllocator( - std::shared_ptr<Allocator>(new BestFitAllocator(allocation)))); + std::unique_ptr<Allocator> allocator(new LockedAllocator( + std::unique_ptr<Allocator>(new BestFitAllocator(allocation)))); - allocator = WrapRetryAllocator(allocator, retry_time_); + if (retry_time_ > 0) { + auto* retry_allocator = + new RetryAllocator(std::move(allocator), retry_time_); + allocator.reset(retry_allocator); + } return std::make_shared<AlignedAllocator<64u>>(std::move(allocator)); } @@ -197,23 +190,13 @@ class AllocatorFacadePrivate { ~AllocatorFacadePrivate() = default; AllocatorFacadePrivate() { - auto strategy = GetAllocatorStrategy(); - switch (strategy) { - case AllocatorStrategy::kLegacy: { - InitLegacyAllocator(); - break; - } - case AllocatorStrategy::kNaiveBestFit: { - InitCPUAllocator(); - InitCUDAAllocator(); - InitCUDAPinnedAllocator(); - WrapZeroSizeAllocator(); - break; - } - default: { - PADDLE_THROW("Unsupported allocator strategy: %d", - static_cast<int>(strategy)); - } + if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) { + InitLegacyAllocator(); + } else { + InitCPUAllocator(); + InitCUDAAllocator(); + InitCUDAPinnedAllocator(); + WrapZeroSizeAllocator(); } } @@ -271,7 +254,8 @@ AllocatorFacade& AllocatorFacade::Instance() { std::shared_ptr<Allocation> AllocatorFacade::AllocShared( const platform::Place& place, size_t size, Allocator::Attr attr) { - return std::shared_ptr<Allocation>(Alloc(place, size, attr)); + return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(), + AllocationDeleter()); } AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc index fff94c01e7..8cebda9005 100644 --- a/paddle/fluid/memory/allocation/allocator_strategy.cc +++ b/paddle/fluid/memory/allocation/allocator_strategy.cc @@ -19,22 +19,16 @@ DEFINE_string( allocator_strategy, "legacy", "The allocation strategy. Legacy means the original allocator of Fluid." - "naive_best_fit means the experimental best fit allocator. " - "allocator. 
Enum in [legacy, naive_best_fit]."); + "New means the experimental allocators of Fluid. in [legacy, new]"); namespace paddle { namespace memory { namespace allocation { static AllocatorStrategy GetStrategyFromFlag() { - if (FLAGS_allocator_strategy == "legacy") { - return AllocatorStrategy::kLegacy; - } else if (FLAGS_allocator_strategy == "naive_best_fit") { - return AllocatorStrategy::kNaiveBestFit; - } else { - PADDLE_THROW("Unsupported allocator strategy: %s", - FLAGS_allocator_strategy); - } + return FLAGS_allocator_strategy == "legacy" + ? AllocatorStrategy::kLegacy + : AllocatorStrategy::kNaiveBestFit; } AllocatorStrategy GetAllocatorStrategy() { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index d87dd9a4b6..e3d6c2f511 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -109,7 +109,7 @@ size_t BestFitAllocator::NumFreeChunks() const { } return num; } -void BestFitAllocator::FreeImpl(Allocation* allocation) { +void BestFitAllocator::Free(Allocation* allocation) { auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation); PADDLE_ENFORCE_NOT_NULL(bf_allocation, "The input allocation is not BestFitAllocation."); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index c137438c0c..4f10f2b53e 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -119,7 +119,7 @@ class BestFitAllocator : public Allocator { void InsertFreeNode(const ListIt& it); protected: - void FreeImpl(Allocation* allocation) override; + void Free(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc index e04c0aa34b..fc75abc9df 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator.cc @@ -22,11 +22,11 @@ namespace paddle { namespace memory { namespace allocation { -BufferedAllocator::BufferedAllocator(std::shared_ptr<Allocator> allocator) +BufferedAllocator::BufferedAllocator(std::unique_ptr<Allocator> &&allocator) : underlying_allocator_(std::move(allocator)) { PADDLE_ENFORCE_NOT_NULL( underlying_allocator_, - "Underlying allocator of BufferedAllocator must not be null"); + "Underlying allocator of BufferedAllocator must be unmanaged"); if (underlying_allocator_->IsAllocThreadSafe()) { mtx_.reset(new std::mutex()); } @@ -41,19 +41,19 @@ void BufferedAllocator::FreeCache(size_t size) { while (!allocations_.empty()) { // free the largest auto it = --allocations_.end(); cur += it->second->size(); - underlying_allocator_->Free(it->second.release()); + delete it->second.release(); allocations_.erase(it); if (cur >= size) return; } } -bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; } - -void BufferedAllocator::FreeImpl(Allocation *allocation) { +bool BufferedAllocator::IsAllocThreadSafe() const { + return this->underlying_allocator_->IsAllocThreadSafe(); +} +void BufferedAllocator::Free(Allocation *allocation) { platform::LockGuardPtr<std::mutex> guard(mtx_); allocations_.emplace(allocation->size(), AllocationPtr(allocation)); } - Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { { platform::LockGuardPtr<std::mutex> guard(mtx_); 
@@ -61,15 +61,17 @@ Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { if (it != allocations_.end() && it->first < size * 2) { AllocationPtr result(std::move(it->second)); allocations_.erase(it); - return result.release(); + return new AllocationWithUnderlying(std::move(result)); } } try { - return underlying_allocator_->Allocate(size, attr).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); } catch (BadAlloc &) { FreeCache(size); - return underlying_allocator_->Allocate(size, attr).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); } } diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h index c728395705..d44a3f85be 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.h +++ b/paddle/fluid/memory/allocation/buffered_allocator.h @@ -31,7 +31,7 @@ namespace allocation { // underlying_allocator_ class BufferedAllocator : public Allocator { public: - explicit BufferedAllocator(std::shared_ptr<Allocator> allocator); + explicit BufferedAllocator(std::unique_ptr<Allocator> &&allocator); ~BufferedAllocator(); @@ -44,11 +44,11 @@ class BufferedAllocator : public Allocator { void FreeCache(size_t size); protected: - void FreeImpl(Allocation *allocation) override; + void Free(Allocation *allocation) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::shared_ptr<Allocator> underlying_allocator_; + std::unique_ptr<Allocator> underlying_allocator_; std::multimap<size_t, AllocationPtr> allocations_; std::unique_ptr<std::mutex> mtx_; }; diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc index 854a117b0e..c8bd5292ca 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/memory/allocation/buffered_allocator.h" #include <gtest/gtest.h> +#include <memory> #include <utility> #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" @@ -65,7 +66,7 @@ class StubAllocator : public Allocator { size_t GetFreeCount() const { return destruct_count_; } protected: - void FreeImpl(Allocation *allocation) override { + void Free(Allocation *allocation) override { auto *alloc = dynamic_cast<StubAllocation *>(allocation); PADDLE_ENFORCE_NOT_NULL(alloc); if (alloc->ptr()) delete[] static_cast<uint8_t *>(alloc->ptr()); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index 90c49c87a6..cc81a6f7b8 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -20,27 +20,25 @@ namespace paddle { namespace memory { namespace allocation { +CPUAllocation::CPUAllocation(void *ptr, size_t size) + : Allocation(ptr, size, platform::CPUPlace()) {} + bool CPUAllocator::IsAllocThreadSafe() const { return true; } -void CPUAllocator::FreeImpl(Allocation *allocation) { - void *p = allocation->ptr(); -#ifdef _WIN32 - _aligned_free(p); -#else - free(p); -#endif +void CPUAllocator::Free(Allocation *allocation) { + PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUAllocation *>(allocation)); + free(allocation->ptr()); delete allocation; } Allocation *CPUAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { - void *p; -#ifdef _WIN32 - p 
= _aligned_malloc(size, kAlignment); -#else - PADDLE_ENFORCE_EQ(posix_memalign(&p, kAlignment, size), 0, "Alloc %ld error!", - size); -#endif - return new Allocation(p, size, platform::CPUPlace()); + void *ptr; + auto status = posix_memalign(&ptr, kAlignment, size); + if (UNLIKELY(status) != 0) { + throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d", + size, status)); + } + return new CPUAllocation(ptr, size); } } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 3eb1416b0e..26d3643f4e 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -31,13 +31,19 @@ namespace allocation { // // NOTE(yy): It is no need to use `BestFitAllocator` in CPU. We can import // an open-sourced allocator into Paddle. +class CPUAllocator; +class CPUAllocation : public Allocation { + public: + CPUAllocation(void* ptr, size_t size); +}; + class CPUAllocator : public Allocator { public: - constexpr static size_t kAlignment = 4096UL; + constexpr static size_t kAlignment = 64u; bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; + void Free(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 895a24a6a2..430bf0be98 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -23,14 +23,15 @@ namespace paddle { namespace memory { namespace allocation { bool CUDAAllocator::IsAllocThreadSafe() const { return true; } -void CUDAAllocator::FreeImpl(Allocation* allocation) { +void CUDAAllocator::Free(Allocation* allocation) { platform::CUDADeviceGuard guard(place_.device); - PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(allocation->place()), + auto* cuda_allocation = dynamic_cast<CUDAAllocation*>(allocation); + PADDLE_ENFORCE_NOT_NULL(cuda_allocation); + PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(cuda_allocation->place()), place_); PADDLE_ENFORCE(cudaFree(allocation->ptr())); delete allocation; } - Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { platform::CUDADeviceGuard guard(place_.device); void* ptr; @@ -40,9 +41,8 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device, status, cudaGetErrorString(status))); } - return new Allocation(ptr, size, platform::Place(place_)); + return new CUDAAllocation(ptr, size, platform::Place(place_)); } - } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h index 580a2d1df1..63726f5820 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -20,6 +20,13 @@ namespace paddle { namespace memory { namespace allocation { +// CUDA System allocator and allocation. +// Just a flag type. 
+class CUDAAllocation : public Allocation { + public: + using Allocation::Allocation; +}; + class CUDAAllocator : public Allocator { public: explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {} @@ -28,7 +35,7 @@ class CUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; + void Free(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 0dc2de3746..514ac7883a 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -134,22 +134,26 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) { } #ifdef PADDLE_WITH_CUDA -class GPUBuddyAllocatorList { - public: - GPUBuddyAllocatorList() - : allocators_(platform::GetCUDADeviceCount()), - flags_(platform::GetCUDADeviceCount()) { - allocation::GPUMemMonitor.Initialize(allocators_.size()); - } +BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { + static std::once_flag init_flag; + static detail::BuddyAllocator **a_arr = nullptr; + static std::vector<int> devices; + + std::call_once(init_flag, [gpu_id]() { + devices = platform::GetSelectedDevices(); + int gpu_num = devices.size(); - BuddyAllocator *Get(size_t dev_id) { - PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id); - std::call_once(flags_[dev_id], [this, dev_id] { + allocation::GPUMemMonitor.Initialize(devices.size()); + + a_arr = new BuddyAllocator *[gpu_num]; + for (size_t i = 0; i < devices.size(); ++i) { + int dev_id = devices[i]; + a_arr[i] = nullptr; platform::SetDeviceId(dev_id); - allocators_[dev_id] = new BuddyAllocator( - std::unique_ptr<detail::SystemAllocator>( - new detail::GPUAllocator(dev_id)), - platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>( + new detail::GPUAllocator(dev_id)), + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " @@ -163,19 +167,13 @@ class GPUBuddyAllocatorList { << FLAGS_initial_gpu_memory_in_mb << ". 
Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; - }); - return allocators_[dev_id]; - } - - private: - std::vector<BuddyAllocator *> allocators_; - std::vector<std::once_flag> flags_; -}; + } + }); -BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { - static GPUBuddyAllocatorList allocators; platform::SetDeviceId(gpu_id); - return allocators.Get(gpu_id); + auto pos = std::distance(devices.begin(), + std::find(devices.begin(), devices.end(), gpu_id)); + return a_arr[pos]; } #endif @@ -194,7 +192,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place, #ifdef PADDLE_WITH_CUDA auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *ptr = buddy_allocator->Alloc(size); - if (ptr == nullptr && size > 0) { + if (ptr == nullptr) { int cur_dev = platform::GetCurrentDeviceId(); platform::SetDeviceId(place.device); size_t avail, total; @@ -349,7 +347,7 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { return tmp_alloc; } -void LegacyAllocator::FreeImpl(Allocation *allocation) { +void LegacyAllocator::Free(Allocation *allocation) { boost::apply_visitor( legacy::FreeVisitor(allocation->ptr(), allocation->size()), allocation->place()); diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h index 27cd42ea35..d9bdae153d 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.h +++ b/paddle/fluid/memory/allocation/legacy_allocator.h @@ -73,7 +73,7 @@ class LegacyAllocator : public Allocator { protected: Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; - void FreeImpl(Allocation *allocation) override; + void Free(Allocation *allocation) override; private: platform::Place place_; diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc index c43099cc88..62d768c580 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.cc +++ b/paddle/fluid/memory/allocation/locked_allocator.cc @@ -17,7 +17,6 @@ #include <utility> #include "paddle/fluid/memory/allocation/allocation_with_underlying.h" #include "paddle/fluid/platform/lock_guard_ptr.h" - namespace paddle { namespace memory { namespace allocation { @@ -25,24 +24,26 @@ namespace allocation { bool LockedAllocator::IsAllocThreadSafe() const { return true; } LockedAllocator::LockedAllocator( - std::shared_ptr<Allocator> underlying_allocator) + std::unique_ptr<Allocator> &&underlying_allocator) : underlying_allocator_(std::move(underlying_allocator)) { PADDLE_ENFORCE_NOT_NULL(underlying_allocator_); if (!underlying_allocator_->IsAllocThreadSafe()) { mtx_.reset(new std::mutex()); } } - -void LockedAllocator::FreeImpl(Allocation *allocation) { - platform::LockGuardPtr<std::mutex> guard(mtx_); - underlying_allocator_->Free(allocation); +void LockedAllocator::Free(Allocation *allocation) { + { + platform::LockGuardPtr<std::mutex> guard(mtx_); + reinterpret_cast<AllocationWithUnderlying *>(allocation) + ->allocation_.reset(); // Destroy inner allocation + } + delete allocation; } - Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { platform::LockGuardPtr<std::mutex> guard(mtx_); - return underlying_allocator_->Allocate(size, attr).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); } - } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/locked_allocator.h 
b/paddle/fluid/memory/allocation/locked_allocator.h index b735ccef10..4967b9bb8d 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.h +++ b/paddle/fluid/memory/allocation/locked_allocator.h @@ -24,15 +24,15 @@ namespace allocation { // A allocator to make underlying allocator thread safe. class LockedAllocator : public Allocator { public: - explicit LockedAllocator(std::shared_ptr<Allocator> underlying_allocator); + explicit LockedAllocator(std::unique_ptr<Allocator> &&underlying_allocator); bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; + void Free(Allocation *allocation) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::shared_ptr<Allocator> underlying_allocator_; + std::unique_ptr<Allocator> underlying_allocator_; std::unique_ptr<std::mutex> mtx_; }; diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc deleted file mode 100644 index 3334589a4b..0000000000 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include <gflags/gflags.h> -#include <gtest/gtest.h> -#include "paddle/fluid/memory/allocation/allocator_facade.h" - -#ifdef PADDLE_WITH_CUDA -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_double(fraction_of_cuda_pinned_memory_to_use); -DECLARE_int64(gpu_allocator_retry_time); -#endif - -DECLARE_string(allocator_strategy); - -namespace paddle { -namespace memory { -namespace allocation { - -TEST(allocator, allocator) { -#ifdef PADDLE_WITH_CUDA - FLAGS_fraction_of_gpu_memory_to_use = 0.01; - FLAGS_gpu_allocator_retry_time = 500; - FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; -#endif - - FLAGS_allocator_strategy = "naive_best_fit"; - - auto &instance = AllocatorFacade::Instance(); - platform::Place place; - size_t size = 1024; - - { - place = platform::CPUPlace(); - size = 1024; - auto cpu_allocation = instance.Alloc(place, size); - ASSERT_NE(cpu_allocation, nullptr); - ASSERT_NE(cpu_allocation->ptr(), nullptr); - ASSERT_EQ(cpu_allocation->place(), place); - ASSERT_EQ(cpu_allocation->size(), size); - } - -#ifdef PADDLE_WITH_CUDA - { - place = platform::CUDAPlace(0); - size = 1024; - auto gpu_allocation = instance.Alloc(place, size); - ASSERT_NE(gpu_allocation, nullptr); - ASSERT_NE(gpu_allocation->ptr(), nullptr); - ASSERT_EQ(gpu_allocation->place(), place); - ASSERT_GE(gpu_allocation->size(), size); - } - - { - // Allocate 2GB gpu memory - place = platform::CUDAPlace(0); - size = 2 * static_cast<size_t>(1 << 30); - auto gpu_allocation = instance.Alloc(place, size); - ASSERT_NE(gpu_allocation, nullptr); - ASSERT_NE(gpu_allocation->ptr(), nullptr); - ASSERT_EQ(gpu_allocation->place(), place); - ASSERT_GE(gpu_allocation->size(), size); - } - - { - place = platform::CUDAPinnedPlace(); - size = (1 << 20); - auto cuda_pinned_allocation = - instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20); - ASSERT_NE(cuda_pinned_allocation, nullptr); - ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr); - ASSERT_EQ(cuda_pinned_allocation->place(), place); - ASSERT_GE(cuda_pinned_allocation->size(), size); - } -#endif -} - -} // namespace allocation -} // namespace memory -} // namespace paddle diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 5a3d817211..de81d12cca 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -20,15 +20,20 @@ namespace paddle { namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } -void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void CPUPinnedAllocator::Free(Allocation *allocation) { + PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUPinnedAllocation *>(allocation)); PADDLE_ENFORCE(cudaFreeHost(allocation->ptr())); delete allocation; } Allocation *CPUPinnedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { + // PADDLE_ENFORCE_EQ( + // attr, kCrossDevice, + // "CPUPinnedAllocator should be used for Cross-Device Communication"); + void *ptr; PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); - return new Allocation(ptr, size, platform::CUDAPinnedPlace()); + return new CPUPinnedAllocation(ptr, size); } } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h index deeb55a8fb..42d0938f2a 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.h +++ b/paddle/fluid/memory/allocation/pinned_allocator.h @@ -20,12 +20,18 @@ namespace memory { 
namespace allocation { // Allocator uses `cudaHostAlloc` +class CPUPinnedAllocation : public Allocation { + public: + CPUPinnedAllocation(void *ptr, size_t size) + : Allocation(ptr, size, platform::CUDAPinnedPlace()) {} +}; + class CPUPinnedAllocator : public Allocator { public: bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; + void Free(Allocation *allocation) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; }; diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index 7e888988f9..981705051b 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -18,15 +18,25 @@ namespace paddle { namespace memory { namespace allocation { -void RetryAllocator::FreeImpl(Allocation* allocation) { +bool RetryAllocator::IsAllocThreadSafe() const { + return underlying_allocator_->IsAllocThreadSafe(); +} + +void RetryAllocator::Free(Allocation* allocation) { // Delete underlying allocation first. - underlying_allocator_->Free(allocation); - cv_.notify_all(); + reinterpret_cast<AllocationWithUnderlying*>(allocation)->allocation_.reset(); + { + // notify all waited allocators, they can try to allocate memory after free. + std::lock_guard<std::mutex> lock(mutex_); + cv_.notify_all(); + } + delete allocation; } Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { auto alloc_func = [&, this]() { - return underlying_allocator_->Allocate(size, attr).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); }; // In fact, we can unify the code of allocation success and failure // But it would add lock even when allocation success at the first time diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 379f576d6e..6ab8ca8fbe 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -25,25 +25,32 @@ namespace paddle { namespace memory { namespace allocation { +class RetryAllocator; + class RetryAllocator : public Allocator { public: - RetryAllocator(std::shared_ptr<Allocator> allocator, size_t retry_ms) + RetryAllocator(std::unique_ptr<Allocator>&& allocator, size_t retry_ms) : underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) { + EnforceCheck(); + } + + bool IsAllocThreadSafe() const override; + + private: + void EnforceCheck() { PADDLE_ENFORCE_NOT_NULL( - underlying_allocator_, - "UnderlyingAllocator of RetryAllocator must not be null"); + underlying_allocator_.get(), + "UnderlyingAllocator of RetryAllocator must be UnmanagedAllocator"); PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(), "UnderlyingAllocator of RetryAllocator must be thread-safe"); } - bool IsAllocThreadSafe() const override { return true; } - protected: - void FreeImpl(Allocation* allocation) override; + void Free(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::shared_ptr<Allocator> underlying_allocator_; + std::unique_ptr<Allocator> underlying_allocator_; std::chrono::milliseconds retry_time_; std::mutex mutex_; std::condition_variable cv_; @@ -51,6 +58,8 @@ class RetryAllocator : public Allocator { // For debug, We can add an atomic integer to record how many memory sizes are // waited to allocate // std::atomic<size_t> waited_allocate_size_{0}; + + friend 
class RetryAllocation; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.cc b/paddle/fluid/memory/allocation/zero_size_allocator.cc index 39743bcb10..cb2df1a029 100644 --- a/paddle/fluid/memory/allocation/zero_size_allocator.cc +++ b/paddle/fluid/memory/allocation/zero_size_allocator.cc @@ -24,20 +24,11 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const { Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { if (size == 0) { - return new Allocation(nullptr, 0, place_); + return new ZeroSizeAllocation(place_); } else { return underlying_allocator_->Allocate(size, attr).release(); } } - -void ZeroSizeAllocator::FreeImpl(Allocation *allocation) { - if (allocation->size() == 0) { - delete allocation; - } else { - underlying_allocator_->Free(allocation); - } -} - } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.h b/paddle/fluid/memory/allocation/zero_size_allocator.h index 08a7a06dbf..0f01dfcdf5 100644 --- a/paddle/fluid/memory/allocation/zero_size_allocator.h +++ b/paddle/fluid/memory/allocation/zero_size_allocator.h @@ -24,6 +24,12 @@ namespace allocation { // The allocator handles the request's size is zero. Allocator will always // return an allocation even the request size is zero. However, the // allocation.ptr() is nullptr +class ZeroSizeAllocation : public Allocation { + public: + explicit ZeroSizeAllocation(const platform::Place& p) + : Allocation(nullptr, 0, p) {} +}; + class ZeroSizeAllocator : public Allocator { public: ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator, @@ -34,7 +40,6 @@ class ZeroSizeAllocator : public Allocator { protected: Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; - void FreeImpl(Allocation* allocation) override; private: std::shared_ptr<Allocator> underlying_allocator_; diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc index ddde7baf4c..250efe70fd 100644 --- a/paddle/fluid/platform/temporary_allocator.cc +++ b/paddle/fluid/platform/temporary_allocator.cc @@ -30,31 +30,38 @@ namespace paddle { namespace platform { namespace alloc = memory::allocation; +TemporaryAllocation::TemporaryAllocation( + alloc::AllocationPtr &&underlying_allocation) + : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), + underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)) {} + TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) { - temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>()); + temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>()); } bool TemporaryAllocator::IsAllocThreadSafe() const { return true; } void TemporaryAllocator::Release(const std::function<void()> &callback) { - std::unique_ptr<std::multimap<size_t, alloc::Allocation *>> t_allocations; + std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> t_allocations; { std::unique_lock<std::mutex> lock(mtx_); callback(); t_allocations.swap(temp_mem_map_); - temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>()); + temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>()); wait_delete_mem_ = 0; } - alloc::AllocationDeleter deleter; for (auto tmp : *t_allocations) { VLOG(10) << "Delete temporary allocation " << tmp.second->ptr() << " size: " << tmp.second->size(); - deleter(tmp.second); + delete tmp.second; } } -void 
TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { +void TemporaryAllocator::Free(alloc::Allocation *allocation) { + auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation); + PADDLE_ENFORCE_NOT_NULL(temp_allocation); if (platform::is_gpu_place(temp_allocation->place())) { PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_), "The place should be the same."); @@ -78,7 +85,7 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { } VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr() << " size: " << temp_allocation->size(); - alloc::AllocationDeleter()(temp_allocation); + delete temp_allocation; } size_t TemporaryAllocator::TemporaryAllocationQueueSize() { @@ -113,9 +120,11 @@ alloc::Allocation *TemporaryAllocator::AllocateImpl( } // If not find the the available allocation, get allocation from // AllocatorFacadeInstance. - auto temp_mem = alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); + auto raw_allocation = + alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); + auto temp_mem = new TemporaryAllocation(std::move(raw_allocation)); VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size; - return temp_mem.release(); + return temp_mem; } } // namespace platform diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h index 912d45eaf1..f8a43b889d 100644 --- a/paddle/fluid/platform/temporary_allocator.h +++ b/paddle/fluid/platform/temporary_allocator.h @@ -23,6 +23,14 @@ namespace paddle { namespace platform { +class TemporaryAllocation : public memory::allocation::Allocation { + public: + explicit TemporaryAllocation( + memory::allocation::AllocationPtr &&underlying_allocation); + + memory::allocation::AllocationPtr underlying_allocation_; +}; + /*! \brief the TemporaryAllocator is used to alloc the temporary allocation * which used by CUDA's async operation. * @@ -49,7 +57,7 @@ class TemporaryAllocator : public memory::allocation::Allocator { void SetCallback(const std::function<void()> &callback); protected: - void FreeImpl(memory::allocation::Allocation *allocation) override; + void Free(memory::allocation::Allocation *allocation) override; memory::allocation::Allocation *AllocateImpl( size_t size, memory::allocation::Allocator::Attr attr) override; @@ -58,8 +66,8 @@ class TemporaryAllocator : public memory::allocation::Allocator { platform::Place place_; // When the allocation is not held by any variable, it should be placed // to temp_mem_map immediately. 
- std::unique_ptr<std::multimap<size_t, memory::allocation::Allocation *>> - temp_mem_map_{nullptr}; + std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> temp_mem_map_{ + nullptr}; std::mutex mtx_; size_t wait_delete_mem_{0}; std::function<void()> callback_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index dca40edf0b..7bf0896378 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -324,7 +324,6 @@ PYBIND11_MODULE(core, m) { [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) { self.mutable_data<float>(place); }) - .def("_clear", &Tensor::clear) .def("set", PyCPUTensorSetFromArray<float>) .def("set", PyCPUTensorSetFromArray<int>) .def("set", PyCPUTensorSetFromArray<double>) diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 66b768665b..16bb3771f2 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -105,12 +105,14 @@ void Printf(const char* fmt, const Args&... args) { Fprintf(std::cout, fmt, args...); } -inline std::string HumanReadableSize(double f_size) { +template <typename T> +std::string HumanReadableSize(T size) { size_t i = 0; + double f_size = static_cast<double>(size); double orig = f_size; const std::vector<std::string> units( {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"}); - while (f_size >= 1024) { + while (f_size > 1024) { f_size /= 1024; i++; } From 5656fa9f7ca278aff7319485c0d289a4ffc2f9d0 Mon Sep 17 00:00:00 2001 From: sneaxiy <sneaxiy@126.com> Date: Thu, 28 Mar 2019 09:51:19 +0800 Subject: [PATCH 12/19] fix travis ci test=develop --- paddle/fluid/platform/temporary_allocator.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc index 250efe70fd..d489ed5368 100644 --- a/paddle/fluid/platform/temporary_allocator.cc +++ b/paddle/fluid/platform/temporary_allocator.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include <memory> +#include <utility> #include "paddle/fluid/memory/allocation/allocator_facade.h" DEFINE_int64(limit_of_tmp_allocation, -1, From 5ab56871386c883c3161191c85e1c7f03d51c9a1 Mon Sep 17 00:00:00 2001 From: Zhen Wang <wangzhen31@baidu.com> Date: Thu, 28 Mar 2019 10:37:34 +0800 Subject: [PATCH 13/19] remove no necessary doc changes. test=develop --- python/paddle/fluid/framework.py | 200 ++++++++++++++++++++++++++++++- 1 file changed, 198 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index a209f389f3..7abd2a23aa 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -627,6 +627,183 @@ class Variable(object): """ self.error_clip = error_clip + def _slice_indices(self, slice, length): + """ + Reference implementation for the slice.indices method. + """ + # Compute step and length as integers. + step = 1 if slice.step is None else slice.step + + # Raise ValueError for negative length or zero step. + if length < 0: + raise ValueError("length should not be negative") + if step == 0: + raise ValueError("slice step cannot be zero") + + # Find lower and upper bounds for start and stop. + lower = -1 if step < 0 else 0 + upper = length - 1 if step < 0 else length + + # Compute start. + if slice.start is None: + start = upper if step < 0 else lower + else: + start = slice.start + start = max(start + length, lower) if start < 0 else min(start, + upper) + + # Compute stop. 
+ if slice.stop is None: + stop = lower if step < 0 else upper + else: + stop = slice.stop + stop = max(stop + length, lower) if stop < 0 else min(stop, upper) + + return start, stop, step + + def _detectEllipsis(self, item): + has_ellipsis = False + start = 0 + end = len(self.shape) + for index, o in enumerate(item): + if o is Ellipsis: + if has_ellipsis: + raise ValueError("Index can have one ellipsis only.") + has_ellipsis = True + start = index + else: + if has_ellipsis: + end = index + return has_ellipsis, start, end + + def _reconstructSliceinfo(self, item): + has_ellipsis, start, end = self._detectEllipsis(item) + if has_ellipsis: + newitem = [] + for i in range(start): + newitem.append(item[i]) + for i in range(start, end): + newitem.append(slice(None, None, None)) + for i in range(end, len(item)): + newitem.append(item[i]) + return newitem + else: + return None + + def _detectContinuesSlice(self, item): + starts = [] + ends = [] + for index, o in enumerate(item): + if isinstance(o, int): + start = int(o) + if (index > 0 and index >= self.shape[index]) \ + or (index < 0 and (index + self.shape[index]) < 0): + raise IndexError("invalid index") + start = max(start + self.shape[index], 0) if start < 0 else min( + start, self.shape[index]) + starts.append(start) + ends.append(start + 1) + elif isinstance(o, slice): + start, stop, step = self._slice_indices(o, self.shape[index]) + if step == 1 or step == -1: + starts.append(start) + ends.append(stop) + else: + return False, None + else: + raise IndexError("Valid index accept int or slice or ellipsis") + return True, [starts, ends] + + def _cloneVar(self, copy=False): + if not copy: + return self.block.create_var( + name=unique_name.generate(".".join(self.name)), + dtype=self.dtype, + persistable=self.persistable, + stop_gradient=self._stop_gradient, ) + else: + return self + + def _sliceVar(self, axes, starts, ends): + new_var = self._cloneVar() + self.block.append_op( + type="slice", + inputs={'Input': [self]}, + outputs={'Out': [new_var]}, + attrs={'axes': axes, + 'starts': starts, + 'ends': ends}) + return new_var + + def _concatVar(self, inputs, axis): + new_var = self._cloneVar() + self.block.append_op( + type="concat", + inputs={'X': inputs}, + outputs={'Out': [new_var]}, + attrs={'axis': axis, }) + return new_var + + def _sliceAndConcatVar(self, item, axis): + if isinstance(item, slice): + if self.shape[axis] < 0: + return self._cloneVar(True) + start, stop, step = self._slice_indices(item, self.shape[axis]) + if step == 1: + return self._sliceVar([axis], [start], [stop]) + else: + vars = [] + if step > 0: + while start < stop: + vars.append( + self._sliceVar([axis], [start], [start + 1])) + start += step + else: + while start > stop: + vars.append( + self._sliceVar([axis], [start], [start + 1])) + start += step + return self._concatVar(vars, axis) + elif isinstance(item, int): + if self.shape[axis] < 0: + return self._cloneVar(True) + index = int(item) + if (index > 0 and index >= self.shape[axis])\ + or (index < 0 and (index + self.shape[axis]) < 0): + raise IndexError("invalid index") + return self._sliceVar([axis], [index], [index + 1]) + else: + raise IndexError("Valid index accept int or slice or tuple") + + def __getitem__(self, item): + """ + Slice the variable. + + Args: + item(int/slice/tuple) : the index. 
+ + Returns: + Sliced variable + """ + new_var = None + if isinstance(item, tuple): + if len(item) > len(self.shape): + raise IndexError("Too many indexes") + newitem = self._reconstructSliceinfo(item) or item + check, info = self._detectContinuesSlice(newitem) + if check: + starts = info[0] + ends = info[1] + axes = [i for i in range(len(starts))] + return self._sliceVar(axes, starts, ends) + else: + new_var = self + for index, o in enumerate(newitem): + new_var = new_var._sliceAndConcatVar(o, index) + else: + new_var = self._sliceAndConcatVar(item, 0) + return new_var + def get_all_op_protos(): """ @@ -744,7 +921,7 @@ class Operator(object): if _in_imperative_mode(): if type is None: raise ValueError( - "`type` to initilized an Operator can not be None.") + "`type` to initialized an Operator can not be None.") self.iop = core.OpBase(type) # TODO(minqiyang): remove these lines after we take apart all @@ -906,7 +1083,10 @@ class Operator(object): @property def type(self): - return self.desc.type() + if _in_imperative_mode(): + return self.iop.type + else: + return self.desc.type() def input(self, name): """ @@ -1022,6 +1202,9 @@ class Operator(object): """ self._update_desc_attr(name, val) + def _remove_attr(self, name): + self.desc.remove_attr(name) + def _update_desc_attr(self, name, val): """ Update the value of desc's attribute by attribute's name. @@ -2515,6 +2698,10 @@ class Program(object): self._trainers_endpoints = [] # the distributed lookup table names self._distributed_lookup_table = None + + # use Deep gradient comrepssion or not + self._enable_dgc = False + # @deprecated(the python memory optimize transpiler is deprecated) # whether the program is optimized by memory_optimize_transpiler self.__is_mem_optimized = False @@ -2565,6 +2752,15 @@ class Program(object): def set_op_role_var(self, var_name): self._op_role_var = [var_name] + @contextlib.contextmanager + def _backward_role_guard(self): + tmp_role = self._current_role + + OpRole = core.op_proto_and_checker_maker.OpRole + self._current_role = OpRole.Backward + yield + self._current_role = tmp_role + @signature_safe_contextmanager def _optimized_guard(self, param_and_grads): """ From ecc3088df830f8574cef7d4f859d93946e93be5c Mon Sep 17 00:00:00 2001 From: whs <wanghaoshuang@baidu.com> Date: Thu, 28 Mar 2019 10:49:26 +0800 Subject: [PATCH 14/19] Fix saving in quantization strategy. (#16474) test=develop --- .../fluid/contrib/slim/quantization/quantization_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py index 6812b4c633..c4b02166ab 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py @@ -152,7 +152,7 @@ class QuantizationStrategy(Strategy): ] if self.save_in_nodes == None: - in_vars = list(context.eval_graph.out_nodes.values()) + in_vars = list(context.eval_graph.in_nodes.values()) else: in_vars = self.save_in_nodes From 59f75ec76e8fea156e97bea8739bb3bd4e27bf87 Mon Sep 17 00:00:00 2001 From: whs <wanghaoshuang@baidu.com> Date: Thu, 28 Mar 2019 11:51:22 +0800 Subject: [PATCH 15/19] Make unitest of fsp op faster and more stable. (#16502) * Make unitest of fsp op faster and more stable. test=develop * Skip unitest of fsp op. 
test=develop --- python/paddle/fluid/tests/unittests/test_fsp_op.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_fsp_op.py b/python/paddle/fluid/tests/unittests/test_fsp_op.py index 6ad7418447..01991f4d36 100644 --- a/python/paddle/fluid/tests/unittests/test_fsp_op.py +++ b/python/paddle/fluid/tests/unittests/test_fsp_op.py @@ -39,19 +39,21 @@ class TestFSPOp(OpTest): self.op_type = "fsp" self.initTestCase() - feature_map_0 = np.random.uniform(0, 10, self.a_shape).astype('float32') - feature_map_1 = np.random.uniform(0, 10, self.b_shape).astype('float32') + feature_map_0 = np.random.uniform(0, 10, self.a_shape).astype('float64') + feature_map_1 = np.random.uniform(0, 10, self.b_shape).astype('float64') self.inputs = {'X': feature_map_0, 'Y': feature_map_1} self.outputs = {'Out': fsp_matrix(feature_map_0, feature_map_1)} def initTestCase(self): - self.a_shape = (2, 16, 32, 31) - self.b_shape = (2, 28, 32, 31) + self.a_shape = (2, 3, 5, 6) + self.b_shape = (2, 4, 5, 6) + @unittest.skip("Disable temporarily.") def test_check_output(self): self.check_output() + @unittest.skip("Disable temporarily.") def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05) From ed61d67c737590ebf2819ca9770a9a6d4e294880 Mon Sep 17 00:00:00 2001 From: chengduo <zhaochengduo@baidu.com> Date: Wed, 27 Mar 2019 23:37:17 -0500 Subject: [PATCH 16/19] Fix the interface of Pass::Apply (#16484) * modify the interface of Pass::Allay test=develop * Polish code test=develop * Fix Travis CI test=develop * fix Pass::Apply interface test=develop * Fix Travis CI test=develop --- .../framework/details/all_reduce_deps_pass.cc | 5 +- .../framework/details/all_reduce_deps_pass.h | 3 +- .../alloc_continuous_space_for_grad_pass.cc | 7 +-- .../fluid/framework/details/build_strategy.cc | 17 +++---- .../fluid/framework/details/build_strategy.h | 15 +++--- .../framework/details/eager_deletion_pass.cc | 8 ++-- .../details/fuse_all_reduce_op_pass.cc | 6 +-- .../framework/details/inplace_op_pass.cc | 9 ++-- .../fluid/framework/details/inplace_op_pass.h | 3 +- .../framework/details/memory_optimize_pass.cc | 7 +-- .../framework/details/memory_optimize_pass.h | 4 +- .../modify_op_lock_and_record_event_pass.cc | 4 +- .../modify_op_lock_and_record_event_pass.h | 3 +- .../details/multi_devices_graph_check_pass.cc | 6 +-- .../details/multi_devices_graph_pass.cc | 4 +- .../details/multi_devices_graph_pass.h | 3 +- .../details/multi_devices_graph_print_pass.cc | 2 + .../details/multi_devices_graph_print_pass.h | 5 +- .../details/parallel_ssa_graph_executor.cc | 2 +- .../framework/details/reference_count_pass.cc | 5 +- .../framework/details/reference_count_pass.h | 3 +- .../details/sequential_execution_pass.cc | 4 +- .../details/sequential_execution_pass.h | 3 +- .../details/while_op_eager_deletion_pass.cc | 4 +- ...anakin_fillconstant_elementwisemul_fuse.cc | 10 ++-- .../anakin_fillconstant_elementwisemul_fuse.h | 3 +- .../framework/ir/attention_lstm_fuse_pass.cc | 9 ++-- .../framework/ir/attention_lstm_fuse_pass.h | 3 +- .../ir/conv_affine_channel_fuse_pass.cc | 24 ++++------ .../ir/conv_affine_channel_fuse_pass.h | 6 +-- .../fluid/framework/ir/conv_bn_fuse_pass.cc | 31 +++++------- paddle/fluid/framework/ir/conv_bn_fuse_pass.h | 6 +-- .../ir/conv_elementwise_add2_act_fuse.cc | 6 +-- .../ir/conv_elementwise_add2_act_fuse_pass.cc | 13 ++--- .../ir/conv_elementwise_add2_act_fuse_pass.h | 3 +- .../ir/conv_elementwise_add_act_fuse_pass.cc | 12 
++--- .../ir/conv_elementwise_add_act_fuse_pass.h | 3 +- .../ir/conv_elementwise_add_fuse_pass.cc | 10 ++-- .../ir/conv_elementwise_add_fuse_pass.h | 3 +- .../ir/embedding_fc_lstm_fuse_pass.cc | 14 +++--- .../ir/embedding_fc_lstm_fuse_pass.h | 3 +- paddle/fluid/framework/ir/fc_fuse_pass.cc | 13 +++-- paddle/fluid/framework/ir/fc_fuse_pass.h | 3 +- .../fluid/framework/ir/fc_fuse_pass_tester.cc | 2 +- paddle/fluid/framework/ir/fc_gru_fuse_pass.cc | 22 ++++----- paddle/fluid/framework/ir/fc_gru_fuse_pass.h | 6 +-- .../fluid/framework/ir/fc_lstm_fuse_pass.cc | 21 ++++---- paddle/fluid/framework/ir/fc_lstm_fuse_pass.h | 6 +-- .../framework/ir/fuse_elewise_add_act_pass.cc | 48 +++++++++---------- .../framework/ir/fuse_elewise_add_act_pass.h | 20 ++++---- .../ir/fuse_relu_depthwise_conv_pass.cc | 24 +++++----- .../ir/fuse_relu_depthwise_conv_pass.h | 6 +-- .../framework/ir/graph_to_program_pass.cc | 6 +-- .../framework/ir/graph_to_program_pass.h | 2 +- .../ir/graph_to_program_pass_test.cc | 4 +- paddle/fluid/framework/ir/graph_viz_pass.cc | 13 ++--- paddle/fluid/framework/ir/graph_viz_pass.h | 4 +- .../ir/identity_scale_op_clean_pass.cc | 8 ++-- .../ir/identity_scale_op_clean_pass.h | 3 +- .../framework/ir/infer_clean_graph_pass.cc | 10 ++-- paddle/fluid/framework/ir/is_test_pass.cc | 4 +- paddle/fluid/framework/ir/is_test_pass.h | 3 +- .../fluid/framework/ir/is_test_pass_tester.cc | 2 +- .../framework/ir/lock_free_optimize_pass.cc | 11 ++--- .../framework/ir/lock_free_optimize_pass.h | 3 +- .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc | 14 +++--- .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.h | 3 +- .../conv_bias_mkldnn_fuse_pass_tester.cc | 4 +- .../conv_elementwise_add_mkldnn_fuse_pass.cc | 12 ++--- .../conv_elementwise_add_mkldnn_fuse_pass.h | 5 +- ...elementwise_add_mkldnn_fuse_pass_tester.cc | 4 +- .../ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc | 12 ++--- .../ir/mkldnn/conv_relu_mkldnn_fuse_pass.h | 3 +- .../conv_relu_mkldnn_fuse_pass_tester.cc | 2 +- .../framework/ir/mkldnn/cpu_quantize_pass.cc | 15 +++--- .../framework/ir/mkldnn/cpu_quantize_pass.h | 3 +- .../ir/mkldnn/cpu_quantize_pass_tester.cc | 2 +- .../ir/mkldnn/cpu_quantize_placement_pass.cc | 4 +- .../ir/mkldnn/cpu_quantize_placement_pass.h | 3 +- .../cpu_quantize_placement_pass_tester.cc | 2 +- .../ir/mkldnn/cpu_quantize_squash_pass.cc | 13 ++--- .../ir/mkldnn/cpu_quantize_squash_pass.h | 3 +- .../mkldnn/cpu_quantize_squash_pass_tester.cc | 2 +- .../ir/mkldnn/depthwise_conv_mkldnn_pass.cc | 10 ++-- .../ir/mkldnn/depthwise_conv_mkldnn_pass.h | 3 +- .../depthwise_conv_mkldnn_pass_tester.cc | 2 +- .../ir/mkldnn/mkldnn_placement_pass.cc | 5 +- .../ir/mkldnn/mkldnn_placement_pass.h | 3 +- .../ir/mkldnn/mkldnn_placement_pass_tester.cc | 2 +- .../framework/ir/multi_batch_merge_pass.cc | 7 ++- .../framework/ir/multi_batch_merge_pass.h | 2 +- paddle/fluid/framework/ir/pass.cc | 14 +++--- paddle/fluid/framework/ir/pass.h | 9 ++-- paddle/fluid/framework/ir/pass_test.cc | 15 +++--- .../ir/repeated_fc_relu_fuse_pass.cc | 10 ++-- .../framework/ir/repeated_fc_relu_fuse_pass.h | 3 +- .../ir/runtime_context_cache_pass.cc | 4 +- .../framework/ir/runtime_context_cache_pass.h | 3 +- .../framework/ir/seq_concat_fc_fuse_pass.cc | 15 +++--- .../framework/ir/seq_concat_fc_fuse_pass.h | 3 +- .../ir/seqconv_eltadd_relu_fuse_pass.cc | 10 ++-- .../ir/seqconv_eltadd_relu_fuse_pass.h | 3 +- .../framework/ir/seqpool_concat_fuse_pass.cc | 10 ++-- .../framework/ir/seqpool_concat_fuse_pass.h | 3 +- .../ir/seqpool_concat_fuse_pass_tester.cc | 2 +- 
.../simplify_anakin_detection_pattern_pass.cc | 11 ++--- .../simplify_anakin_detection_pattern_pass.h | 3 +- .../framework/ir/squared_mat_sub_fuse_pass.cc | 10 ++-- .../framework/ir/squared_mat_sub_fuse_pass.h | 3 +- .../framework/ir/sync_batch_norm_pass.cc | 4 +- .../fluid/framework/ir/sync_batch_norm_pass.h | 3 +- .../ir/sync_batch_norm_pass_tester.cc | 2 +- .../ir/transpose_flatten_concat_fuse_pass.cc | 10 ++-- .../ir/transpose_flatten_concat_fuse_pass.h | 3 +- paddle/fluid/framework/parallel_executor.cc | 40 ++++++---------- .../inference/analysis/ir_pass_manager.cc | 4 +- .../ir_passes/anakin_subgraph_pass.cc | 6 +-- .../analysis/ir_passes/anakin_subgraph_pass.h | 3 +- .../ir_passes/tensorrt_subgraph_pass.cc | 17 +++---- .../ir_passes/tensorrt_subgraph_pass.h | 3 +- .../passes/ir_graph_to_program_pass.cc | 4 +- paddle/fluid/pybind/pybind.cc | 4 +- 122 files changed, 370 insertions(+), 539 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc index 98a74d630c..d93c84606d 100644 --- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc @@ -42,8 +42,7 @@ VarHandle* GetValidInput(const OpHandleBase* a) { return nullptr; } -std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const { auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph); // get vars order @@ -131,8 +130,6 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl( VLOG(10) << "pre_op:" << pre_op->DebugString() << ", op:" << op->DebugString(); } - - return graph; } } // namespace details diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.h b/paddle/fluid/framework/details/all_reduce_deps_pass.h index e8b9108981..4ed3736587 100644 --- a/paddle/fluid/framework/details/all_reduce_deps_pass.h +++ b/paddle/fluid/framework/details/all_reduce_deps_pass.h @@ -24,8 +24,7 @@ namespace details { // TODO(gongwb): overlap allreduce with backward computation. 
class AllReduceDepsPass : public ir::Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace details diff --git a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc index fbc8bbf56b..e195e93fb8 100644 --- a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc +++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc @@ -46,8 +46,7 @@ static framework::proto::VarType::Type kDefaultDtype = class AllocContinuousSpaceForGradPass : public ir::Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override { + void ApplyImpl(ir::Graph *graph) const override { ir::Graph &result = *graph; auto &places = Get<const std::vector<platform::Place>>(kPlaces); @@ -65,7 +64,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { if (params_grads.size() == 0) { VLOG(10) << "Doesn't find gradients"; - return std::move(graph); + return; } std::unordered_map<std::string, ir::Node *> vars; @@ -124,8 +123,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars, fused_var_name, params_grads); - - return std::move(graph); } template <typename AttrType> diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 5d9db23753..078403f30f 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -204,15 +204,16 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const { return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0; } -std::unique_ptr<ir::Graph> BuildStrategy::Apply( - std::unique_ptr<ir::Graph> graph, - const std::vector<platform::Place> &places, - const std::string &loss_var_name, const std::vector<Scope *> &local_scopes, - const size_t &nranks, +ir::Graph *BuildStrategy::Apply(ir::Graph *graph, + const std::vector<platform::Place> &places, + const std::string &loss_var_name, + const std::vector<Scope *> &local_scopes, + const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { + const bool use_cuda, + platform::NCCLContextMap *nccl_ctxs) const { #else - const bool use_cuda) const { + const bool use_cuda) const { #endif // Create a default one if not finalized by user. CreatePassesFromStrategy(false); @@ -265,7 +266,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( } } VLOG(3) << "Start Apply Pass " << pass->Type(); - graph = pass->Apply(std::move(graph)); + graph = pass->Apply(graph); VLOG(3) << "Finish Apply Pass " << pass->Type(); } return graph; diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 4b599fb914..9587a6f0f9 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -120,16 +120,15 @@ struct BuildStrategy { // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. 
- std::unique_ptr<ir::Graph> Apply(std::unique_ptr<ir::Graph> graph, - const std::vector<platform::Place> &places, - const std::string &loss_var_name, - const std::vector<Scope *> &local_scopes, - const size_t &nranks, + ir::Graph *Apply(ir::Graph *graph, const std::vector<platform::Place> &places, + const std::string &loss_var_name, + const std::vector<Scope *> &local_scopes, + const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - const bool use_cuda, - platform::NCCLContextMap *nccl_ctxs) const; + const bool use_cuda, + platform::NCCLContextMap *nccl_ctxs) const; #else - const bool use_cuda) const; + const bool use_cuda) const; #endif // If set true, ParallelExecutor would build the main_program into multiple diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index a6baa26134..622a59b4c2 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -170,12 +170,10 @@ static OpToVarNameSetMap ShrinkGCVars( class EagerDeletionPass : public ir::Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph *graph) const override; }; -std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const { auto &ref_cnts = Get<std::vector<AtomicReferenceCountMap>>(kRuntimeReferenceCount); PADDLE_ENFORCE(ref_cnts.empty(), @@ -240,7 +238,7 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl( auto while_op_eager_deletion_pass = ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass"); - return while_op_eager_deletion_pass->Apply(std::move(graph)); + while_op_eager_deletion_pass->Apply(graph); } } // namespace details diff --git a/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc index f226491c9f..31efd78ad3 100644 --- a/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc +++ b/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc @@ -28,8 +28,7 @@ namespace details { class FuseAllReduceOpPass : public ir::Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override { + void ApplyImpl(ir::Graph *graph) const override { ir::Graph &result = *graph; auto &places = Get<const std::vector<platform::Place>>(kPlaces); @@ -71,7 +70,7 @@ class FuseAllReduceOpPass : public ir::Pass { VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size(); if (all_reduce_ops.size() == 0) { - return std::move(graph); + return; } PADDLE_ENFORCE_EQ(all_reduce_ops.size(), grads.size(), @@ -99,7 +98,6 @@ class FuseAllReduceOpPass : public ir::Pass { group_all_reduce_ops, &result); #endif } - return std::move(graph); } void InsertFusedAllReduce(const std::vector<platform::Place> &places, diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc index 88f26b4161..afbda33b06 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.cc +++ b/paddle/fluid/framework/details/inplace_op_pass.cc @@ -144,10 +144,9 @@ void InplacePass::InitSSAGraphNodes() const { } } -std::unique_ptr<ir::Graph> InplacePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void InplacePass::ApplyImpl(ir::Graph* graph) const { var_nodes_.clear(); - view_.Build(graph.get()); + view_.Build(graph); InitSSAGraphNodes(); auto cnt = 0; @@ 
-155,11 +154,9 @@ std::unique_ptr<ir::Graph> InplacePass::ApplyImpl( VLOG(4) << "Handle op " << cnt++ << ": " << op->Name(); if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name())) continue; - TryInplaceOpInputOutput(op, graph.get()); + TryInplaceOpInputOutput(op, graph); } // graph->ResolveHazard(var_nodes_); - - return graph; } void InplacePass::InplaceModifyDesc(const std::string& var, diff --git a/paddle/fluid/framework/details/inplace_op_pass.h b/paddle/fluid/framework/details/inplace_op_pass.h index 01964ba8fc..fbec973dda 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.h +++ b/paddle/fluid/framework/details/inplace_op_pass.h @@ -69,8 +69,7 @@ class InplacePass : public ir::Pass { InplacePass(); protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; void InitSSAGraphNodes() const; diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc index 80720af32d..ddaef20602 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -44,8 +44,7 @@ namespace paddle { namespace framework { namespace details { -std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void MemoryOptimizePass::ApplyImpl(ir::Graph* graph) const { auto nodes = graph->Nodes(); CollectSkipVarsSet(nodes); @@ -113,7 +112,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( cfg_->RenameVarInCFGGraph(var_name, cache_name, idx); RenameVarInGraphDesc(var_name, cache_name, idx); - RenameVarInGraphNode(var_name, cache_name, idx, graph.get()); + RenameVarInGraphNode(var_name, cache_name, idx, graph); pool_.Erase(cache_name); } } @@ -128,8 +127,6 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( } } graph->ResolveHazard(var_nodes_); - - return graph; } void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { diff --git a/paddle/fluid/framework/details/memory_optimize_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h index 593ffc10fc..ce94890b38 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.h +++ b/paddle/fluid/framework/details/memory_optimize_pass.h @@ -21,6 +21,7 @@ #include <set> #include <string> #include <unordered_map> +#include <unordered_set> #include <utility> #include <vector> @@ -35,8 +36,7 @@ namespace details { class MemoryOptimizePass : public ir::Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; // fill the variable map(var_nodes) by version. 
void InitSSAGraphNodes() const; diff --git a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc index 67aad9f94f..ae363f9639 100644 --- a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc +++ b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc @@ -34,8 +34,7 @@ static bool IsLockAndRecordEventFreeComputationOpHandle( return true; } -std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl( - std::unique_ptr<ir::Graph> ir_graph) const { +void ModifyOpLockAndRecordEventPass::ApplyImpl(ir::Graph *ir_graph) const { auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*ir_graph); OpGraphView graph_view(all_ops); for (auto &op : all_ops) { @@ -49,7 +48,6 @@ std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl( << compute_op->DebugString(); } } - return ir_graph; } } // namespace details diff --git a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h index b54e1b318b..54d52d6240 100644 --- a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h +++ b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h @@ -23,8 +23,7 @@ namespace details { class ModifyOpLockAndRecordEventPass : public ir::Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace details diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc index a4bb1e26d9..9859b04dec 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc @@ -23,10 +23,8 @@ namespace details { class SSAGraghBuilderWithChecker : public ir::Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override { - PADDLE_ENFORCE(IsValidGraph(graph.get())); - return graph; + void ApplyImpl(ir::Graph *graph) const override { + PADDLE_ENFORCE(IsValidGraph(graph)); } bool IsValidGraph(const ir::Graph *graph) const { diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 8c61684c9c..f80a098bfa 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -153,8 +153,7 @@ void MultiDevSSAGraphBuilderBase::Init() const { PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); } -std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void MultiDevSSAGraphBuilderBase::ApplyImpl(ir::Graph *graph) const { Init(); CheckGraph(*graph); std::vector<ir::Node *> sorted_ops = SortOperations(*graph); @@ -236,7 +235,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( AddOutputToLeafOps(&result); result.Erase(kGraphOps); - return graph; } void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 8bfd7b9bf8..884089df38 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -36,8 +36,7 @@ namespace details { class 
MultiDevSSAGraphBuilderBase : public ir::Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph *graph) const override; virtual void Init() const; diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc index e82eb104fa..34c38ea81a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" +#include <memory> #include <string> +#include <unordered_map> #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h index b06c87a5c1..6d57d75e8a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h @@ -17,6 +17,7 @@ #include <glog/logging.h> #include <fstream> #include <iosfwd> +#include <memory> #include <ostream> #include <string> #include "paddle/fluid/framework/details/multi_devices_helper.h" @@ -40,13 +41,11 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter { class SSAGraghBuilderWithPrinter : public ir::Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override { + void ApplyImpl(ir::Graph* graph) const override { std::unique_ptr<std::ostream> fout( new std::ofstream(Get<std::string>(kGraphvizPath))); PADDLE_ENFORCE(fout->good()); Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout); - return graph; } }; diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 2afac32437..137e0dd770 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -96,7 +96,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( auto seq_allreduce_pass = ir::PassRegistry::Instance().Get("all_reduce_deps_pass"); for (size_t i = 0; i < graphs_.size(); ++i) { - graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i])); + graphs_[i].reset(seq_allreduce_pass->Apply(graphs_[i].release())); } // set the correct size of thread pool to each device. 
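
[Note on the interface migrated by this patch: passes no longer consume and return a std::unique_ptr<ir::Graph>; ApplyImpl mutates the graph in place, and callers that own the graph keep ownership by pairing release() with reset(), as the parallel_ssa_graph_executor.cc hunk above shows. The minimal sketch below is illustrative only and is not part of this patch; the pass name CountNodesPass is hypothetical, and it assumes the usual Paddle pass headers.]

// Illustrative sketch only -- not part of this patch.
#include <memory>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace ir {

// Pass-author side: ApplyImpl now takes a raw ir::Graph* and returns void,
// instead of taking and returning std::unique_ptr<ir::Graph>.
class CountNodesPass : public Pass {  // hypothetical example pass
 protected:
  void ApplyImpl(ir::Graph *graph) const override {
    // Mutate or inspect the graph in place; nothing is returned.
    VLOG(3) << "graph has " << graph->Nodes().size() << " nodes";
  }
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(count_nodes_pass, paddle::framework::ir::CountNodesPass);

// Caller side: Pass::Apply(ir::Graph*) returns a raw ir::Graph*, so a caller
// holding a std::unique_ptr<ir::Graph> retains ownership like this:
//   auto pass = ir::PassRegistry::Instance().Get("count_nodes_pass");
//   graph.reset(pass->Apply(graph.release()));
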
diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index c218e55b70..25337872c1 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -266,8 +266,7 @@ static bool ShrinkNoNeedBufferVarOpDependency( } } -std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const { auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount); auto &last_live_ops_of_vars = Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars); @@ -342,8 +341,6 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( // Just skip this corner case } } - - return graph; } } // namespace details diff --git a/paddle/fluid/framework/details/reference_count_pass.h b/paddle/fluid/framework/details/reference_count_pass.h index bcbef02735..7bb01ee616 100644 --- a/paddle/fluid/framework/details/reference_count_pass.h +++ b/paddle/fluid/framework/details/reference_count_pass.h @@ -23,8 +23,7 @@ namespace details { class ReferenceCountPass : public ir::Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace details diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc index 0b53a76e78..839f8dc43e 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.cc +++ b/paddle/fluid/framework/details/sequential_execution_pass.cc @@ -29,8 +29,7 @@ static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) { op1->Outputs() == op2->Outputs(); } -std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void SequentialExecutionPass::ApplyImpl(ir::Graph *graph) const { // FIXME(zjl): Insert dependencies between some distributed ops may cause // the multi_devices_graph_pass fails. So we skip these ops here. 
// Indeed, maybe we should not insert dependencies between these ops @@ -98,7 +97,6 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl( VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name() << " and " << op_node_list[i]->Name(); } - return graph; } } // namespace details diff --git a/paddle/fluid/framework/details/sequential_execution_pass.h b/paddle/fluid/framework/details/sequential_execution_pass.h index ea3034877f..7d6a4f4cc5 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.h +++ b/paddle/fluid/framework/details/sequential_execution_pass.h @@ -23,8 +23,7 @@ namespace details { class SequentialExecutionPass : public ir::Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace details diff --git a/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc index fd6b6dd227..8f7c99f12a 100644 --- a/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc @@ -23,8 +23,7 @@ namespace details { class WhileOpEagerDeletionPass : public ir::Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override { + void ApplyImpl(ir::Graph *graph) const override { auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph); // Find all while_op and while_grad_op @@ -50,7 +49,6 @@ class WhileOpEagerDeletionPass : public ir::Pass { operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( while_ops, while_grad_ops); } - return graph; } }; diff --git a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc index 83b0da0c01..39077f6420 100644 --- a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc +++ b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc @@ -29,10 +29,9 @@ namespace ir { GET_IR_NODE(elementwise_mul); \ GET_IR_NODE(elementwise_mul_out); -std::unique_ptr<ir::Graph> AnakinFillconstantElementwisemulFuse::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse"; - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() @@ -69,12 +68,11 @@ std::unique_ptr<ir::Graph> AnakinFillconstantElementwisemulFuse::ApplyImpl( IR_NODE_LINK_TO(scale_op, elementwise_mul_out); // Output // Delete the unneeded nodes. 
- GraphSafeRemoveNodes(graph.get(), + GraphSafeRemoveNodes(graph, {fill_constant, fill_constant_out, elementwise_mul}); }; - gpd(graph.get(), handler); - return graph; + gpd(graph, handler); } } // namespace ir diff --git a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h index fa95143d3a..14c07c5884 100644 --- a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h +++ b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h @@ -26,8 +26,7 @@ class AnakinFillconstantElementwisemulFuse : public FusePassBase { virtual ~AnakinFillconstantElementwisemulFuse() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index a9897e0bb8..5a82d7927f 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h" #include <string> +#include <unordered_set> #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -253,8 +254,7 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, // Parameters -std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void AttentionLSTMFusePass::ApplyImpl(ir::Graph* graph) const { PDPattern external_pattern, subblock_pattern; // Use the following variables to tell whether this model is RNN1. @@ -269,12 +269,11 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl( } } if (count < specified_vars.size()) { - return graph; + return; } // Continue to fuse. 
- FindWhileOp(graph.get()); - return graph; + FindWhileOp(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h index 39b0585d3a..47ed9f0393 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h @@ -22,8 +22,7 @@ namespace ir { class AttentionLSTMFusePass : public FusePassBase { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc index a7bfb8cf1e..fecc159ade 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -77,10 +77,9 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, weights_array_2d.colwise() *= scale_array; } -std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init(name_scope_, graph.get()); +void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); PADDLE_ENFORCE(scope); @@ -139,7 +138,7 @@ std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl( desc.SetAttr("axis", 1); auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. - GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel}); + GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); IR_NODE_LINK_TO(conv_out, eltwise_op); IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); @@ -147,16 +146,14 @@ std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl( found_conv_ac_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_conv_ac_count); - return graph; } -std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init(name_scope_, graph.get()); +void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); PADDLE_ENFORCE(scope); @@ -199,7 +196,7 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl( eltwise->Op()->SetAttr("axis", 1); eltwise->Op()->SetOutput("Out", std::vector<std::string>({ac_out->Name()})); - GraphSafeRemoveNodes(graph.get(), + GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel, eltwise_out}); IR_NODE_LINK_TO(eltwise, ac_out); @@ -207,9 +204,8 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl( found_conv_ac_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_conv_ac_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h index 8c3c8b56c0..d607020a47 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h @@ -31,8 +31,7 @@ class ConvAffineChannelFusePass : public FusePassBase { virtual ~ConvAffineChannelFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) 
const override; + void ApplyImpl(ir::Graph*) const override; const std::string name_scope_{"conv_affine_channel_fuse"}; }; @@ -41,8 +40,7 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { virtual ~ConvEltwiseAddAffineChannelFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph*) const override; const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; }; diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 04765dd144..876a999645 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -101,10 +101,9 @@ void recompute_bias_and_weights(const Scope* scope, weights_array_2d.colwise() *= variance_array; } -std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init(name_scope_, graph.get()); +void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); PADDLE_ENFORCE(scope); @@ -187,7 +186,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl( std::vector<std::string>({bn_out->Name()})); GraphSafeRemoveNodes( - graph.get(), + graph, {conv_out, bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance}); @@ -203,10 +202,9 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl( desc.SetAttr("axis", 1); auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. - GraphSafeRemoveNodes( - graph.get(), - {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out, - bn_variance_out, bn_saved_mean, bn_saved_variance}); + GraphSafeRemoveNodes(graph, {bn_scale, bn_bias, bn_mean, bn_variance, + batch_norm, bn_mean_out, bn_variance_out, + bn_saved_mean, bn_saved_variance}); IR_NODE_LINK_TO(conv_out, eltwise_op); IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); @@ -215,16 +213,14 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl( } }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_conv_bn_count); - return graph; } -std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init(name_scope_, graph.get()); +void ConvEltwiseAddBNFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); PADDLE_ENFORCE(scope); @@ -274,7 +270,7 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl( eltwise->Op()->SetOutput("Out", std::vector<std::string>({bn_out->Name()})); GraphSafeRemoveNodes( - graph.get(), + graph, {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance, eltwise_out}); @@ -283,10 +279,9 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl( found_conv_bn_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_conv_bn_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h index cf425a2730..837a48ed73 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h @@ -31,8 +31,7 @@ class ConvBNFusePass : public FusePassBase { virtual ~ConvBNFusePass() {} protected: - 
std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"conv_bn_fuse"}; }; @@ -41,8 +40,7 @@ class ConvEltwiseAddBNFusePass : public FusePassBase { virtual ~ConvEltwiseAddBNFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"conv_eltwiseadd_bn_fuse"}; }; diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc index 6e9905b7ec..99bc5fe8c5 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc @@ -50,10 +50,9 @@ framework::proto::OpDesc PrepareOpDesc( return *desc.Proto(); } -std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "conv_elementwise_add_act_fuse"; - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( @@ -95,7 +94,6 @@ std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl( elementwise_add_out}); }; gpd(graph.get(), handler); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index c6121777e8..b4d6f683ce 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -51,10 +51,9 @@ framework::proto::OpDesc PrepareOpDesc( return *desc.Proto(); } -std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "conv_elementwise_add2_act_fuse"; - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( @@ -92,12 +91,10 @@ std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl( // Delete the unneeded nodes. 
GraphSafeRemoveNodes( - graph.get(), - {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1, - elementwise_add_out, elementwise_add_out_1, act_op}); + graph, {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out, elementwise_add_out_1, act_op}); }; - gpd(graph.get(), handler); - return graph; + gpd(graph, handler); } } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h index 9259a4ac5c..ea9e465d8d 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h @@ -25,8 +25,7 @@ class ConvElementwiseAdd2ActFusePass : public FusePassBase { virtual ~ConvElementwiseAdd2ActFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index fe3b4fca79..ba0a2fb964 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -48,10 +48,9 @@ framework::proto::OpDesc PrepareOpDesc( return *desc.Proto(); } -std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "conv_elementwise_add_act_fuse"; - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() @@ -88,12 +87,11 @@ std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl( IR_NODE_LINK_TO(new_conv_op, act_out); // Output // Delete the unneeded nodes. 
- GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op, - elementwise_add_out, act_op}); + GraphSafeRemoveNodes(graph, {conv_op, conv_out, elementwise_add_op, + elementwise_add_out, act_op}); }; - gpd(graph.get(), handler); - return graph; + gpd(graph, handler); } } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h index 9c0b50f155..8b34c3551d 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h @@ -25,8 +25,7 @@ class ConvElementwiseAddActFusePass : public FusePassBase { virtual ~ConvElementwiseAddActFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc index 476c9dbc35..8c491d4f58 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -30,10 +30,9 @@ namespace ir { GET_IR_NODE(elementwise_add_in_y); \ GET_IR_NODE(elementwise_add_out); -std::unique_ptr<ir::Graph> ConvElementwiseAddFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "conv_elementwise_add_fuse"; - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() @@ -76,11 +75,10 @@ std::unique_ptr<ir::Graph> ConvElementwiseAddFusePass::ApplyImpl( IR_NODE_LINK_TO(new_conv_op, elementwise_add_out); // Output // Delete the unneeded nodes. - GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op}); + GraphSafeRemoveNodes(graph, {conv_op, conv_out, elementwise_add_op}); }; - gpd(graph.get(), handler); - return graph; + gpd(graph, handler); } } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h index bf43bd5ce2..66a562cdd1 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h @@ -25,8 +25,7 @@ class ConvElementwiseAddFusePass : public FusePassBase { virtual ~ConvElementwiseAddFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc index ba11f19c92..3a6bbe65b3 100644 --- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc @@ -15,6 +15,8 @@ #include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h" #include <algorithm> #include <string> +#include <unordered_set> +#include <vector> #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/operators/math/blas.h" @@ -201,7 +203,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, // Remove unneeded nodes. 
// TODO(jczaja): Proper removing of lookup table std::unordered_set<const Node*> marked_nodes( - //{lookup_table, mul, lstm, elementwise_add, fc_bias, W}); + // {lookup_table, mul, lstm, elementwise_add, fc_bias, W}); {mul, lstm, elementwise_add, fc_bias}); GraphSafeRemoveNodes(graph, marked_nodes); } else { @@ -224,15 +226,13 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, return fusion_count; } -std::unique_ptr<ir::Graph> EmbeddingFCLSTMFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); - int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), - true /*with_fc_bias*/); + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/); AddStatis(fusion_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h index fde2a0a4ee..65cb443972 100644 --- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h @@ -32,8 +32,7 @@ class EmbeddingFCLSTMFusePass : public FusePassBase { virtual ~EmbeddingFCLSTMFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"embedding_fc_lstm_fuse"}; }; diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 12b31da010..ca008763bf 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include <string> +#include <unordered_set> #include <vector> #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/enforce.h" @@ -22,10 +23,9 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("fc_fuse", graph.get()); +void FCFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("fc_fuse", graph); std::unordered_set<Node*> nodes2delete; @@ -61,7 +61,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims")); desc.SetType("fc"); auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. 
- GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out}); + GraphSafeRemoveNodes(graph, {mul, elementwise_add, mul_out}); PADDLE_ENFORCE(subgraph.count(x)); IR_NODE_LINK_TO(subgraph.at(x), fc_node); @@ -72,10 +72,9 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( found_fc_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_fc_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h index 783a052edc..0a0fcd2da8 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -31,8 +31,7 @@ class FCFusePass : public FusePassBase { virtual ~FCFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index 4e1e4e27f9..affe506910 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -73,7 +73,7 @@ TEST(FCFusePass, basic) { int pre_nodes = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int after_nodes = graph->Nodes().size(); diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index a902b0b50c..5f660c6d36 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" #include <string> +#include <unordered_set> #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { @@ -39,7 +40,6 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, // Create New OpDesc auto gru_creater = [&](Node* gru, Node* x, Node* weight_x, Node* weight_h, Node* bias, Node* hidden, Node* fc_bias) { - OpDesc op_desc; op_desc.SetType("fusion_gru"); @@ -155,26 +155,22 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, return fusion_count; } -std::unique_ptr<ir::Graph> MulGRUFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void MulGRUFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); - int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), - false /*with_fc_bias*/); + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), false /*with_fc_bias*/); AddStatis(fusion_count); - return graph; } -std::unique_ptr<ir::Graph> FCGRUFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); - int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), - true /*with_fc_bias*/); + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/); AddStatis(fusion_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h index e359a32894..e11cdac7ea 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h @@ -30,8 +30,7 @@ class FCGRUFusePass : public FusePassBase { virtual ~FCGRUFusePass() {} protected: - std::unique_ptr<ir::Graph> 
ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"fc_gru_fuse"}; }; @@ -42,8 +41,7 @@ class MulGRUFusePass : public FusePassBase { virtual ~MulGRUFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"fc_nobias_gru_fuse"}; }; diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index f5c2864865..babeba9614 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" #include <string> +#include <unordered_set> #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { @@ -157,26 +158,22 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, return fusion_count; } -std::unique_ptr<ir::Graph> MulLstmFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void MulLstmFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); - int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), - false /*with_fc_bias*/); + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), false /*with_fc_bias*/); AddStatis(fusion_count); - return graph; } -std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); - int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), - true /*with_fc_bias*/); + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/); AddStatis(fusion_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h index 21482615a6..5dea7c91a8 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h @@ -32,8 +32,7 @@ class FCLstmFusePass : public FusePassBase { virtual ~FCLstmFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"fc_lstm_fuse"}; }; @@ -43,8 +42,7 @@ class MulLstmFusePass : public FusePassBase { virtual ~MulLstmFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"fc_nobias_lstm_fuse"}; }; diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc index 648acc4a75..bd49673168 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc @@ -15,6 +15,8 @@ #include "paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h" #include <algorithm> #include <string> +#include <unordered_set> +#include <utility> #include <vector> #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" @@ -23,29 +25,25 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr<ir::Graph> FuseElewiseAddActPass::ApplyImpl( 
- std::unique_ptr<ir::Graph> graph) const { +void FuseElewiseAddActPass::ApplyImpl(ir::Graph *graph) const { std::unordered_set<std::string> act_types = {"relu", "scale"}; - graph = FuseActElewiseAdd(std::move(graph), act_types); - graph = FuseElewiseAddAct(std::move(graph), act_types); + graph = FuseActElewiseAdd(graph, act_types); + graph = FuseElewiseAddAct(graph, act_types); // backward { std::unordered_set<std::string> in_place_act_types = {"relu_grad"}; - graph = FuseElewiseAddActInplaceGrad(std::move(graph), in_place_act_types); + graph = FuseElewiseAddActInplaceGrad(graph, in_place_act_types); } // Remove the removable intermediate_out. - RemoveIntermediateOut(graph.get()); - - return graph; + RemoveIntermediateOut(graph); } // ele_add(x, act(y)) -std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct( - std::unique_ptr<ir::Graph> graph, - const std::unordered_set<std::string> &act_types) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("elewise_add_act", graph.get()); +ir::Graph *FuseElewiseAddActPass::FuseElewiseAddAct( + ir::Graph *graph, const std::unordered_set<std::string> &act_types) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("elewise_add_act", graph); GraphPatternDetector gpd; auto *x = gpd.mutable_pattern() @@ -86,18 +84,17 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct( found_elewise_add_act_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_elewise_add_act_count); return graph; } // act(ele_add(x,y)) -std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd( - std::unique_ptr<ir::Graph> graph, - const std::unordered_set<std::string> &act_types) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("act_elewise_add", graph.get()); +ir::Graph *FuseElewiseAddActPass::FuseActElewiseAdd( + ir::Graph *graph, const std::unordered_set<std::string> &act_types) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("act_elewise_add", graph); GraphPatternDetector gpd; auto *x = gpd.mutable_pattern() @@ -137,7 +134,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd( found_elewise_add_act_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_elewise_add_act_count); return graph; @@ -146,11 +143,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd( // the backward of act(ele_add(x,y)) // act_grad: in["Out", "Out@GRAD"], out["X@GRAD"] // ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"] -std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad( - std::unique_ptr<ir::Graph> graph, - const std::unordered_set<std::string> &act_types) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("elewise_add_act_grad", graph.get()); +ir::Graph *FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad( + ir::Graph *graph, const std::unordered_set<std::string> &act_types) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("elewise_add_act_grad", graph); GraphPatternDetector gpd; auto *d_act_out = gpd.mutable_pattern() @@ -217,7 +213,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad( found_elewise_add_act_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_elewise_add_act_count); return graph; diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h index 0fee527447..dc73f1fda0 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h +++ 
b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h @@ -14,6 +14,8 @@ #pragma once #include <string> +#include <unordered_set> +#include <utility> #include <vector> #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" @@ -32,20 +34,16 @@ class FuseElewiseAddActPass : public FusePassBase { virtual ~FuseElewiseAddActPass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph *graph) const override; - std::unique_ptr<ir::Graph> FuseElewiseAddAct( - std::unique_ptr<ir::Graph> graph, - const std::unordered_set<std::string> &act_types) const; + ir::Graph *FuseElewiseAddAct( + ir::Graph *graph, const std::unordered_set<std::string> &act_types) const; - std::unique_ptr<ir::Graph> FuseActElewiseAdd( - std::unique_ptr<ir::Graph> graph, - const std::unordered_set<std::string> &act_types) const; + ir::Graph *FuseActElewiseAdd( + ir::Graph *graph, const std::unordered_set<std::string> &act_types) const; - std::unique_ptr<ir::Graph> FuseElewiseAddActInplaceGrad( - std::unique_ptr<ir::Graph> graph, - const std::unordered_set<std::string> &act_types) const; + ir::Graph *FuseElewiseAddActInplaceGrad( + ir::Graph *graph, const std::unordered_set<std::string> &act_types) const; /** * Remove the removable intermediate_out. diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc index fe844caed2..c4e6b6e6a5 100644 --- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h" #include <algorithm> #include <string> +#include <unordered_set> #include <vector> #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" @@ -23,20 +24,18 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - graph = FuseReluDepthwiseConv(std::move(graph), true); - graph = FuseReluDepthwiseConv(std::move(graph), false); - return graph; +void FuseReluDepthwiseConvPass::ApplyImpl(ir::Graph *graph) const { + graph = FuseReluDepthwiseConv(graph, true); + graph = FuseReluDepthwiseConv(graph, false); } -std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( - std::unique_ptr<ir::Graph> graph, bool only_forward) const { - PADDLE_ENFORCE(graph.get()); +ir::Graph *FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( + ir::Graph *graph, bool only_forward) const { + PADDLE_ENFORCE(graph); if (only_forward) - FusePassBase::Init("relu_depthwise_conv_only_forward", graph.get()); + FusePassBase::Init("relu_depthwise_conv_only_forward", graph); else - FusePassBase::Init("relu_depthwise_conv", graph.get()); + FusePassBase::Init("relu_depthwise_conv", graph); /* x ---act--> y ---layer-> z +----------+ @@ -144,10 +143,9 @@ std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( } count++; }; - gpd(graph.get(), handler); - GraphSafeRemoveNodes(graph.get(), need_removed_nodes); + gpd(graph, handler); + GraphSafeRemoveNodes(graph, need_removed_nodes); AddStatis(count); - return graph; } diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h index efb49b8300..d37c153dd2 100644 --- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h +++ 
b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h @@ -32,10 +32,8 @@ class FuseReluDepthwiseConvPass : public FusePassBase { virtual ~FuseReluDepthwiseConvPass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; - std::unique_ptr<ir::Graph> FuseReluDepthwiseConv( - std::unique_ptr<ir::Graph> graph, bool only_forward) const; + void ApplyImpl(ir::Graph* graph) const override; + ir::Graph* FuseReluDepthwiseConv(ir::Graph* graph, bool only_forward) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc index 3372dcd181..b0d056f2c0 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include <map> +#include <memory> #include <string> +#include <unordered_set> #include <vector> #include "paddle/fluid/framework/ir/graph.h" @@ -26,8 +28,7 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl( - std::unique_ptr<Graph> graph) const { +void GraphToProgramPass::ApplyImpl(ir::Graph* graph) const { // Remove the unneeded variables after memory optimization. std::unordered_set<std::string> vars2remove; if (graph->Has(kGraphToProgramVarsToRemove)) { @@ -73,7 +74,6 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl( } program.CopyFrom(*program_pb); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h index 4c36c3a5da..52c8f4e0fc 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.h +++ b/paddle/fluid/framework/ir/graph_to_program_pass.h @@ -26,7 +26,7 @@ const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__"; class GraphToProgramPass : public Pass { protected: - std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc index 5d51d9751a..5ee6b8a5f1 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph_to_program_pass.h" +#include <memory> #include <string> +#include <unordered_set> #include <vector> #include "gtest/gtest.h" #include "paddle/fluid/framework/program_desc.h" @@ -84,7 +86,7 @@ TEST(GraphToProgramPass, Basic) { ProgramDesc compiled_prog; pass->SetNotOwned<paddle::framework::ProgramDesc>("program", &compiled_prog); - pass->Apply(std::move(g)); + pass->Apply(g.get()); std::vector<OpDesc*> ops = compiled_prog.Block(0).AllOps(); EXPECT_EQ(ops[0]->Type(), "op1"); EXPECT_EQ(ops[1]->Type(), "op2"); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 87a28a2a66..f4df4cfeba 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include <algorithm> +#include <unordered_map> #include <unordered_set> - -#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/string/printf.h" @@ -38,8 +38,7 @@ std::string FormatName(const Node* node) { } } // namespace -std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void GraphVizPass::ApplyImpl(ir::Graph* graph) const { const std::string graph_viz_path = Get<std::string>(kGraphVizPath); VLOG(3) << "draw IR graph viz to " << graph_viz_path; std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path)); @@ -82,7 +81,7 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl( {Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"), Dot::Attr("fillcolor", "yellow")}); - auto marked_nodes = ConsumeMarkedNodes(graph.get()); + auto marked_nodes = ConsumeMarkedNodes(graph); // Create nodes for (const Node* n : graph->Nodes()) { std::string node_id = FormatName(n) + "(" + std::to_string(n->id()) + ")"; @@ -115,8 +114,6 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl( } sout << dot.Build(); - - return graph; } GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes( @@ -135,4 +132,4 @@ GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes( } // namespace paddle REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass) - .RequirePassAttr(paddle::framework::ir::kGraphVizPath); \ No newline at end of file + .RequirePassAttr(paddle::framework::ir::kGraphVizPath); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h index e64916a5bb..7091aa6a95 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.h +++ b/paddle/fluid/framework/ir/graph_viz_pass.h @@ -18,6 +18,7 @@ limitations under the License. */ #include <map> #include <memory> #include <string> +#include <unordered_set> #include <vector> #include "paddle/fluid/framework/ir/graph.h" @@ -34,8 +35,7 @@ class GraphVizPass : public Pass { using marked_nodes_t = std::unordered_set<const Node*>; protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; // Tell whether there are any marked nodes in the graph. Consume the // corresponding attribute. 
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc index 5bdc0c5fae..a39901e63b 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc @@ -20,9 +20,8 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - FusePassBase::Init("identity_scale_op_clean", graph.get()); +void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init("identity_scale_op_clean", graph); // pre_op -> scale_in -> scale_op -> scale_out // -> @@ -72,8 +71,7 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl( IR_NODE_LINK_TO(pre_op_var, scale_out_var); }; - detector(graph.get(), handler); - return graph; + detector(graph, handler); } } // namespace ir diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h index 6da592561d..d66b411257 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h @@ -22,8 +22,7 @@ namespace ir { class IdentityScaleOpCleanPass : public FusePassBase { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; private: virtual ~IdentityScaleOpCleanPass() = default; diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc index 6607c026a7..d76924116f 100644 --- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc +++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc @@ -26,9 +26,9 @@ class InferCleanGraphPass : public FusePassBase { virtual ~InferCleanGraphPass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const { - FusePassBase::Init("original_graph", graph.get()); - PADDLE_ENFORCE(graph.get()); + void ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init("original_graph", graph); + PADDLE_ENFORCE(graph); auto is_valid_node = [](Node* x) { return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); @@ -46,11 +46,9 @@ class InferCleanGraphPass : public FusePassBase { } } - GraphSafeRemoveNodes(graph.get(), invalid_nodes); + GraphSafeRemoveNodes(graph, invalid_nodes); AddStatis(valid_op); - - return graph; } void CleanEdges(std::vector<Node*>* nodes, diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 57cc98e2ca..bf6fe999c1 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -20,8 +20,7 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void IsTestPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it " "for activations and pooling."; auto op_list = {"pool2d", "sigmoid", "logsigmoid", @@ -47,7 +46,6 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl( } } } - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/is_test_pass.h b/paddle/fluid/framework/ir/is_test_pass.h index 99e76ca4a3..80cedbf9f8 100644 --- a/paddle/fluid/framework/ir/is_test_pass.h +++ b/paddle/fluid/framework/ir/is_test_pass.h @@ -22,8 +22,7 @@ namespace ir { class IsTestPass : 
public Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index 9696441a21..3fa543c622 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -97,7 +97,7 @@ TEST(IsTestPass, basic) { auto pass = PassRegistry::Instance().Get("is_test_pass"); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); for (auto* node : graph->Nodes()) { if (node->IsOp()) { diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc index 92e897ca9c..05d23961a8 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc @@ -32,9 +32,8 @@ const char kSumGradOpName[] = "sum"; // other optimizers later. const char kOptimizerType[] = "sgd"; -std::unique_ptr<ir::Graph> LockFreeOptimizePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - PADDLE_ENFORCE(graph.get()); +void LockFreeOptimizePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); // We could collect all weights' name from SGD, where // W1 <- SGD(W0, Grad0) @@ -92,14 +91,14 @@ std::unique_ptr<ir::Graph> LockFreeOptimizePass::ApplyImpl( // find the forward op related to the backward op ir::Node* forward_op = - FindForwardOpViaBackwardOp(graph.get(), backward_op); + FindForwardOpViaBackwardOp(graph, backward_op); VLOG(3) << "Found forward_op " << forward_op->Name(); PADDLE_ENFORCE(forward_op); Node* new_optimizer_node = CreateNewSGDNode( - graph.get(), forward_op, backward_op, node, opt_node); + graph, forward_op, backward_op, node, opt_node); PADDLE_ENFORCE(new_optimizer_node); } @@ -140,8 +139,6 @@ std::unique_ptr<ir::Graph> LockFreeOptimizePass::ApplyImpl( } } } - - return graph; } ir::Node* LockFreeOptimizePass::CreateNewSGDNode( diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h index f9157b10d9..d1718857a5 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h @@ -60,8 +60,7 @@ class LockFreeOptimizePass : public Pass { virtual ~LockFreeOptimizePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; private: // Create a new sgd node via current optimizer node diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index 5d0b294f6f..8ef3993b06 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -38,10 +38,9 @@ LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b, return vec_y; } -std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init(name_scope_, graph.get()); +void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); PADDLE_ENFORCE(scope); @@ -99,7 +98,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl( conv->Op()->SetOutput("Output", 
std::vector<std::string>({eltwise_out->Name()})); - GraphSafeRemoveNodes(graph.get(), {eltwise, conv_out}); + GraphSafeRemoveNodes(graph, {eltwise, conv_out}); IR_NODE_LINK_TO(conv, eltwise_out); } else { @@ -123,14 +122,13 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl( IR_NODE_LINK_TO(eltwise_bias, conv_bias_node); IR_NODE_LINK_TO(conv_bias_node, eltwise_out); - GraphSafeRemoveNodes(graph.get(), {conv, eltwise, conv_out}); + GraphSafeRemoveNodes(graph, {conv, eltwise, conv_out}); } found_conv_bias_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_conv_bias_count); - return graph; } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h index 0ef5c177bf..84106d0655 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h @@ -29,8 +29,7 @@ class ConvBiasFusePass : public FusePassBase { virtual bool is_conv3d() const { return false; } protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"conv_bias_mkldnn_fuse"}; }; /* diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc index 38b7fe5203..ff7f9190fd 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" +#include <gtest/gtest.h> #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/platform/place.h" -#include <gtest/gtest.h> #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { @@ -103,7 +103,7 @@ void MainTest(bool convWithExistingBias) { int original_nodes_num = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index fb3db81347..ef7874c1c0 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -16,8 +16,8 @@ #include <functional> #include <list> #include <map> +#include <memory> #include <tuple> - #include "paddle/fluid/framework/ir/graph_traits.h" namespace paddle { @@ -327,17 +327,15 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( get_node_from_elementwise_add); } -graph_ptr ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { + FusePassBase::Init(name_scope_, graph); auto fused_graph_with_stats = FuseConvAsY( name_scope_, - FuseConvAsX( - name_scope_, - FuseProjectionConv(name_scope_, std::make_pair(graph.get(), 0)))); + FuseConvAsX(name_scope_, + FuseProjectionConv(name_scope_, std::make_pair(graph, 0)))); std::cout << "Fused graph " << fused_graph_with_stats.second << std::endl; AddStatis(fused_graph_with_stats.second); - return graph; } } // namespace ir } 
// namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index 6629dae425..9bf1ae6079 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -14,6 +14,7 @@ #pragma once +#include <memory> #include <string> #include <tuple> #include <utility> @@ -27,7 +28,7 @@ namespace paddle { namespace framework { namespace ir { -using graph_ptr = std::unique_ptr<ir::Graph>; +using graph_ptr = ir::Graph*; using GraphWithStats = std::pair<ir::Graph*, int>; void CorrectGraphEdges(Graph* graph, Node* from, Node* to); @@ -124,7 +125,7 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { virtual ~ResidualConnectionMKLDNNFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl(graph_ptr graph) const; + void ApplyImpl(graph_ptr graph) const; const std::string name_scope_{"residual_connection_fuse_pass"}; }; diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc index 433d89d8d3..8a13596cd5 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc @@ -148,7 +148,7 @@ void RunPassAndAssert(ProgramDesc* prog, const std::string& from, auto pass = PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); int original_nodes_num = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); EXPECT_TRUE(is_reachable(graph)(from, to)); @@ -258,7 +258,7 @@ TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) { auto pass = PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); int original_nodes_num = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); EXPECT_TRUE(is_reachable(graph)("a", "g")); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc index 4f4605398a..dd0fb45604 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc @@ -21,10 +21,9 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get()); +void ConvReLUFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("conv_relu_mkldnn_fuse", graph); GraphPatternDetector gpd; auto* conv_input = gpd.mutable_pattern() @@ -56,7 +55,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl( OpDesc* desc = conv->Op(); desc->SetOutput("Output", std::vector<std::string>({relu_out->Name()})); desc->SetAttr("fuse_relu", true); - GraphSafeRemoveNodes(graph.get(), {relu, conv_out}); + GraphSafeRemoveNodes(graph, {relu, conv_out}); PADDLE_ENFORCE(subgraph.count(conv_input)); IR_NODE_LINK_TO(conv, relu_out); @@ -64,10 +63,9 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl( found_conv_relu_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); 
AddStatis(found_conv_relu_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h index fe585bd7c4..2174c22dbf 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h @@ -31,8 +31,7 @@ class ConvReLUFusePass : public FusePassBase { virtual ~ConvReLUFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc index 06d56f6222..67a9957059 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc @@ -88,7 +88,7 @@ TEST(ConvReLUFusePass, basic) { int original_nodes_num = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index b3a8c20891..dff98e523a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -216,19 +216,16 @@ void CPUQuantizePass::QuantizePool(Graph* graph) const { PrettyLogDetail("--- quantized %d pool2d ops", quantize_pool_count); } -std::unique_ptr<ir::Graph> CPUQuantizePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Quantizing the graph."; - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init(name_scope_, graph.get()); + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); PADDLE_ENFORCE(param_scope()); - QuantizeConv(graph.get(), false /* with_residual_data */); - QuantizeConv(graph.get(), true /* with_residual_data */); - QuantizePool(graph.get()); - - return graph; + QuantizeConv(graph, false /* with_residual_data */); + QuantizeConv(graph, true /* with_residual_data */); + QuantizePool(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 9873bb04e1..a178c4dc36 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -42,8 +42,7 @@ class CPUQuantizePass : public FusePassBase { virtual ~CPUQuantizePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; void QuantizeConv(Graph* graph, bool with_residual_data = false) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 0d0ed98901..8716a412e4 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -139,7 +139,7 @@ void MainTest(const ProgramDesc& prog, int conv_count, int pool_count, int original_nodes_num = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); diff --git 
a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 511003dce5..79a8ac68b8 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -20,8 +20,7 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr<ir::Graph> CPUQuantizePlacementPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; const auto& excluded_ids_list = Get<std::unordered_set<int>>("quantize_excluded_op_ids"); @@ -43,7 +42,6 @@ std::unique_ptr<ir::Graph> CPUQuantizePlacementPass::ApplyImpl( } } } - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h index ef3861b249..008a462dc4 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h @@ -25,8 +25,7 @@ namespace ir { */ class CPUQuantizePlacementPass : public Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc index 11d72a56bd..ba4d281f81 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc @@ -94,7 +94,7 @@ void MainTest(std::initializer_list<std::string> quantize_enabled_op_types, pass->Set("quantize_excluded_op_ids", new std::unordered_set<int>(quantize_excluded_op_ids)); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); unsigned use_quantizer_true_count = 0; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 6e74cc7787..debbbd6440 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -126,16 +126,13 @@ void CPUQuantizeSquashPass::Squash( found_squash_count); } -std::unique_ptr<ir::Graph> CPUQuantizeSquashPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("cpu_quantize_squash_pass", graph.get()); +void CPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("cpu_quantize_squash_pass", graph); std::unordered_map<const Node*, int> nodes_keep_counter; - FindNodesToKeep(graph.get(), &nodes_keep_counter); - Squash(graph.get(), &nodes_keep_counter); - - return graph; + FindNodesToKeep(graph, &nodes_keep_counter); + Squash(graph, &nodes_keep_counter); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h index b823a2cef3..e873994c57 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h @@ -34,8 +34,7 @@ class CPUQuantizeSquashPass : public FusePassBase { virtual ~CPUQuantizeSquashPass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - 
std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; /* * For each dequantize's output find the number of operators it is an input to diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc index 3cf51d97aa..fda337066f 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -125,7 +125,7 @@ void MainTest(const ProgramDesc& prog, int removed_nodes_num) { int original_nodes_num = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc index 7851e8c84b..e854559ae7 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc @@ -25,10 +25,9 @@ namespace ir { auto* id = subgraph.at(pattern.RetrieveNode(#id)); \ PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); -std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("depthwise_conv_mkldnn_pass", graph.get()); +void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("depthwise_conv_mkldnn_pass", graph); GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); @@ -45,9 +44,8 @@ std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl( found_depthwise_conv_mkldnn_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_depthwise_conv_mkldnn_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h index 8ca6a73251..ca314afde5 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h @@ -25,8 +25,7 @@ class DepthwiseConvMKLDNNPass : public FusePassBase { virtual ~DepthwiseConvMKLDNNPass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc index 1783e3322b..f2dfbc84a5 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc @@ -86,7 +86,7 @@ TEST(DepthwiseConvMKLDNNPass, basic) { counters before{1, 1, 1, 1}; - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); // initialize counters before loop counters after{0, 0, 0, 0}; diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc index ccac65f3b3..500419e4b7 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc @@ -14,13 +14,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" #include <string> +#include <unordered_set> namespace paddle { namespace framework { namespace ir { -std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void MKLDNNPlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Applies MKL-DNN placement strategy."; const auto& op_types_list = Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types"); @@ -37,7 +37,6 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl( } } } - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h index c071d9aed2..ffa62273ec 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h @@ -26,8 +26,7 @@ namespace ir { */ class MKLDNNPlacementPass : public Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc index b6ec7e4d68..5885f327e6 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc @@ -97,7 +97,7 @@ void MainTest(std::initializer_list<std::string> mkldnn_enabled_op_types, pass->Set("mkldnn_enabled_op_types", new std::unordered_set<std::string>(mkldnn_enabled_op_types)); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); unsigned use_mkldnn_true_count = 0; diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index 9e77f98e9e..dcc48fb934 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -16,8 +16,9 @@ #include <map> #include <string> +#include <unordered_map> +#include <unordered_set> #include <vector> - #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" @@ -68,8 +69,7 @@ VarDesc UpdateGradVarDesc( return *var_desc; } -std::unique_ptr<Graph> BatchMergePass::ApplyImpl( - std::unique_ptr<Graph> graph) const { +void BatchMergePass::ApplyImpl(ir::Graph* graph) const { int num_repeats = Get<const int>(kNumRepeats); std::vector<Node*> forward_backward_ops; std::vector<Node*> optimize_ops; @@ -325,7 +325,6 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( } result.ResolveHazard(created); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.h b/paddle/fluid/framework/ir/multi_batch_merge_pass.h index c1e5aef20d..a89616683d 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.h +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.h @@ -36,7 +36,7 @@ class BatchMergePass : public Pass { virtual ~BatchMergePass() {} protected: - std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override; + void ApplyImpl(Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 33ccee6aa0..c0ed0519b1 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -18,8 +18,8 @@ limitations under the License. 
*/ namespace paddle { namespace framework { namespace ir { -std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const { - PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty."); +Graph* Pass::Apply(Graph* graph) const { + PADDLE_ENFORCE(graph, "graph passed to Pass::Apply() cannot be empty."); for (const std::string& attr : required_pass_attrs_) { PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), "Required pass atrribute %s not set.", attr); @@ -28,16 +28,16 @@ std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const { PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.", attr); } - auto* native_graph = graph.get(); - auto applied_graph = ApplyImpl(std::move(graph)); + auto* native_graph = graph; + ApplyImpl(graph); // TODO(panyx0718): Add more verifications. - PADDLE_ENFORCE(!HasCircle(*applied_graph), + PADDLE_ENFORCE(!HasCircle(*graph), "Illegal Pass. Generated graph shouldn't has cycle."); - PADDLE_ENFORCE(applied_graph.get() == native_graph, + PADDLE_ENFORCE(graph == native_graph, "Pass::Apply() cannot delete the passed graph and shouldn't " "return a new graph.(For the need of pybind11)"); applied_ = true; - return applied_graph; + return graph; } PassRegistry& PassRegistry::Instance() { diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 27746ff145..6cbe9a8212 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -16,8 +16,10 @@ limitations under the License. */ #include <functional> #include <map> +#include <memory> #include <string> - +#include <unordered_map> +#include <unordered_set> #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" @@ -44,7 +46,7 @@ class Pass { std::string Type() const { return type_; } - std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const; + Graph *Apply(Graph *graph) const; // Get a reference to the attributed previously set. template <typename AttrType> @@ -98,9 +100,8 @@ class Pass { } protected: - virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const { + virtual void ApplyImpl(Graph *graph) const { LOG(FATAL) << "Calling virtual Pass not implemented."; - return graph; } private: diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc index 6ad7d1df8b..87e3c96416 100644 --- a/paddle/fluid/framework/ir/pass_test.cc +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/ir/pass.h" +#include <memory> #include <string> +#include <utility> #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph.h" @@ -39,7 +41,7 @@ void BuildCircleGraph(Graph* g) { class TestPass : public Pass { protected: - std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const { + void ApplyImpl(ir::Graph* graph) const { graph->Set<int>("copy_test_pass_attr", new int); graph->Set<int>("copy_test_graph_attr", new int); @@ -48,7 +50,6 @@ class TestPass : public Pass { int test_graph_attr = graph->Get<int>("test_graph_attr"); graph->Get<int>("copy_test_graph_attr") = test_graph_attr + 1; - return graph; } }; @@ -58,7 +59,7 @@ TEST(PassTest, TestPassAttrCheck) { std::unique_ptr<Graph> graph(new Graph(prog)); std::string exception; try { - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); } catch (paddle::platform::EnforceNotMet e) { exception = std::string(e.what()); } @@ -69,7 +70,7 @@ TEST(PassTest, TestPassAttrCheck) { pass->SetNotOwned<int>("test_pass_attr", &val); try { - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); } catch (paddle::platform::EnforceNotMet e) { exception = std::string(e.what()); } @@ -78,14 +79,14 @@ TEST(PassTest, TestPassAttrCheck) { graph.reset(new Graph(prog)); graph->Set<int>("test_graph_attr", new int); graph->Get<int>("test_graph_attr") = 1; - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2); ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2); // Allow apply more than once. graph.reset(new Graph(prog)); graph->Set<int>("test_graph_attr", new int); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); pass = PassRegistry::Instance().Get("test_pass"); pass->SetNotOwned<int>("test_pass_attr", &val); @@ -94,7 +95,7 @@ TEST(PassTest, TestPassAttrCheck) { graph->Set<int>("test_graph_attr", new int); graph->Get<int>("test_graph_attr") = 2; try { - auto tmp = pass->Apply(std::move(graph)); + pass->Apply(graph.release()); } catch (paddle::platform::EnforceNotMet e) { exception = std::string(e.what()); } diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index 84a4ff2de1..00263b8a34 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h" #include <algorithm> // for max #include <string> +#include <unordered_set> #include <vector> #include "paddle/fluid/framework/lod_tensor.h" @@ -365,17 +366,14 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, return fusion_count; } -std::unique_ptr<ir::Graph> RepeatedFCReluFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); int fusion_count = 0; for (int i = MAX_NUM_FC; i > 1; --i) { fusion_count += - BuildFusion(graph.get(), name_scope_ + "/" + std::to_string(i), i); + BuildFusion(graph, name_scope_ + "/" + std::to_string(i), i); } AddStatis(fusion_count); - - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h index ede0bea07f..ae777bcceb 100644 --- 
a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h @@ -31,8 +31,7 @@ class RepeatedFCReluFusePass : public FusePassBase { virtual ~RepeatedFCReluFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"repeated_fc_relu_fuse"}; }; diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc index 67b29512c4..c7cf9b0dc3 100644 --- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc @@ -20,15 +20,13 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr<ir::Graph> RuntimeContextCachePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void RuntimeContextCachePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Applies Runtime Context Cache strategy."; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { n->Op()->SetAttr(kEnableCacheRuntimeContext, true); } } - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.h b/paddle/fluid/framework/ir/runtime_context_cache_pass.h index a6cf1a9ae5..e4783166e0 100644 --- a/paddle/fluid/framework/ir/runtime_context_cache_pass.h +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.h @@ -23,8 +23,7 @@ namespace ir { class RuntimeContextCachePass : public Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc index 012e68036c..b230c50167 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h" #include <set> #include <string> - +#include <unordered_set> #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" -#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h" #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { @@ -178,9 +178,8 @@ PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) { return fc_out; } -std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - FusePassBase::Init("seq_concat_fc_fuse", graph.get()); +void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init("seq_concat_fc_fuse", graph); GraphPatternDetector detector; auto* pattern = detector.mutable_pattern(); auto* concat_out = BuildSeqExpandConcatPattern(pattern); @@ -194,8 +193,8 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl( int fuse_count{0}; - detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph) { + detector(graph, [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { VLOG(4) << "get one concat pattern"; // fc GET_NODE(fc_w, detector.pattern()); @@ -246,8 +245,6 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl( }); AddStatis(fuse_count); - - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h index 06e18f9dc3..d68840a554 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h @@ -27,8 +27,7 @@ class SeqConcatFcFusePass : public FusePassBase { virtual ~SeqConcatFcFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc index 0a1f65d274..3fd368741f 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h" #include <string> +#include <unordered_set> #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { @@ -83,14 +84,11 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) { return fusion_count; } -std::unique_ptr<ir::Graph> SeqConvEltAddReluFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void SeqConvEltAddReluFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); - int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope()); + int fusion_count = BuildFusion(graph, name_scope_, param_scope()); AddStatis(fusion_count); - - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h index c36c6b76a2..fde9b586c8 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h @@ -28,8 +28,7 @@ class SeqConvEltAddReluFusePass : public FusePassBase { virtual ~SeqConvEltAddReluFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - 
std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"seqconv_eltadd_relu_fuse"}; }; diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc index 63a0c24f2a..4ac379eb04 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h" #include <string> +#include <unordered_set> #include <vector> #include "paddle/fluid/framework/lod_tensor.h" @@ -194,17 +195,14 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, return fusion_count; } -std::unique_ptr<ir::Graph> SeqPoolConcatFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void SeqPoolConcatFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); int fusion_count = 0; for (int i = MAX_CONCAT_INPUTS; i > 0; --i) { fusion_count += - BuildFusion(graph.get(), name_scope_ + "/" + std::to_string(i), i); + BuildFusion(graph, name_scope_ + "/" + std::to_string(i), i); } AddStatis(fusion_count); - - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h index a5db3528da..40a9edc5e6 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h @@ -42,8 +42,7 @@ class SeqPoolConcatFusePass : public FusePassBase { virtual ~SeqPoolConcatFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"seqpool_concat_fuse"}; }; diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc index 35d1d5129b..d366803851 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc @@ -59,7 +59,7 @@ std::unique_ptr<ir::Graph> GetNumNodesOfBeforeAfter( const std::string& pass_type = "seqpool_concat_fuse_pass") { auto pass = PassRegistry::Instance().Get(pass_type); *before = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); *after = graph->Nodes().size(); return graph; } diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc index 84fb8063e6..e1ddc44470 100644 --- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc +++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc @@ -24,11 +24,11 @@ namespace framework { namespace ir { template <int times> -std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl( + ir::Graph *graph) const { const std::string pattern_name = "simplify_anakin_detection_pattern_pass" + std::to_string(times); - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; std::vector<PDNode *> input_nodes; @@ -207,11 +207,10 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl( 
multiclass_nms_out->inputs.push_back(detection_out_op); // Delete the unneeded nodes. - GraphSafeRemoveNodes(graph.get(), delete_nodes); + GraphSafeRemoveNodes(graph, delete_nodes); }; - gpd(graph.get(), handler); - return graph; + gpd(graph, handler); } template class SimplifyAnakinDetectionPatternPass<1>; diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h index 2338e4c38b..e4a266cbe8 100644 --- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h +++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h @@ -32,8 +32,7 @@ class SimplifyAnakinDetectionPatternPass : public FusePassBase { virtual ~SimplifyAnakinDetectionPatternPass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc index 78c8cabb10..42f4a91a6f 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h" #include <string> +#include <unordered_set> #include <vector> #include "paddle/fluid/framework/lod_tensor.h" @@ -362,13 +363,10 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { return fusion_count; } -std::unique_ptr<ir::Graph> SquaredMatSubFusePass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { - FusePassBase::Init(name_scope_, graph.get()); - int fusion_count = BuildFusion(graph.get(), name_scope_); +void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + int fusion_count = BuildFusion(graph, name_scope_); AddStatis(fusion_count); - - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h index c21ba65c40..b6165a512a 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h @@ -31,8 +31,7 @@ class SquaredMatSubFusePass : public FusePassBase { virtual ~SquaredMatSubFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"squared_mat_sub_fuse"}; }; diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc index b370039915..f4f924a604 100644 --- a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc @@ -21,8 +21,7 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr<ir::Graph> SyncBatchNormPass::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void SyncBatchNormPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Use synchronous batch norm"; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { @@ -35,7 +34,6 @@ std::unique_ptr<ir::Graph> SyncBatchNormPass::ApplyImpl( } } } - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.h b/paddle/fluid/framework/ir/sync_batch_norm_pass.h index 51cce3dca6..694fae7494 100644 --- a/paddle/fluid/framework/ir/sync_batch_norm_pass.h +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.h @@ -23,8 
+23,7 @@ namespace ir { class SyncBatchNormPass : public Pass { protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc index 9c94c1746a..894f96050e 100644 --- a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc @@ -60,7 +60,7 @@ TEST(IsTestPass, basic) { auto pass = PassRegistry::Instance().Get("sync_batch_norm_pass"); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); for (auto* node : graph->Nodes()) { if (node->IsOp()) { diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index cab69c408d..61c12d4b6e 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -26,11 +26,10 @@ namespace framework { namespace ir { template <int times> -std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl( - std::unique_ptr<ir::Graph> graph) const { +void TransposeFlattenConcatFusePass<times>::ApplyImpl(ir::Graph *graph) const { const std::string pattern_name = "transpose_flatten" + std::to_string(times) + "_concat_fuse"; - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; std::vector<PDNode *> input_nodes; @@ -117,11 +116,10 @@ std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl( concat_out->inputs.push_back(new_conv_op); // Delete the unneeded nodes. 
- GraphSafeRemoveNodes(graph.get(), delete_nodes); + GraphSafeRemoveNodes(graph, delete_nodes); }; - gpd(graph.get(), handler); - return graph; + gpd(graph, handler); } template class TransposeFlattenConcatFusePass<1>; diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h index a7d18ec86d..366d26d800 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h @@ -30,8 +30,7 @@ class TransposeFlattenConcatFusePass : public FusePassBase { virtual ~TransposeFlattenConcatFusePass() {} protected: - std::unique_ptr<ir::Graph> ApplyImpl( - std::unique_ptr<ir::Graph> graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 20a8c47d5d..ab0947c631 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -77,8 +77,7 @@ class ParallelExecutorPrivate { } } - std::unique_ptr<ir::Graph> PrepareGCAndRefCnts( - std::unique_ptr<ir::Graph> graph, size_t max_memory_size); + ir::Graph *PrepareGCAndRefCnts(ir::Graph *graph, size_t max_memory_size); inline bool HasGarbageCollectors() const { return !gcs_.empty(); } @@ -118,8 +117,8 @@ class ParallelExecutorPrivate { details::GarbageCollectorMap gcs_; }; -std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts( - std::unique_ptr<ir::Graph> graph, size_t max_memory_size) { +ir::Graph *ParallelExecutorPrivate::PrepareGCAndRefCnts( + ir::Graph *graph, size_t max_memory_size) { for (size_t i = 0; i < places_.size(); ++i) { auto &place = places_[i]; if (gcs_.count(place) > 0) { @@ -161,7 +160,7 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts( &global_ref_cnts_); ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, &last_live_ops_of_vars); - graph = ref_cnt_pass->Apply(std::move(graph)); + graph = ref_cnt_pass->Apply(graph); VLOG(10) << "ReferenceCountPass Applied"; auto eager_deletion_pass = @@ -172,10 +171,9 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, &last_live_ops_of_vars); eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); - graph = eager_deletion_pass->Apply(std::move(graph)); + graph = eager_deletion_pass->Apply(graph); VLOG(10) << "EagerDeletionPass Applied"; } - return graph; } @@ -220,13 +218,11 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places, } } - std::unique_ptr<ir::Graph> temp_owned_graph(graph); - // FIXME(Yancey1989): parallel graph mode get better performance // in GPU allreduce distributed training. Need an elegant way to // choice the execution strategy. - build_strategy.enable_parallel_graph_ = EnableParallelGraphExecution( - *temp_owned_graph, exec_strategy, build_strategy); + build_strategy.enable_parallel_graph_ = + EnableParallelGraphExecution(*graph, exec_strategy, build_strategy); if (build_strategy.enable_parallel_graph_) VLOG(0) << "The Executor would execute the graph by ParallelGraph " "Execution which can get better performance," @@ -304,27 +300,21 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places, // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert // ncclOp #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - - temp_owned_graph = build_strategy.Apply( - std::move(temp_owned_graph), member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, member_->use_cuda_, - member_->nccl_ctxs_.get()); + graph = build_strategy.Apply(graph, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_, member_->nccl_ctxs_.get()); #else - temp_owned_graph = build_strategy.Apply( - std::move(temp_owned_graph), member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, member_->use_cuda_); + graph = build_strategy.Apply(graph, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_); #endif auto max_memory_size = GetEagerDeletionThreshold(); VLOG(10) << "Eager Deletion Threshold " << static_cast<float>(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { - graph = member_ - ->PrepareGCAndRefCnts(std::move(temp_owned_graph), - static_cast<size_t>(max_memory_size)) - .release(); - } else { - graph = temp_owned_graph.release(); + graph = member_->PrepareGCAndRefCnts(graph, + static_cast<size_t>(max_memory_size)); } // Step 3. Create vars in each scope. Passes may also create new vars. diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 7a96ac11d8..78e502c670 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -140,7 +140,7 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) { if (pass->Type() != "graph_viz_pass") { PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); } - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); } return graph; } @@ -156,7 +156,7 @@ framework::proto::ProgramDesc IRPassManager::AcquireProgram( desc.CopyFrom(*program->Proto()); pass->SetNotOwned("program", &desc); auto *the_graph = graph->release(); - *graph = pass->Apply(std::unique_ptr<Graph>(the_graph)); + graph->reset(pass->Apply(the_graph)); return *desc.Proto(); } diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc index 12deed2533..9e05aa5c16 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -35,8 +35,8 @@ namespace analysis { using framework::ir::Node; -std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl( - std::unique_ptr<framework::ir::Graph> graph) const { +void analysis::AnakinSubgraphPass::ApplyImpl( + framework::ir::Graph *graph) const { framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get()); auto teller = [](const framework::ir::Node *node) { @@ -72,8 +72,6 @@ std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl( framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); graph->Set(framework::ir::kRepetitiveParamAttr, new std::vector<std::string>(repetitive_params)); - - return graph; } std::string GenerateAnakinEngineKey(const std::set<std::string> &engine_inputs, diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h index c13b9ecda4..e80b8bb612 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h +++ 
b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h @@ -29,8 +29,7 @@ namespace analysis { class AnakinSubgraphPass : public framework::ir::FusePassBase { public: - std::unique_ptr<framework::ir::Graph> ApplyImpl( - std::unique_ptr<framework::ir::Graph> graph) const override; + void ApplyImpl(framework::ir::Graph *graph) const override; private: void CreateAnakinOp(framework::ir::Node *x, framework::ir::Graph *graph, diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 5939940327..ef5872c52c 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -31,16 +31,16 @@ namespace analysis { using framework::ir::Node; -std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl( - std::unique_ptr<framework::ir::Graph> graph) const { - framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); +void analysis::TensorRtSubgraphPass::ApplyImpl( + framework::ir::Graph *graph) const { + framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph); auto teller = [](const framework::ir::Node *node) { if (!node->IsOp() || !node->Op()) return false; return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); }; - SubGraphFuser fuser(graph.get(), teller, + SubGraphFuser fuser(graph, teller, Get<int>("min_subgraph_size") /*min subgraph size*/); fuser(); @@ -52,12 +52,11 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl( for (auto *node : graph->Nodes()) { if (node->IsOp() && !Agent(node).subgraph()->empty()) { - CreateTensorRTOp(node, graph.get(), graph_param_names, - &repetitive_params); + CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params); std::unordered_set<const Node *> nodes2remove( Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); - framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); } } @@ -67,11 +66,9 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl( nodes2remove.insert(node); } } - framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); graph->Set(framework::ir::kRepetitiveParamAttr, new std::vector<std::string>(repetitive_params)); - - return graph; } std::string GenerateEngineKey(const std::set<std::string> &engine_inputs, diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h index f043670c5a..f530a5a0b3 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -28,8 +28,7 @@ namespace analysis { class TensorRtSubgraphPass : public framework::ir::FusePassBase { public: - std::unique_ptr<framework::ir::Graph> ApplyImpl( - std::unique_ptr<framework::ir::Graph> graph) const override; + void ApplyImpl(framework::ir::Graph *graph) const override; private: void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph, diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc index 6b3d80fcef..35df396fe8 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc +++ 
b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" +#include <memory> #include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/program_desc.h" @@ -37,8 +38,7 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) { framework::ProgramDesc desc; desc.CopyFrom(*argument->main_program().Proto()); pass->SetNotOwned("program", &desc); - auto thegraph = pass->Apply(std::move(graph)); - thegraph.release(); // the argument still own the graph. + pass->Apply(graph.release()); // the argument still own the graph. argument->SetIrAnalyzedProgram( new framework::proto::ProgramDesc(*desc.Proto())); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3b0939ef82..d4c85fd0c6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1046,9 +1046,7 @@ All parameter, weight, gradient are variables in Paddle. int val) { self.Set<const int>(name, new int(val)); }) .def("type", &ir::Pass::Type) .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) { - std::unique_ptr<ir::Graph> origin_graph(graph.get()); - auto optim_graph = self.Apply(std::move(origin_graph)); - optim_graph.release(); + self.Apply(graph.get()); }); py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb( From 2265d091e6a0e18b48f801e73f048112ecc24904 Mon Sep 17 00:00:00 2001 From: chengduo <zhaochengduo@baidu.com> Date: Thu, 28 Mar 2019 07:42:52 -0500 Subject: [PATCH 17/19] Fix threaded executor bug (#16508) * fix threaded executor bug test=develop * change the order of class member test=develop * Fix Travis CI test=develop --- .../fast_threaded_ssa_graph_executor.cc | 5 +++-- .../fast_threaded_ssa_graph_executor.h | 19 ++++++++++++------- .../details/threaded_ssa_graph_executor.cc | 8 ++++---- .../details/threaded_ssa_graph_executor.h | 19 +++++++++---------- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index d4fbea9d95..297ee92fc3 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -31,9 +31,10 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( local_scopes_(local_scopes), places_(places), graph_(graph), + fetch_ctxs_(places), pool_(strategy.num_threads_), - prepare_pool_(1), // add one more thread for generate op_deps - fetch_ctxs_(places) { + // add one more thread for generate op_deps + prepare_pool_(1) { for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) { int dep = static_cast<int>(op->NotReadyInputSize()); op_deps_.emplace(op, dep); diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index 970298950c..f6d5160e75 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -14,7 +14,9 @@ #pragma once #include <ThreadPool.h> +#include <memory> #include <string> +#include <unordered_map> #include <vector> #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" @@ -37,6 +39,8 @@ class FastThreadedSSAGraphExecutor : public 
SSAGraphExecutor { const ir::Graph &Graph() const override; private: + // Note(zcd): the ThreadPool should be placed last so that ThreadPool should + // be destroyed first. ExecutionStrategy strategy_; std::vector<Scope *> local_scopes_; std::vector<platform::Place> places_; @@ -45,21 +49,22 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { std::unordered_map<OpHandleBase *, int> op_deps_; std::vector<OpHandleBase *> bootstrap_ops_; - ::ThreadPool pool_; - ::ThreadPool prepare_pool_; platform::DeviceContextPool fetch_ctxs_; std::atomic<int> remaining_; + std::future< + std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>> + atomic_op_deps_; + ExceptionHolder exception_; + + ::ThreadPool pool_; + ::ThreadPool prepare_pool_; + void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps, OpHandleBase *op, const std::shared_ptr<BlockingQueue<size_t>> &complete_q); void PrepareAtomicOpDeps(); - - std::future< - std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>> - atomic_op_deps_; - ExceptionHolder exception_; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c4254bbadf..c00932a7bd 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -24,13 +24,13 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes, const std::vector<platform::Place> &places, ir::Graph *graph) : graph_(graph), - pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) - : nullptr), - prepare_pool_(1), local_scopes_(local_scopes), places_(places), fetch_ctxs_(places), - strategy_(strategy) { + strategy_(strategy), + prepare_pool_(1), + pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) + : nullptr) { PrepareOpDeps(); CopyOpDeps(); } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index b9bccba8fa..1fa5196970 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -63,13 +63,20 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { details::OpHandleBase *op); private: + // Note(zcd): the ThreadPool should be placed last so that ThreadPool should + // be destroyed first. 
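// A minimal illustration (not part of this patch, hypothetical struct) of why
// the declaration order matters: C++ destroys non-static data members in
// reverse order of declaration, so a thread pool declared last is destroyed,
// and its worker threads joined, before the members that in-flight tasks may
// still touch.
//
//   struct ExecutorSketch {
//     std::vector<Scope *> local_scopes_;  // destroyed second
//     ::ThreadPool pool_{4};               // destroyed first; running tasks
//   };                                     // finish while local_scopes_ lives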
ir::Graph *graph_; - std::unique_ptr<::ThreadPool> pool_; - ::ThreadPool prepare_pool_; std::vector<Scope *> local_scopes_; std::vector<platform::Place> places_; platform::DeviceContextPool fetch_ctxs_; ExceptionHolder exception_holder_; + std::unique_ptr<OpDependentData> op_deps_; + std::future<std::unique_ptr<OpDependentData>> op_deps_futures_; + ExecutionStrategy strategy_; + // use std::list because clear(), push_back, and for_each are O(1) + std::list<std::future<void>> run_op_futures_; + ::ThreadPool prepare_pool_; + std::unique_ptr<::ThreadPool> pool_; void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops, OpHandleBase *op_instance) const; @@ -88,14 +95,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { void PrepareOpDeps(); void CopyOpDeps(); - - private: - std::future<std::unique_ptr<OpDependentData>> op_deps_futures_; - - ExecutionStrategy strategy_; - std::unique_ptr<OpDependentData> op_deps_; - // use std::list because clear(), push_back, and for_each are O(1) - std::list<std::future<void>> run_op_futures_; }; } // namespace details From 2632327429ed823cf1a0c2593cfa411fb2f111b9 Mon Sep 17 00:00:00 2001 From: Jacek Czaja <jacek.czaja@intel.com> Date: Thu, 28 Mar 2019 14:27:17 +0100 Subject: [PATCH 18/19] [MKL-DNN] Tensor modifications revert (#16462) * Revert "[MKL-DNN] Fix to crash of Transformer when mkldnn is to be used (#16233)" This reverts commit 13816dd4acdabd21a715b3b1c63fb43cdbac7622. Apart from enabling transformer for MKL-DNN * Revert "- MKL-DNN pooling updated to set_prim_desc" This reverts commit c63f6b20393d8b21b540e2b6091419e584ea5155. Conflicts: paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc * Revert "[MKL-DNN] MKL-DNN specific Tensor modification (#15429)" test=develop This reverts commit dec9cf53c89e0acc605a053b436ba24be68f62c7. * - concat compilation fix - lint test=develop - Lint fixes test=develop - Lint fixes test=develop - Fix Transpose MKLDNN op test=develop --- .../fluid/framework/data_layout_transform.cc | 23 ++++-- paddle/fluid/framework/data_transform.cc | 30 ++------ paddle/fluid/framework/tensor.h | 42 +++-------- paddle/fluid/framework/tensor_util.cc | 5 -- .../mkldnn/elementwise_add_mkldnn_op.cc | 19 +++-- .../operators/mkldnn/activation_mkldnn_op.cc | 24 ++++-- .../operators/mkldnn/batch_norm_mkldnn_op.cc | 36 ++++++--- .../operators/mkldnn/concat_mkldnn_op.cc | 3 +- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 47 ++++++------ .../mkldnn/conv_transpose_mkldnn_op.cc | 3 +- .../mkldnn/gaussian_random_mkldnn_op.cc | 8 +- .../fluid/operators/mkldnn/lrn_mkldnn_op.cc | 21 ++++-- .../operators/mkldnn/softmax_mkldnn_op.cc | 8 -- .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 9 ++- .../operators/mkldnn/transpose_mkldnn_op.cc | 26 ++----- paddle/fluid/platform/mkldnn_reuse.h | 73 +++++++++---------- paddle/fluid/platform/mkldnn_utils.h | 69 ------------------ 17 files changed, 172 insertions(+), 274 deletions(-) delete mode 100644 paddle/fluid/platform/mkldnn_utils.h diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 10aa7a5942..72c50518af 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -134,6 +134,11 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, out_layout = out_layout == DataLayout::kAnyLayout ? 
DataLayout::kNCHW : out_layout; + auto& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>( + pool.Get(expected_kernel_type.place_)); + auto& cpu_engine = dev_ctx->GetEngine(); + std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims()); std::vector<int> out_tz = in_tz; @@ -142,25 +147,29 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, "Input tensor type is not supported: %s", in.type()); memory::data_type out_type = in_type; + auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); + auto out_format = + platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); + // output tensor has the same dims as input. Reorder don't change dims out->Resize(in.dims()); - // tempory mem pd fr out , to make reorder - auto out_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(out->dims()), - mkldnn::memory::format::blocked, out_type); - if (in.get_mkldnn_prim_desc() != out_mem_pd) { + if (in_format != out_format) { void* in_data = GetDataFromTensor(in, in_type); auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); - auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data); - auto out_memory = memory(out_mem_pd, out_data); + auto in_memory = + memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); + auto out_memory = + memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); platform::Reorder(in_memory, out_memory); } else { out->ShareDataWith(in); } out->set_layout(out_layout); + // reset format since the out tensor will be feed to non-MKLDNN OPkernel + out->set_format(memory::format::format_undef); #endif } diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index f0203edf05..8287222450 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -51,31 +51,13 @@ void TransformData(const OpKernelType &expected_kernel_type, #ifdef PADDLE_WITH_MKLDNN // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. No real transform occur + + auto out_format = platform::MKLDNNFormatForSize(in.dims().size(), + ToMKLDNNFormat(lin)); + out.ShareDataWith(input_tensor); - // TODO(jczaja): Remove that once all mkldnn ops - // are modified to work with mkldnn_blocked - auto mkldnn_fmt = [&](int rank) { - switch (rank) { - case 5: - return mkldnn::memory::format::ncdhw; - case 4: - return mkldnn::memory::format::nchw; - case 3: - return mkldnn::memory::format::ncw; - case 2: - return mkldnn::memory::format::nc; - case 1: - return mkldnn::memory::format::x; - default: - return mkldnn::memory::format::blocked; - } - }; - - auto out_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(out.dims()), - mkldnn_fmt(out.dims().size())); - - out.set_mkldnn_prim_desc(out_mem_pd); + out.set_layout(DataLayout::kMKLDNN); + out.set_format(out_format); #endif } else { // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 88f5b757a8..a3c1063ce9 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include <cstring> #include <memory> #include <typeindex> +#include <utility> #include <vector> #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/ddim.h" @@ -27,10 +28,6 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_utils.h" -#endif - namespace paddle { namespace framework { @@ -41,34 +38,10 @@ class Tensor { #ifdef PADDLE_WITH_MKLDNN public: - // TODO(jczaja): This is depracted and will be removed - inline mkldnn::memory::format format() const { - if (layout_ == DataLayout::kMKLDNN) { - return static_cast<mkldnn::memory::format>(mem_pd_.desc().data.format); - } else { - return mkldnn::memory::format::format_undef; - } - } + inline mkldnn::memory::format format() const { return format_; } - // TODO(jczaja): This is depracted and will be removed - inline void set_format( - const mkldnn::memory::format fmt, - mkldnn::memory::data_type data_type = mkldnn::memory::f32) { - mem_pd_ = paddle::platform::create_prim_desc_from_format( - paddle::framework::vectorize2int(dims()), fmt, data_type); - layout_ = DataLayout::kMKLDNN; - } - - inline mkldnn::memory::primitive_desc get_mkldnn_prim_desc() const { - return mem_pd_; - } - - inline void set_mkldnn_prim_desc( - const mkldnn::memory::primitive_desc& mem_pd) { - // Internally MKL-DNN is just copying (increasing reference counter) - // to shared_ptr. So asignment should be quite cheap - mem_pd_ = mem_pd; - layout_ = DataLayout::kMKLDNN; + inline void set_format(const mkldnn::memory::format format) { + format_ = format; } protected: @@ -76,9 +49,12 @@ class Tensor { * @brief the detail format of memory block which have layout as kMKLDNN * * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, - * nChw16c, etc. For a MKLDNN memory block, we store memory descriptor + * nChw16c, etc. For a MKLDNN memory block, layout will be set as + * DataLayout::kMKLDNN meanwhile detail memory format will be kept in + * this field. 
*/ - mutable mkldnn::memory::primitive_desc mem_pd_; + + mkldnn::memory::format format_ = mkldnn::memory::format::format_undef; #endif public: diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 5f21dae605..a7f09df491 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -44,11 +44,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } -#ifdef PADDLE_WITH_MKLDNN - if (src.layout() == DataLayout::kMKLDNN) { - dst->set_mkldnn_prim_desc(src.get_mkldnn_prim_desc()); - } -#endif memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, boost::get<platform::CPUPlace>(src_place), src_ptr, size); } diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 7aaa607f15..6a6741d8fc 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -77,7 +77,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> { } else { functor.RunMidWise(n, pre, post); } - z->set_mkldnn_prim_desc(x->get_mkldnn_prim_desc()); + z->set_layout(DataLayout::kMKLDNN); + z->set_format(x->format()); } else { PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && x->format() != memory::format::format_undef, @@ -115,8 +116,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> { auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd); // create mkldnn memory for dst - auto dst_mem_pd = sum_pd.dst_primitive_desc(); - memory dst_memory = memory(dst_mem_pd, z_data); + memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data); std::vector<primitive::at> inputs; inputs.push_back(srcs[0]); @@ -129,7 +129,9 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> { pipeline.push_back(sum_prim); stream(stream::kind::eager).submit(pipeline).wait(); - z->set_mkldnn_prim_desc(dst_mem_pd); + z->set_layout(DataLayout::kMKLDNN); + z->set_format( + (memory::format)dst_memory.get_primitive_desc().desc().data.format); } } }; @@ -150,19 +152,24 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> { auto* out = dout; auto *x = dout, *y = dout; + auto set_mkldnn_format = [](Tensor* in, const Tensor* out) { + in->set_layout(DataLayout::kMKLDNN); + in->set_format(out->format()); + }; + if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) { if (dx->dims() == dy->dims()) { auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx); if (dx) { blas.VCOPY(dout->numel(), dout->data<T>(), dx->mutable_data<T>(ctx.GetPlace())); - dx->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc()); + set_mkldnn_format(dx, dout); } if (dy) { blas.VCOPY(dout->numel(), dout->data<T>(), dy->mutable_data<T>(ctx.GetPlace())); - dy->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc()); + set_mkldnn_format(dy, dout); } } } else { diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 43559940d9..5b7505f3c4 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -96,7 +96,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx, std::vector<int> src_tz = framework::vectorize2int(x->dims()); - auto src_format = x->format(); + auto src_format = + src_tz.size() == 2 ? 
mkldnn::memory::format::nc : x->format(); const std::string key = gethash(src_tz, algorithm); const std::string key_src_data = @@ -126,8 +127,10 @@ void eltwise_forward(const framework::ExecutionContext &ctx, if (p_fwd == nullptr) { // create mkldnn memory for input X + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType<T>(), src_format); auto src_memory = std::shared_ptr<memory>( - new memory(x->get_mkldnn_prim_desc(), to_void_cast(x_data))); + new memory({src_md, mkldnn_engine}, to_void_cast(x_data))); // save src_memory to be referred in backward path dev_ctx.SetBlob(key_src_mem, src_memory); @@ -174,7 +177,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx, pipeline.push_back(*p_fwd); stream(stream::kind::eager).submit(pipeline).wait(); - y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc()); + y->set_layout(DataLayout::kMKLDNN); + y->set_format(GetMKLDNNFormat(*dst_memory)); } template <typename T> @@ -192,6 +196,9 @@ void eltwise_grad(const framework::ExecutionContext &ctx, std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims()); + auto diff_y_format = + diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format(); + const std::string key = gethash(diff_dst_tz, algorithm); const std::string key_src_data = key + ctx.op().Input("Out") + "@eltwise_fwd_src_data"; @@ -203,8 +210,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx, key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem"; const std::string key_fwd_pd = key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd"; - const std::string key_with_layouts = key + std::to_string(*p_src_layout) + - "-" + std::to_string(diff_y->format()); + const std::string key_with_layouts = + key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format); const std::string key_diff_src_mem = key_with_layouts + "@eltwise_diff_src_mem"; const std::string key_diff_dst_mem = @@ -227,8 +234,10 @@ void eltwise_grad(const framework::ExecutionContext &ctx, if (p_grad == nullptr) { // create mkldnn memory for input diff_y + auto diff_dst_md = platform::MKLDNNMemDesc( + diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format); auto diff_dst_memory = std::shared_ptr<memory>( - new memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data))); + new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data))); dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory); // retrieve eltwise primitive desc from device context @@ -272,7 +281,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx, pipeline.push_back(*p_grad); stream(stream::kind::eager).submit(pipeline).wait(); - diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format(GetMKLDNNFormat(*diff_src_memory)); } template <typename T, mkldnn::algorithm algorithm> diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index 04e45d4853..bddca232e6 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -206,14 +206,17 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; // create mkldnn memory from input x tensor + mkldnn::memory::format input_format = + platform::MKLDNNFormatForSize(src_tz.size(), x->format()); // keys for backward pass const std::string key = BatchNormMKLDNNHandler::GetHash( - src_tz, epsilon, 
flags, global_stats, x->format(), + src_tz, epsilon, flags, global_stats, input_format, ctx.op().Output("SavedMean")); const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; - auto user_src_md = x->get_mkldnn_prim_desc().desc(); + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType<T>(), input_format); // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>; @@ -227,8 +230,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine, key); - auto src_memory = handler.AcquireSrcMemory(x->get_mkldnn_prim_desc(), - to_void_cast(x_data)); + auto src_memory = + handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data)); // crate mkldnn memory for weights(scale/shift) auto scaleshift_memory = @@ -262,7 +265,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { variance_memory, false); } - y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc()); + y->set_layout(DataLayout::kMKLDNN); + y->set_format(platform::GetMKLDNNFormat(*dst_memory)); std::vector<mkldnn::primitive> pipeline; pipeline.push_back(*batch_norm_p); @@ -332,6 +336,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>; + mkldnn::memory::format dst_format = + platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); + mkldnn::memory::format input_format = platform::MKLDNNFormatForSize(src_tz.size(), x->format()); @@ -339,14 +346,14 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { // keys from forward pass const std::string key = BatchNormMKLDNNHandler::GetHash( - src_tz, epsilon, flags, false, x->format(), + src_tz, epsilon, flags, false, input_format, ctx.op().Input("SavedMean")); const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; // keys for primitives reuse const std::string key_with_hash = key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false, - x->format()); + input_format); const std::string key_batch_norm_bwd_p = key_with_hash + "@batch_norm_bwd_p"; const std::string key_batch_norm_src_mem_p = @@ -366,8 +373,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { primitive reorder_diff_dst; bool is_diff_dst_reordered = false; - auto user_diff_dst_memory = - memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data)); + auto user_diff_dst_memory = memory( + {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, + to_void_cast(diff_y_data)); // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; @@ -451,7 +459,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory); // set layout/format of output tensors - diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format); } else { // primitives already exist UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data)); @@ -476,7 +487,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { } // set layout/format of output tensors - diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); + 
diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format); } // execute optional reorder and batch_norm backward primitive diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 97387af92f..50fe2e6e4c 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -210,7 +210,8 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> { stream(stream::kind::eager).submit({*concat_p}).wait(); - output->set_mkldnn_prim_desc(concat_pd->dst_primitive_desc()); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetDstMemFormat(*concat_pd)); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 8d96ae7e42..5e4d79f1c3 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -96,8 +96,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr; auto* output = ctx.Output<Tensor>("Output"); - PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN); - PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN); + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, "Input must be with 4 or 5 dimensions, i.e. 
NCHW or NCDHW"); PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, @@ -144,19 +148,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { std::vector<primitive> pipeline; - // For convolution with groups we need to recreate primitive descriptor - // as Paddle tensor is not having group dims while mkldnn treats - // group as another dimensions - mkldnn::memory::primitive_desc user_weights_mpd = - filter->get_mkldnn_prim_desc(); - if (g > 1) { - mkldnn::memory::format weights_format = - GetWeightsFormat(filter->format(), g, is_conv3d); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format); - user_weights_mpd = - mkldnn::memory::primitive_desc(user_weights_md, mkldnn_engine); - } + auto src_format = input->format(); + mkldnn::memory::format weights_format = + GetWeightsFormat(filter->format(), g, is_conv3d); + + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType<T>(), src_format); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format); /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -166,7 +165,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); - mkldnn::memory::format weights_format = mkldnn::memory::format::any; + weights_format = mkldnn::memory::format::any; // Check the format for user's special output if (chosen_memory_format != mkldnn::memory::format::any) { if (is_conv3d) { @@ -206,10 +205,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); // create mkldnn memory from input tensors (data/weights) - auto user_src_memory_p = handler.AcquireSrcMemory( - input->get_mkldnn_prim_desc(), to_void_cast<T>(input_data)); + auto user_src_memory_p = + handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data)); auto user_weights_memory_p = handler.AcquireWeightsMemory( - user_weights_mpd, to_void_cast<T>(filter_data)); + user_weights_md, to_void_cast<T>(filter_data)); // create reorder primitive if the input format is not the preferred one auto src_memory_p = @@ -282,7 +281,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { pipeline.push_back(*conv_p); stream(stream::kind::eager).submit(pipeline).wait(); - output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc()); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(*dst_memory_p)); } void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { const bool is_test = ctx.Attr<bool>("is_test"); @@ -948,8 +948,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { // push primitive to stream and wait until it's executed pipeline.push_back(*conv_bwd_weights_p); - auto filter_grad_mpd = diff_weights_memory_p->get_primitive_desc(); - filter_grad->set_mkldnn_prim_desc(filter_grad_mpd); + filter_grad->set_layout(DataLayout::kMKLDNN); + filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p)); } if (input_grad) { @@ -972,7 +972,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { pipeline.push_back(*conv_bwd_data_p); - input_grad->set_mkldnn_prim_desc(diff_src_memory_p->get_primitive_desc()); + input_grad->set_layout(DataLayout::kMKLDNN); + 
input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); } stream(stream::kind::eager).submit(pipeline).wait(); } diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 79a0c5c768..317d4cebe2 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -221,7 +221,8 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> { pipeline.push_back(*conv_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); - output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc()); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } private: diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index d01e8dbf4c..76b00b396c 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -42,12 +42,8 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> { // The format of output is set as the mkldnn's format // TODO(@mozga-intel) The format of matrix sets inside the another layers. - // TODO(jczaja): Remove this hack after checking performance on block layout - - auto tensor_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(tensor->dims()), - mkldnn::memory::format::oihw); - tensor->set_mkldnn_prim_desc(tensor_mem_pd); + tensor->set_layout(DataLayout::kMKLDNN); + tensor->set_format(mkldnn::memory::format::oihw); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index 4ff27ab122..097ba01d40 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -81,7 +81,10 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { auto e_mid = framework::EigenTensor<T, 4>::From(*mid); e_mid = e_mid.constant(k); - auto src_md = x->get_mkldnn_prim_desc().desc(); + auto dims = paddle::framework::vectorize2int(x->dims()); + + auto src_md = paddle::platform::MKLDNNMemDesc( + dims, mkldnn::memory::data_type::f32, x->format()); auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, mkldnn::lrn_across_channels, @@ -91,7 +94,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { beta, k}; - auto src_memory_pd = x->get_mkldnn_prim_desc(); + auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; if (!is_test) { const std::string key = ctx.op().Output("Out"); @@ -108,15 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { src_memory->set_data_handle( static_cast<void*>(const_cast<T*>(input_data))); - auto dst_memory_pd = forward_pd->dst_primitive_desc(); - auto dst_memory = - mkldnn::memory(dst_memory_pd, static_cast<void*>(output_data)); + auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(), + static_cast<void*>(output_data)); auto workspace_memory = insert_to_context<mkldnn::memory>( key_workspace_memory, dev_ctx, forward_pd->workspace_primitive_desc()); run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); - out->set_mkldnn_prim_desc(dst_memory_pd); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } else { auto forward_pd = 
mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; @@ -124,12 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))}; auto workspace_memory = mkldnn::memory{forward_pd.workspace_primitive_desc()}; - auto dst_memory_pd = forward_pd.dst_primitive_desc(); auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(), static_cast<void*>(output_data)); run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); - out->set_mkldnn_prim_desc(dst_memory_pd); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } } }; diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 0ce5522194..dc1176f084 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -158,14 +158,6 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> { auto softmax_p = handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p); - // We cannot use softmax_dst_memory_p to get prim desc as - // it contains flattened dims (2D) while output tensor can - // have 2,3,4+ dims - auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(output->dims()), - mkldnn::memory::format::blocked); - output->set_mkldnn_prim_desc(output_mem_pd); - std::vector<primitive> pipeline{ *(static_cast<softmax_forward::primitive*>(softmax_p.get()))}; stream(stream::kind::eager).submit(pipeline).wait(); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index aef5b7d431..6f64157b64 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -106,12 +106,12 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> { memory::desc(dst_tz, memory::data_type::f32, memory::format::any); auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd); - auto dst_mem_pd = sum_pd.dst_primitive_desc(); + std::shared_ptr<memory> dst_mem; if (in_place) { - dst_mem.reset(new memory(dst_mem_pd)); + dst_mem.reset(new memory(sum_pd.dst_primitive_desc())); } else { - dst_mem.reset(new memory(dst_mem_pd, output_data)); + dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data)); } std::vector<mkldnn::primitive::at> inputs; for (size_t i = 0; i < srcs_mem.size(); ++i) { @@ -136,7 +136,8 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> { if (in_place) pipeline.push_back(reorder_prim); stream(stream::kind::eager).submit(pipeline).wait(); - output->set_mkldnn_prim_desc(dst_mem_pd); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(output_format); } else { // Fallback to naive version // TODO(@mozga-intel) Add MKLDNN SelectedRows & LoDTensorArray support SumKernel<CPUDeviceContext, T> reference_kernel; diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index 4debc7ca5e..95cee806ac 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> { mkldnn_engine, key); auto transpose_src_memory_p = handler.AcquireSrcMemory( - input->get_mkldnn_prim_desc(), platform::to_void_cast<T>(input_data)); + input->format(), 
platform::to_void_cast<T>(input_data)); auto transpose_dst_memory_p = handler.AcquireDstMemory(output, ctx.GetPlace()); auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, @@ -62,14 +62,8 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> { pipeline.push_back(*transpose_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); - // Transpose did change logical dimensions of Tensor, but reorder does not. - // Reorder does change only physical layout eg. format , strides - // so we need to create new primitive descriptor with changed logical layout - // so it match output shape - auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(output->dims()), - mkldnn::memory::format::blocked); - output->set_mkldnn_prim_desc(output_mem_pd); + output->set_layout(DataLayout::kNCHW); + output->set_format(mkldnn::memory::format::format_undef); } }; @@ -134,9 +128,8 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx, mkldnn_engine, key); - auto transpose_src_memory_p = - handler.AcquireSrcMemory(out_grad->get_mkldnn_prim_desc(), - platform::to_void_cast<T>(out_grad_data)); + auto transpose_src_memory_p = handler.AcquireSrcMemory( + out_grad->format(), platform::to_void_cast<T>(out_grad_data)); auto transpose_dst_memory_p = handler.AcquireDstMemory(x_grad, ctx.GetPlace()); auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, @@ -145,15 +138,6 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { std::vector<mkldnn::primitive> pipeline; pipeline.push_back(*transpose_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); - - // Transpose did change logical dimensions of Tensor, but reorder does not. - // Reorder does change only physical layout eg. format , strides - // so we need to create new primitive descriptor with changed logical layout - // so it match output shape - auto x_grad_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(x_grad->dims()), - mkldnn::memory::format::blocked); - x_grad->set_mkldnn_prim_desc(x_grad_mem_pd); } }; diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 4fa6774f02..ecaad4ec07 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include <memory> #include <string> #include <vector> #include "paddle/fluid/framework/data_layout_transform.h" @@ -39,45 +40,6 @@ class MKLDNNHandler { return this->AcquireMemory(md, ptr, "@user_src_mem_p"); } - // TODO(jczaja): extract common part and make AcquireMemory - std::shared_ptr<mkldnn::memory> AcquireSrcMemory( - const mkldnn::memory::primitive_desc& mpd, void* ptr) { - auto local_key = key_ + "@user_src_mem_p"; - auto mem_p = - std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), - " find mem primitive in device context"); - if (mem_p == nullptr) { - mem_p = std::make_shared<mkldnn::memory>(mpd, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - // Mark that reusing happenned. All primitives from operator instance - // should be reused or none of them. 
So we check consistency - is_reusing_ = true; - } - return mem_p; - } - - std::shared_ptr<mkldnn::memory> AcquireWeightsMemory( - const mkldnn::memory::primitive_desc& mpd, void* ptr) { - auto local_key = key_ + "@user_weights_mem_p"; - auto mem_p = - std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), - " find mem primitive in device context"); - if (mem_p == nullptr) { - mem_p = std::make_shared<mkldnn::memory>(mpd, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - // Mark that reusing happenned. All primitives from operator instance - // should be reused or none of them. So we check consistency - is_reusing_ = true; - } - return mem_p; - } - std::shared_ptr<mkldnn::memory> AcquireWeightsMemory( const mkldnn::memory::desc& md, void* ptr, user_function custom_func = {}) { @@ -315,7 +277,37 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key), dims_(dims), - axis_(axis) {} + axis_(axis), + logical_axis_(dims.size(), 0) {} + + std::shared_ptr<mkldnn::memory> AcquireSrcMemory( + const mkldnn::memory::format& fmt, void* ptr) { + auto local_key = key_ + "@user_src_mem_p"; + auto mem_p = + std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + " find mem primitive in device context"); + if (mem_p == nullptr) { + // Make memory descriptor using input format, unless it + // cannot be trusted (nchw) then make up memory fmt manually + for (size_t i = 0; i < logical_axis_.size(); ++i) { + logical_axis_[i] = i; + } + auto src_md = fmt != mkldnn::memory::format::nchw + ? platform::MKLDNNMemDesc( + dims_, platform::MKLDNNGetDataType<float>(), fmt) + : Axis2MemoryDesc(dims_, logical_axis_); + mem_p = std::make_shared<mkldnn::memory>( + mkldnn::memory::primitive_desc{src_md, engine_}, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output, platform::Place place) { @@ -400,6 +392,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { private: std::vector<int> dims_; std::vector<int> axis_; + std::vector<int> logical_axis_; }; template <class forward_t, class backward_data_t, class backward_weights_t> diff --git a/paddle/fluid/platform/mkldnn_utils.h b/paddle/fluid/platform/mkldnn_utils.h deleted file mode 100644 index 8c511f97d1..0000000000 --- a/paddle/fluid/platform/mkldnn_utils.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include <mkldnn.h> -#include <string> - -namespace paddle { -namespace platform { - -inline mkldnn::memory::primitive_desc create_prim_desc_from_dims( - const std::vector<int>& ltz, mkldnn::memory::format fmt, - mkldnn::memory::data_type data_type = mkldnn::memory::data_type::f32) { - mkldnn_memory_desc_t mem_fmt; - - mem_fmt.primitive_kind = mkldnn_memory; - mem_fmt.ndims = ltz.size(); - for (unsigned int i = 0; i < ltz.size(); ++i) { - mem_fmt.dims[i] = ltz[i]; // logical dimensions (nchw format, - // regardless physical layout) - } - mem_fmt.data_type = static_cast<mkldnn_data_type_t>(data_type); - mem_fmt.format = static_cast<mkldnn_memory_format_t>(fmt); - - unsigned int total_stride = 1; - for (int i = ltz.size() - 1; i >= 0; --i) { - mem_fmt.layout_desc.blocking.padding_dims[i] = - ltz[i]; // logical dimensions (nchw format, regardless physical - // layout) - mem_fmt.layout_desc.blocking.block_dims[i] = 1; - mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset - mem_fmt.layout_desc.blocking.strides[0][i] = total_stride; - mem_fmt.layout_desc.blocking.strides[1][i] = 1; - total_stride *= ltz[i]; - } - mem_fmt.layout_desc.blocking.offset_padding = 0; // no initial offset - - auto& pool = platform::DeviceContextPool::Instance(); - auto place = paddle::platform::CPUPlace(); - auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place)); - auto& cpu_engine = dev_ctx->GetEngine(); - return mkldnn::memory::primitive_desc(mem_fmt, cpu_engine); -} - -inline mkldnn::memory::primitive_desc create_prim_desc_from_format( - const std::vector<int>& ltz, const mkldnn::memory::format format, - const mkldnn::memory::data_type data_type) { - auto md = mkldnn::memory::desc({ltz}, data_type, format); - auto& pool = platform::DeviceContextPool::Instance(); - auto place = paddle::platform::CPUPlace(); - auto dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place)); - PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device"); - auto& cpu_engine = dev_ctx->GetEngine(); - return mkldnn::memory::primitive_desc(md, cpu_engine); -} - -} // namespace platform -} // namespace paddle From 1096746cbfbb2a5cba835e284b3054f66db0ea85 Mon Sep 17 00:00:00 2001 From: chengduo <zhaochengduo@baidu.com> Date: Thu, 28 Mar 2019 09:59:54 -0500 Subject: [PATCH 19/19] Fuse Adam And SGD ops (#15933) * fuse optimizer --- paddle/fluid/framework/details/CMakeLists.txt | 9 +- .../alloc_continuous_space_for_grad_pass.cc | 48 ++-- .../framework/details/broadcast_op_handle.cc | 13 +- .../fluid/framework/details/build_strategy.cc | 52 +++- .../fluid/framework/details/build_strategy.h | 3 +- .../framework/details/fuse_adam_op_pass.cc | 199 +++++++++++++++ .../framework/details/fuse_adam_op_pass.h | 55 ++++ .../details/fuse_optimizer_op_pass.cc | 240 ++++++++++++++++++ .../details/fuse_optimizer_op_pass.h | 75 ++++++ .../framework/details/fuse_sgd_op_pass.cc | 74 ++++++ .../framework/details/fuse_sgd_op_pass.h | 50 ++++ .../details/fused_all_reduce_op_handle.cc | 29 ++- .../details/multi_devices_graph_pass.h | 5 +- .../framework/details/multi_devices_helper.h | 26 +- paddle/fluid/framework/tensor.cc | 2 +- paddle/fluid/framework/tensor.h | 2 +- .../operators/alloc_continuous_space_op.cc | 45 +++- paddle/fluid/pybind/pybind.cc | 9 + .../unittests/parallel_executor_test_base.py | 2 + .../test_alloc_continuous_space_op.py | 43 +++- .../unittests/test_fuse_optimizer_pass.py | 135 ++++++++++ .../unittests/test_parallel_executor_crf.py | 115 +++++---- 
.../test_parallel_executor_dry_run.py | 17 +- 23 files changed, 1101 insertions(+), 147 deletions(-) create mode 100644 paddle/fluid/framework/details/fuse_adam_op_pass.cc create mode 100644 paddle/fluid/framework/details/fuse_adam_op_pass.h create mode 100644 paddle/fluid/framework/details/fuse_optimizer_op_pass.cc create mode 100644 paddle/fluid/framework/details/fuse_optimizer_op_pass.h create mode 100644 paddle/fluid/framework/details/fuse_sgd_op_pass.cc create mode 100644 paddle/fluid/framework/details/fuse_sgd_op_pass.h create mode 100644 python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 046ec6978a..d4939779a2 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -10,7 +10,10 @@ cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framewor cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) + cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper) +cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper) +cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) @@ -104,5 +107,7 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass fuse_elewise_add_act_pass multi_batch_merge_pass - fuse_relu_depthwise_conv_pass - memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass) + fuse_relu_depthwise_conv_pass + memory_optimize_pass lock_free_optimize_pass + alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass + fuse_adam_op_pass fuse_sgd_op_pass) diff --git a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc index e195e93fb8..8e8258ffb1 100644 --- a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc +++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_registry.h" + DEFINE_uint32(fuse_parameter_memory_size, 0, // 0 KB "fuse_parameter_memory_size is up limited memory size " "of one group parameters' gradient which is the input " @@ -105,20 +106,29 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { auto ele_dtype = iter->second->Var()->GetDataType(); if (dtype == kDefaultDtype) { dtype = ele_dtype; - PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype); + PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype, + "The data type should not be bool."); } - PADDLE_ENFORCE_EQ(ele_dtype, dtype); + PADDLE_ENFORCE_EQ(ele_dtype, dtype, + "The data type of input is not consistent."); } - // Create the fused variable name. + // Create a FusedVarsSet to avoid duplicating names for fused_var in other + // pass. 
if (!result.Has(kFusedVars)) { result.Set(kFusedVars, new FusedVars); } - const std::string prefix(kFusedVarNamePrefix); - // The fused_var_name should be unique. - auto fused_var_name = prefix + "GRAD@" + params_grads[0].second; + // the kFusedGrads is used be fuse_optimizer_op_pass. + result.Set(kFusedGrads, new FusedGrads); + + // the fused_var_name should be unique, so it appends + // params_grads.begin()->second. + auto fused_var_name = std::string(kFusedVarNamePrefix) + "@GRAD@" + + params_grads.begin()->second; + result.Get<FusedGrads>(kFusedGrads) = fused_var_name; auto &fused_var_set = result.Get<FusedVars>(kFusedVars); - PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); + PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0, + "%s is duplicate in FusedVars.", fused_var_name); fused_var_set.insert(fused_var_name); InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars, @@ -295,17 +305,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { return type == proto::VarType::LOD_TENSOR; } - void AppendAllocSpaceForVarsOp(const std::vector<std::string> ¶ms_name, - const std::vector<std::string> &grads_name, - const std::string &fused_var_name, - BlockDesc *global_block) const { - auto op_desc = global_block->AppendOp(); - op_desc->SetType("alloc_continuous_space"); - op_desc->SetInput("Input", params_name); - op_desc->SetOutput("Output", grads_name); - op_desc->SetOutput("FusedOutput", {fused_var_name}); - } - void RecordParamsAndGrads(ir::Node *node, ParamsAndGrads *params_grads) const { try { @@ -358,6 +357,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { } } + // Alloc continuous space for vars. std::vector<std::string> grads_name; std::vector<std::string> params_name; grads_name.reserve(params_grads.size()); @@ -370,7 +370,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name, program_desc.MutableBlock(0)); - // Run Only Once Programs for (size_t i = 0; i < local_scopes.size(); ++i) { for (auto &op_desc : program_desc.Block(0).AllOps()) { auto op = OpRegistry::CreateOp(*op_desc); @@ -378,6 +377,17 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { } } } + + void AppendAllocSpaceForVarsOp(const std::vector<std::string> ¶ms_name, + const std::vector<std::string> &grads_name, + const std::string &fused_var_name, + BlockDesc *global_block) const { + auto op_desc = global_block->AppendOp(); + op_desc->SetType("alloc_continuous_space"); + op_desc->SetInput("Input", params_name); + op_desc->SetOutput("Output", grads_name); + op_desc->SetOutput("FusedOutput", {fused_var_name}); + } }; } // namespace details diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index fdff83b928..752c932a21 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -27,20 +27,17 @@ void BroadcastOpHandle::RunImpl() { if (places_.size() == 1) return; // The input and output may have dummy vars. 
- VarHandle *in_var_handle; - { - auto in_var_handles = DynamicCast<VarHandle>(inputs_); - PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL, - "The number of input should be one."); - in_var_handle = in_var_handles[0]; - } - + auto in_var_handles = DynamicCast<VarHandle>(inputs_); auto out_var_handles = DynamicCast<VarHandle>(outputs_); + PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL, + "The number of input should be one."); PADDLE_ENFORCE_EQ( out_var_handles.size(), places_.size(), "The number of output should equal to the number of places."); + VarHandle *in_var_handle = in_var_handles[0]; + WaitInputVarGenerated(); std::vector<const Scope *> var_scopes; diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 078403f30f..df69b11ec6 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include <glog/logging.h> #include <memory> #include <utility> - #include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" @@ -82,23 +81,43 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("inplace_pass"); } - if (strategy.fuse_elewise_add_act_ops_) { + if (strategy_.fuse_elewise_add_act_ops_) { VLOG(10) << "Add fuse_elewise_add_act_pass"; AppendPass("fuse_elewise_add_act_pass"); } // for single card training, fuse_all_reduce_ops is unnecessary. // alloc_continuous_space_for_grad_pass should be before of MultiDevPass. - if (strategy.fuse_all_reduce_ops_) { + if (strategy_.fuse_all_reduce_ops_) { VLOG(10) << "Add alloc_continuous_space_for_grad_pass"; AppendPass("alloc_continuous_space_for_grad_pass"); } + if (strategy_.fuse_all_optimizer_ops_) { + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce || + strategy_.is_distribution_) { + VLOG(3) + << "Currently, fuse_all_optimizer_ops only works under AllReduce " + "mode."; + strategy_.fuse_all_optimizer_ops_ = false; + } else { + VLOG(10) << "Add alloc_continuous_space_for_grad_pass"; + AppendPass("alloc_continuous_space_for_grad_pass"); + // NOTE: fuse_all_xx_ops will count the number of xx operator first, + // if the number is zero, fuse_all_reduce_ops will do nothing. + // Currently, only one type of optimization algorithm can be fused. + VLOG(10) << "Add fuse_adam_op_pass"; + AppendPass("fuse_adam_op_pass"); + VLOG(10) << "Add fuse_sgd_op_pass"; + AppendPass("fuse_sgd_op_pass"); + } + } + // Add a graph viz pass to record a graph. if (!strategy.debug_graphviz_path_.empty()) { auto viz_pass = AppendPass("graph_viz_pass"); const std::string graph_path = string::Sprintf( - "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); + "%s%s", strategy_.debug_graphviz_path_.c_str(), "_fused_graph"); viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path)); } @@ -118,14 +137,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // the de-fact IR, any reuse on Graph is meaningless. // A side-effect of that, memory optimize cannot forsee the fetched vars // , so fetchlist should be set persistable before call the Run interface. 
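The branch above decides which extra passes run when fuse_all_optimizer_ops is requested: under Reduce mode or distributed training the flag is switched off, otherwise the gradient-fusion pass and both optimizer-fusion passes are appended. A rough Python model of that decision (the helper function is illustrative only):

def optimizer_fuse_passes(fuse_all_optimizer_ops, reduce_mode, is_distribution):
    # Optimizer-op fusion is only supported for AllReduce, non-distributed
    # training; in every other configuration the flag is reset to False.
    if not fuse_all_optimizer_ops:
        return []
    if reduce_mode == 'Reduce' or is_distribution:
        return []
    return ['alloc_continuous_space_for_grad_pass',
            'fuse_adam_op_pass',
            'fuse_sgd_op_pass']

print(optimizer_fuse_passes(True, 'AllReduce', False))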
- if (strategy.memory_optimize_) { + if (strategy_.memory_optimize_) { VLOG(10) << "Add memory_optimize_pass"; AppendPass("memory_optimize_pass"); } - AppendMultiDevPass(strategy); + AppendMultiDevPass(strategy_); - if (strategy.fuse_all_reduce_ops_) { + if (strategy_.fuse_all_reduce_ops_) { // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator // first, if the number is zero, fuse_all_reduce_ops will do nothing. VLOG(10) << "Add fuse_all_reduce_op_pass"; @@ -151,7 +170,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("all_reduce_deps_pass"); } - if (SeqOnlyAllReduceOps(strategy)) { + if (SeqOnlyAllReduceOps(strategy_)) { VLOG(10) << "Add all_reduce_deps_pass"; AppendPass("all_reduce_deps_pass"); } @@ -165,7 +184,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Convert graph to run on multi-devices. void AppendMultiDevPass(const BuildStrategy &strategy) { ir::Pass *multi_devices_pass = nullptr; - if (strategy_.is_distribution_) { + if (strategy.is_distribution_) { VLOG(10) << "Add dist_multi_devices_pass"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else { @@ -235,17 +254,22 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, pass->Erase(kNCCLCtxs); pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx); #endif - } else if (pass->Type() == "fuse_all_reduce_op_pass") { + } else if (pass->Type() == "alloc_continuous_space_for_grad_pass" || + pass->Type() == "fuse_adam_op_pass" || + pass->Type() == "fuse_sgd_op_pass" || + pass->Type() == "fuse_all_reduce_op_pass") { pass->Erase(kPlaces); pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places); pass->Erase(kLocalScopes); pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes, &local_scopes); + if (pass->Type() == "fuse_all_reduce_op_pass") { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; - pass->Erase(kNCCLCtxs); - pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx); + platform::NCCLContextMap *nctx = use_cuda ? 
nccl_ctxs : nullptr; + pass->Erase(kNCCLCtxs); + pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx); #endif + } } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") { pass->Erase(kPlaces); pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places); @@ -294,4 +318,6 @@ USE_PASS(inplace_pass); USE_PASS(lock_free_optimize_pass); USE_PASS(alloc_continuous_space_for_grad_pass); USE_PASS(graph_to_program_pass); +USE_PASS(fuse_adam_op_pass); +USE_PASS(fuse_sgd_op_pass); USE_PASS(fuse_all_reduce_op_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 9587a6f0f9..85f328b7c4 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -18,7 +18,6 @@ #include <string> #include <utility> #include <vector> - #include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -76,6 +75,8 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; + bool fuse_all_optimizer_ops_{false}; + bool fuse_all_reduce_ops_{false}; bool fuse_relu_depthwise_conv_{false}; diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.cc b/paddle/fluid/framework/details/fuse_adam_op_pass.cc new file mode 100644 index 0000000000..0ef75e3192 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_adam_op_pass.cc @@ -0,0 +1,199 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fuse_adam_op_pass.h" +#include <algorithm> +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace details { + +const std::string FuseAdamOpPass::GetOpType() const { return "adam"; } + +const std::vector<std::string> FuseAdamOpPass::GetAuxiliaryVarNames() const { + return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"}; +} + +void FuseAdamOpPass::FuseOptimizerOps( + const std::unordered_map<std::string, std::vector<std::string>> + &aux_var_set, + const std::unordered_map<std::string, std::string> &fused_vars_name, + const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const { + FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph); + FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"), + adam_ops, graph); + FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"), + adam_ops, graph); +} + +void FuseAdamOpPass::FuseAdamOps( + const std::unordered_map<std::string, std::vector<std::string>> &vars_set, + const std::unordered_map<std::string, std::string> &fused_vars_name, + const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(adam_ops.size(), static_cast<size_t>(0)); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. 
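The checks that follow enforce that every adam op to be fused carries identical hyperparameters (beta1, beta2, epsilon, lazy_mode, and so on), since the single fused op can only hold one value per attribute. A small self-contained sketch of the same consistency check (the attribute names match the op, the helper itself is hypothetical):

def check_fusable(adam_attrs):
    # adam_attrs: one dict of attributes per adam op, e.g.
    # {'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-8, 'lazy_mode': False}.
    first = adam_attrs[0]
    for attrs in adam_attrs[1:]:
        for key, value in first.items():
            if attrs[key] != value:
                raise ValueError('attribute %s differs, ops cannot be fused' % key)
    return first

print(check_fusable([{'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-8},
                     {'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-8}]))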
+ int op_role = boost::get<int>( + adam_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + float beta1 = boost::get<float>(adam_ops[0]->Op()->GetAttr("beta1")); + float beta2 = boost::get<float>(adam_ops[0]->Op()->GetAttr("beta2")); + float epsilon = boost::get<float>(adam_ops[0]->Op()->GetAttr("epsilon")); + bool lazy_mode = boost::get<bool>(adam_ops[0]->Op()->GetAttr("lazy_mode")); + int64_t min_row_size_to_use_multithread = boost::get<int64_t>( + adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread")); + for (auto &adam_op : adam_ops) { + PADDLE_ENFORCE_EQ(beta1, + boost::get<float>(adam_op->Op()->GetAttr("beta1"))); + PADDLE_ENFORCE_EQ(beta2, + boost::get<float>(adam_op->Op()->GetAttr("beta2"))); + PADDLE_ENFORCE_EQ(epsilon, + boost::get<float>(adam_op->Op()->GetAttr("epsilon"))); + PADDLE_ENFORCE_EQ(lazy_mode, + boost::get<bool>(adam_op->Op()->GetAttr("lazy_mode"))); + PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread, + boost::get<int64_t>(adam_op->Op()->GetAttr( + "min_row_size_to_use_multithread"))); + PADDLE_ENFORCE_EQ(op_role, boost::get<int>(adam_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); + } + + // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var + // node. + + VLOG(10) << "Insert adam to graph "; + OpDesc adam_desc(adam_ops[0]->Op()->Block()); + adam_desc.SetType("adam"); + adam_desc.SetInput("Param", {fused_vars_name.at("Param")}); + adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); + adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")}); + adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")}); + // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. + adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate")); + adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow")); + adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow")); + + adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); + adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")}); + adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")}); + adam_desc.SetAttr("beta1", beta1); + adam_desc.SetAttr("beta2", beta2); + adam_desc.SetAttr("epsilon", epsilon); + adam_desc.SetAttr("lazy_mode", lazy_mode); + adam_desc.SetAttr("min_row_size_to_use_multithread", + min_row_size_to_use_multithread); + adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + + auto adam_node = graph->CreateOpNode(&adam_desc); + + InserInputAndOutputForOptOps(adam_ops, adam_node); +} + +void FuseAdamOpPass::FuseScaleOps(const std::vector<std::string> &beta_name, + const std::string &fused_var_name, + const std::vector<ir::Node *> &adam_ops, + ir::Graph *graph) const { + PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size()); + const std::string scale_op_name = "scale"; + + // Get the scale_ops of dealing the adam's beta var. 
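Each adam op keeps Beta1Pow/Beta2Pow accumulators that a following scale op multiplies by the constant beta every step. Once the per-parameter accumulators live in one fused buffer, those per-parameter scale ops can be replaced by a single scale over the whole buffer. A numpy sketch of why the two are equivalent (array names are illustrative):

import numpy as np

beta1 = 0.9
# One beta1_pow accumulator per parameter, and the same values fused into
# a single contiguous buffer.
beta1_pows = [np.array([beta1 ** 3], dtype=np.float32) for _ in range(4)]
fused_beta1_pow = np.concatenate(beta1_pows)

# Unfused: one scale op per accumulator.  Fused: one scale op on the buffer.
unfused = np.concatenate([p * beta1 for p in beta1_pows])
fused = fused_beta1_pow * beta1

assert np.allclose(unfused, fused)
print(fused)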
+ std::vector<ir::Node *> scale_ops; + scale_ops.reserve(beta_name.size()); + for (size_t i = 0; i < adam_ops.size(); ++i) { + auto &beta_1_pow_name = beta_name[i]; + auto beta_pow_iter = std::find_if( + adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(), + [&beta_name, &beta_1_pow_name](ir::Node *var_node) -> bool { + return var_node->Var() && var_node->Var()->Name() == beta_1_pow_name; + }); + PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end()); + + auto beta_pow_node = *beta_pow_iter; + auto scale_op_iter = std::find_if( + beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(), + [&scale_op_name](ir::Node *op_node) -> bool { + return op_node->Op() && op_node->Op()->Type() == scale_op_name; + }); + PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end()); + + scale_ops.emplace_back(*scale_op_iter); + } + PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size()); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. + int op_role = boost::get<int>( + scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + float scale = boost::get<float>(scale_ops[0]->Op()->GetAttr("scale")); + float bias = boost::get<float>(scale_ops[0]->Op()->GetAttr("bias")); + bool bias_after_scale = + boost::get<bool>(scale_ops[0]->Op()->GetAttr("bias_after_scale")); + for (auto &scale_op : scale_ops) { + PADDLE_ENFORCE_EQ(scale, + boost::get<float>(scale_op->Op()->GetAttr("scale"))); + PADDLE_ENFORCE_EQ(bias, boost::get<float>(scale_op->Op()->GetAttr("bias"))); + PADDLE_ENFORCE_EQ( + bias_after_scale, + boost::get<bool>(scale_op->Op()->GetAttr("bias_after_scale"))); + PADDLE_ENFORCE_EQ(op_role, boost::get<int>(scale_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); + } + + // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var + // node. 
+ + VLOG(10) << "Insert fused scale to graph."; + OpDesc scale_desc(scale_ops[0]->Op()->Block()); + scale_desc.SetType("scale"); + scale_desc.SetInput("X", {fused_var_name}); + scale_desc.SetOutput("Out", {fused_var_name}); + scale_desc.SetAttr("scale", scale); + scale_desc.SetAttr("bias", bias); + scale_desc.SetAttr("bias_after_scale", bias_after_scale); + scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + auto scale_node = graph->CreateOpNode(&scale_desc); + + for (auto scale_op : scale_ops) { + // set inputs + scale_node->inputs.insert(scale_node->inputs.begin(), + scale_op->inputs.begin(), scale_op->inputs.end()); + for (auto &input : scale_op->inputs) { + std::replace(input->outputs.begin(), input->outputs.end(), scale_op, + scale_node); + } + // set outputs + scale_node->outputs.insert(scale_node->outputs.begin(), + scale_op->outputs.begin(), + scale_op->outputs.end()); + for (auto &output : scale_op->outputs) { + std::replace(output->inputs.begin(), output->inputs.end(), scale_op, + scale_node); + } + } + + // Delete scale_ops + for (auto &scale_op : scale_ops) { + graph->RemoveNode(scale_op); + } +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_adam_op_pass, paddle::framework::details::FuseAdamOpPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.h b/paddle/fluid/framework/details/fuse_adam_op_pass.h new file mode 100644 index 0000000000..5866c37552 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_adam_op_pass.h @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
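The std::replace loops above show the general rewiring pattern used when several ops collapse into one node: the fused node takes over the union of the old nodes' inputs and outputs, and every variable node that referenced an old op is pointed at the fused op instead. A toy Python model of that rewiring (the Node class and helper are hypothetical stand-ins for IR nodes):

class Node(object):
    def __init__(self, name):
        self.name, self.inputs, self.outputs = name, [], []

def replace_with_fused(var_nodes, old_ops, fused_op):
    # The fused op inherits every input/output of the old ops, and the old
    # ops are swapped out of the variables' adjacency lists, mirroring the
    # std::replace calls in FuseScaleOps and InserInputAndOutputForOptOps.
    for op in old_ops:
        fused_op.inputs.extend(op.inputs)
        fused_op.outputs.extend(op.outputs)
    for var in var_nodes:
        var.inputs = [fused_op if n in old_ops else n for n in var.inputs]
        var.outputs = [fused_op if n in old_ops else n for n in var.outputs]

grad = Node('beta1_pow')
op_a, op_b, fused = Node('scale_a'), Node('scale_b'), Node('fused_scale')
op_a.inputs, op_b.inputs, grad.outputs = [grad], [grad], [op_a, op_b]
replace_with_fused([grad], {op_a, op_b}, fused)
print([n.name for n in grad.outputs])  # ['fused_scale', 'fused_scale']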
+ +#pragma once + +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +class FuseAdamOpPass : public FuseOptimizerOpPass { + private: + virtual const std::string GetOpType() const; + + virtual const std::vector<std::string> GetAuxiliaryVarNames() const; + + // Fuse Adam Ops and Scale Ops which are used to update "Beta1Pow", "Beta2Pow" + virtual void FuseOptimizerOps( + const std::unordered_map<std::string, std::vector<std::string>> &vars_set, + const std::unordered_map<std::string, std::string> &fused_vars_name, + const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const; + + void FuseAdamOps( + const std::unordered_map<std::string, std::vector<std::string>> &vars_set, + const std::unordered_map<std::string, std::string> &fused_vars_name, + const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const; + + void FuseScaleOps(const std::vector<std::string> &aux_var_set, + const std::string &fused_var_name, + const std::vector<ir::Node *> &adam_ops, + ir::Graph *graph) const; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc new file mode 100644 index 0000000000..b49f095d42 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include <algorithm> +#include <unordered_set> +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace details { + +void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { + ir::Graph &result = *graph; + + auto &places = Get<const std::vector<platform::Place>>(kPlaces); + auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes); + + const std::string fuse_op_type = GetOpType(); + const std::vector<std::string> aux_var_names = GetAuxiliaryVarNames(); + + // Step 1: Get the specified op and auxiliary variables. 
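Step 1 below walks the topologically sorted op nodes, keeps every op of the target type, and records, for each auxiliary input slot (Param, Moment1, ...), the argument name that op uses. A compact Python sketch of that collection step, using plain dicts as stand-ins for IR nodes (all names are illustrative):

def collect_optimizer_ops(topo_ops, op_type, aux_var_names):
    # topo_ops: list of {'type': ..., 'inputs': {slot: [arg_name]}} dicts.
    ops = []
    aux_var_set = {name: [] for name in aux_var_names}
    for op in topo_ops:
        if op['type'] != op_type:
            continue
        ops.append(op)
        for name in aux_var_names:
            aux_var_set[name].append(op['inputs'][name][0])
    return ops, aux_var_set

ops, aux = collect_optimizer_ops(
    [{'type': 'adam', 'inputs': {'Param': ['fc_0.w_0'], 'Moment1': ['fc_0.w_0_m1']}},
     {'type': 'scale', 'inputs': {}},
     {'type': 'adam', 'inputs': {'Param': ['fc_0.b_0'], 'Moment1': ['fc_0.b_0_m1']}}],
    'adam', ['Param', 'Moment1'])
print(aux)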
+ std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result); + std::unordered_map<std::string, std::vector<std::string>> aux_var_set; + std::vector<ir::Node *> opt_ops; + for (auto &node : topo_nodes) { + GetSpecifiedOpsAndVars(fuse_op_type, aux_var_names, node, &opt_ops, + &aux_var_set); + } + + VLOG(10) << "Find " << fuse_op_type << " operators: " << opt_ops.size(); + if (opt_ops.size() == 0) { + return; + } + + if (result.Has(kFusedOptType)) { + VLOG(10) + << "Currently only support fusing one type optimizer op. Has fused " + << result.Get<FusedOptType>(kFusedOptType); + return; + } else { + result.Set(kFusedOptType, new FusedOptType); + } + result.Get<FusedOptType>(kFusedOptType) = fuse_op_type; + + // Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be + // initialized in scopes before execution. + if (!result.Has(kFusedVars)) { + result.Set(kFusedVars, new FusedVars); + } + std::unordered_map<std::string, std::string> fused_vars_name; + fused_vars_name.reserve(aux_var_names.size() + 1); + auto &fused_var_set = result.Get<FusedVars>(kFusedVars); + const std::string prefix(kFusedVarNamePrefix); + // NOTE: the fused_var_name should be unique. + for (auto &var_name : aux_var_names) { + auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" + + aux_var_set[var_name][0]; + VLOG(10) << fused_var_name; + fused_vars_name.emplace(var_name, fused_var_name); + PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); + fused_var_set.insert(fused_var_name); + } + + // Step 3: Get the fused Gradient's name + auto ¶ms_grads = result.Get<ParamsAndGrads>(kParamsAndGrads); + if (!result.Has(kFusedGrads)) { + PADDLE_THROW( + "The alloc_continuous_space_for_grad_pass should be called before this " + "pass."); + } + auto &fused_grad = result.Get<FusedGrads>(kFusedGrads); + auto &fused_vars = result.Get<FusedVars>(kFusedVars); + auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad); + PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad."); + fused_vars_name.emplace("Grad", fused_grad); + + // Step 4: Sort the parameters and auxiliary variables according + // to parameters' name to make variables' name correspond correctly. + PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads."); + PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(), + "The size of params_grads and aux_var_set are not equal."); + SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops); + + // Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g. + // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately. + InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names, + aux_var_set, fused_vars_name); + + // Step 6: Fuse optimizer Ops and Scale Ops + FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result); + + // Step 7: Remove optimizer Ops + for (auto &opt_op : opt_ops) { + graph->RemoveNode(opt_op); + } +} + +void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( + const std::vector<platform::Place> &places, + const std::vector<Scope *> &local_scopes, + const std::vector<std::string> &aux_var_names, + const std::unordered_map<std::string, std::vector<std::string>> + &aux_var_set, + const std::unordered_map<std::string, std::string> &fused_vars_name) const { + VLOG(10) << "Init FusedVars."; + // Alloc parameters and auxiliary vars in the respective scope. 
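Step 4 above matters because alloc_continuous_space_for_grad_pass fixed the fusion order of the gradients, while the optimizer ops were discovered in topological order; every auxiliary list has to be permuted so that index i of Param, Moment1, Moment2, ... refers to the same parameter as params_grads[i]. A short sketch of that reordering (helper name is illustrative):

def sort_aux_vars(params_grads, aux_var_set):
    # params_grads: [(param_name, grad_name), ...] in fusion order.
    # aux_var_set:  {'Param': [...], 'Moment1': [...]} in discovery order.
    param_vec = aux_var_set['Param']
    sort_idx = [param_vec.index(p) for p, _ in params_grads]
    return {slot: [names[i] for i in sort_idx]
            for slot, names in aux_var_set.items()}

print(sort_aux_vars(
    [('fc_0.b_0', 'fc_0.b_0@GRAD'), ('fc_0.w_0', 'fc_0.w_0@GRAD')],
    {'Param': ['fc_0.w_0', 'fc_0.b_0'], 'Moment1': ['w_0_m1', 'b_0_m1']}))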
+ size_t idx = local_scopes.size(); + for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend(); + ++iter, --idx) { + auto &scope = *iter; + for (auto &var_name : aux_var_names) { + auto fused_var_name = fused_vars_name.at(var_name); + VLOG(10) << "Init " << fused_var_name; + PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr, + "%s has exist in scope[%d]", fused_var_name, idx); + scope->Var(fused_var_name)->GetMutable<LoDTensor>(); + } + } + + ProgramDesc program_desc; + auto *global_block = program_desc.MutableBlock(0); + for (auto &var_name : aux_var_names) { + AppendAllocContinuousSpace(aux_var_set.at(var_name), + fused_vars_name.at(var_name), true, + global_block); + } + + for (size_t i = 0; i < local_scopes.size(); ++i) { + for (auto &op_desc : global_block->AllOps()) { + auto op = OpRegistry::CreateOp(*op_desc); + op->Run(*local_scopes[i], places[i]); + } + } +} + +void FuseOptimizerOpPass::SortParametersAndAuxVars( + const std::vector<std::pair<std::string, std::string>> ¶ms_grads, + std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set, + std::vector<ir::Node *> *ops) const { + PADDLE_ENFORCE_NE(aux_vars_set->count("Param"), static_cast<size_t>(0)); + auto ¶m_vec = aux_vars_set->at("Param"); + + std::vector<size_t> param_sort_idx; + param_sort_idx.reserve(param_vec.size()); + + for (auto &p_g : params_grads) { + auto iter = std::find(param_vec.begin(), param_vec.end(), p_g.first); + PADDLE_ENFORCE(iter != param_vec.end()); + auto idx = std::distance(param_vec.begin(), iter); + param_sort_idx.emplace_back(idx); + } + + for (auto &aux_vars : *aux_vars_set) { + std::vector<std::string> sorted_vars; + sorted_vars.reserve(aux_vars.second.size()); + for (size_t i = 0; i < aux_vars.second.size(); ++i) { + sorted_vars.emplace_back(aux_vars.second.at(param_sort_idx[i])); + } + std::swap(aux_vars.second, sorted_vars); + + std::stringstream out; + for (auto &var_name : aux_vars.second) { + out << var_name << " "; + } + VLOG(10) << aux_vars.first << ": " << out.str(); + } + + std::vector<ir::Node *> sorted_ops; + sorted_ops.reserve(ops->size()); + for (size_t i = 0; i < ops->size(); ++i) { + sorted_ops.emplace_back(ops->at(param_sort_idx[i])); + } + std::swap(*ops, sorted_ops); +} + +void FuseOptimizerOpPass::GetSpecifiedOpsAndVars( + const std::string &op_type, const std::vector<std::string> &aux_vars_name, + ir::Node *node, std::vector<ir::Node *> *ops, + std::unordered_map<std::string, std::vector<std::string>> *aux_args_name) + const { + if (node->Op()->Type() != op_type) return; + + for (auto &var_n : aux_vars_name) { + auto arg_names = node->Op()->Input(var_n); + PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1)); + (*aux_args_name)[var_n].emplace_back(arg_names[0]); + VLOG(10) << var_n << ", " << arg_names[0]; + } + ops->emplace_back(node); +} + +void FuseOptimizerOpPass::AppendAllocContinuousSpace( + const std::vector<std::string> &args, const std::string &out_arg, + bool copy_data, BlockDesc *global_block) const { + auto op_desc = global_block->AppendOp(); + op_desc->SetType("alloc_continuous_space"); + op_desc->SetInput("Input", args); + op_desc->SetOutput("Output", args); + op_desc->SetOutput("FusedOutput", {out_arg}); + op_desc->SetAttr("copy_data", copy_data); + op_desc->SetAttr("check_name", true); +} + +void FuseOptimizerOpPass::InserInputAndOutputForOptOps( + const std::vector<ir::Node *> &opt_ops, ir::Node *opt_node) const { + std::unordered_set<ir::Node *> inputs; + std::unordered_set<ir::Node *> outputs; + for (auto opt_op : 
opt_ops) { + // set inputs + inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end()); + for (auto &input : opt_op->inputs) { + replace(input->outputs.begin(), input->outputs.end(), opt_op, opt_node); + } + // set outputs + outputs.insert(opt_op->outputs.begin(), opt_op->outputs.end()); + for (auto &output : opt_op->outputs) { + replace(output->inputs.begin(), output->inputs.end(), opt_op, opt_node); + } + } + opt_node->inputs.insert(opt_node->inputs.begin(), inputs.begin(), + inputs.end()); + opt_node->outputs.insert(opt_node->outputs.begin(), outputs.begin(), + outputs.end()); +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.h b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h new file mode 100644 index 0000000000..0240f1594d --- /dev/null +++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h @@ -0,0 +1,75 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <memory> +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +class FuseOptimizerOpPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override; + + protected: + virtual void SortParametersAndAuxVars( + const std::vector<std::pair<std::string, std::string>> ¶ms_grads, + std::unordered_map<std::string, std::vector<std::string>> *aux_var_set, + std::vector<ir::Node *> *ops) const; + + void InserInputAndOutputForOptOps(const std::vector<ir::Node *> &opt_ops, + ir::Node *opt_node) const; + + private: + virtual const std::string GetOpType() const = 0; + + virtual const std::vector<std::string> GetAuxiliaryVarNames() const = 0; + + virtual void FuseOptimizerOps( + const std::unordered_map<std::string, std::vector<std::string>> &vars_set, + const std::unordered_map<std::string, std::string> &fused_vars_name, + const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const = 0; + + void GetSpecifiedOpsAndVars( + const std::string &op_type, const std::vector<std::string> &aux_vars_name, + ir::Node *node, std::vector<ir::Node *> *ops, + std::unordered_map<std::string, std::vector<std::string>> *aux_args_name) + const; + + void AppendAllocContinuousSpace(const std::vector<std::string> &args, + const std::string &out_arg, bool copy_data, + BlockDesc *global_block) const; + + void InitFusedVarsAndAllocSpaceForVars( + const std::vector<platform::Place> &places, + const std::vector<Scope *> &local_scopes, + const std::vector<std::string> &aux_var_names, + const std::unordered_map<std::string, std::vector<std::string>> + &aux_var_set, + const std::unordered_map<std::string, std::string> &fused_vars_name) + const; +}; + +} // namespace details +} 
// namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc new file mode 100644 index 0000000000..f91c21e3cc --- /dev/null +++ b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fuse_sgd_op_pass.h" +#include <algorithm> +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace details { + +const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; } + +const std::vector<std::string> FuseSgdOpPass::GetAuxiliaryVarNames() const { + return {"Param"}; +} + +void FuseSgdOpPass::FuseOptimizerOps( + const std::unordered_map<std::string, std::vector<std::string>> + &aux_var_set, + const std::unordered_map<std::string, std::string> &fused_vars_name, + const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const { + FuseSgdOps(aux_var_set, fused_vars_name, sgd_ops, graph); +} + +void FuseSgdOpPass::FuseSgdOps( + const std::unordered_map<std::string, std::vector<std::string>> &vars_set, + const std::unordered_map<std::string, std::string> &fused_vars_name, + const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast<size_t>(0)); + + // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var + // node. + + int op_role = boost::get<int>( + sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + VLOG(10) << "Insert sgd to graph "; + // Add fused scale + OpDesc Sgd_desc(sgd_ops[0]->Op()->Block()); + Sgd_desc.SetType("sgd"); + Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")}); + Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); + Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); + + // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. + Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate")); + + // NOTE: multi_devices_pass requires that every op should have a role. + Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + + auto sgd_node = graph->CreateOpNode(&Sgd_desc); + + InserInputAndOutputForOptOps(sgd_ops, sgd_node); +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_sgd_op_pass, paddle::framework::details::FuseSgdOpPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.h b/paddle/fluid/framework/details/fuse_sgd_op_pass.h new file mode 100644 index 0000000000..b3aa6a203b --- /dev/null +++ b/paddle/fluid/framework/details/fuse_sgd_op_pass.h @@ -0,0 +1,50 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +class FuseSgdOpPass : public FuseOptimizerOpPass { + private: + virtual const std::string GetOpType() const; + + virtual const std::vector<std::string> GetAuxiliaryVarNames() const; + + // Fuse Sgd Ops + virtual void FuseOptimizerOps( + const std::unordered_map<std::string, std::vector<std::string>> &vars_set, + const std::unordered_map<std::string, std::string> &fused_vars_name, + const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const; + + void FuseSgdOps( + const std::unordered_map<std::string, std::vector<std::string>> &vars_set, + const std::unordered_map<std::string, std::string> &fused_vars_name, + const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 644cd4e150..a57d670f11 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -24,6 +24,19 @@ namespace paddle { namespace framework { namespace details { +// Note(zcd): Addresses should be aligned, otherwise, the results may have +// diff. +static size_t Alignment(size_t size, const platform::Place &place) { + // Allow to allocate the minimum chunk size is 4 KB. + size_t alignment = 1 << 12; + if (platform::is_gpu_place(place)) { + // Allow to allocate the minimum chunk size is 256 B. + alignment = 1 << 8; + } + size_t remaining = size % alignment; + return remaining == 0 ? 
size : size + (alignment - remaining); +} + typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>> GradientAndLoDTensor; @@ -111,10 +124,11 @@ void FusedAllReduceOpHandle::RunImpl() { return grad1.second->data<void>() < grad2.second->data<void>(); }); + size_t size_of_dtype = framework::SizeOfType(dtype); for (size_t k = 1; k < g_tensor.size(); ++k) { const void *cur_address = g_tensor.at(k - 1).second->data<void>(); int64_t len = g_tensor.at(k - 1).second->numel(); - auto offset = len * framework::SizeOfType(dtype); + auto offset = Alignment(len * size_of_dtype, places_[0]); void *infer_next_address = reinterpret_cast<void *>( reinterpret_cast<uintptr_t>(cur_address) + offset); const void *next_address = g_tensor.at(k).second->data<void>(); @@ -228,18 +242,21 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel( const std::vector<std::pair<std::string, const LoDTensor *>> &grad_tensor, proto::VarType::Type *dtype, int64_t *numel) const { *numel = 0; + size_t size_of_dtype = 0; for (size_t i = 0; i < grad_tensor.size(); ++i) { - // Get element number - int64_t len = grad_tensor.at(i).second->numel(); - PADDLE_ENFORCE_GT(len, 0); - *numel += len; - // Get dtype auto ele_type = grad_tensor.at(i).second->type(); if (i == 0) { *dtype = ele_type; + size_of_dtype = framework::SizeOfType(ele_type); } PADDLE_ENFORCE_EQ(ele_type, *dtype); + + // Get element number + int64_t len = grad_tensor.at(i).second->numel(); + PADDLE_ENFORCE_GT(len, 0); + // Alignment(len) + *numel += Alignment(len * size_of_dtype, places_[0]) / size_of_dtype; } } diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 884089df38..611693fc7c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -20,7 +20,6 @@ #include <unordered_set> #include <utility> #include <vector> - #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph.h" @@ -34,6 +33,10 @@ namespace framework { class Scope; namespace details { +constexpr char kLossVarName[] = "loss_var_name"; +constexpr char kStrategy[] = "strategy"; +constexpr char kNRanks[] = "nranks"; + class MultiDevSSAGraphBuilderBase : public ir::Pass { protected: void ApplyImpl(ir::Graph *graph) const override; diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index ab5e099023..6e6ef074db 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -20,7 +20,6 @@ #include <unordered_set> #include <utility> #include <vector> - #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/var_handle.h" @@ -41,22 +40,25 @@ namespace details { // `std::vector<VarHandle*>` is the version of varaibles. typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>> GraphVars; -const char kGraphVars[] = "vars"; - -// aux variables to represent dependency. Useful to resolve data hazard. 
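The Alignment helper introduced above (and duplicated in alloc_continuous_space_op.cc) rounds every chunk of the fused buffer up to the allocator's minimum chunk size, 256 bytes on GPU and 4 KB on CPU, so slice addresses inside the fused tensor match what separate allocations would produce and the all-reduce sees one contiguous, correctly sized region. A small Python sketch of the resulting offsets (float32 elements assumed):

def aligned_size(size_in_bytes, is_gpu=True):
    # Same rule as Alignment(): round up to 256 B (GPU) or 4 KB (CPU).
    alignment = 1 << 8 if is_gpu else 1 << 12
    remaining = size_in_bytes % alignment
    return size_in_bytes if remaining == 0 else size_in_bytes + alignment - remaining

def fused_offsets(numels, size_of_dtype=4, is_gpu=True):
    # Element offset of each gradient inside the fused buffer, plus the
    # total (padded) numel of the buffer itself.
    offsets, offset = [], 0
    for numel in numels:
        offsets.append(offset)
        offset += aligned_size(numel * size_of_dtype, is_gpu) // size_of_dtype
    return offsets, offset

print(fused_offsets([1000, 300, 64]))  # ([0, 1024, 1344], 1408)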
-typedef std::unordered_set<VarHandleBase *> GraphDepVars; -const char kGraphDepVars[] = "dep_vars"; +constexpr char kGraphVars[] = "vars"; -constexpr char kNCCLCtxs[] = "nccl_ctxs"; - -constexpr char kLossVarName[] = "loss_var_name"; constexpr char kPlaces[] = "places"; constexpr char kLocalScopes[] = "local_scopes"; -constexpr char kStrategy[] = "strategy"; -constexpr char kNRanks[] = "nranks"; +constexpr char kNCCLCtxs[] = "nccl_ctxs"; + +// aux variables to represent dependency. Useful to resolve data hazard. +typedef std::unordered_set<VarHandleBase *> GraphDepVars; +constexpr char kGraphDepVars[] = "dep_vars"; typedef std::unordered_set<std::string> FusedVars; constexpr char kFusedVars[] = "fused_vars"; +constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@"; + +typedef std::string FusedOptType; +constexpr char kFusedOptType[] = "fused_opt_type"; + +typedef std::string FusedGrads; +constexpr char kFusedGrads[] = "fused_gradients"; typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads; constexpr char kParamsAndGrads[] = "params_grads"; @@ -65,8 +67,6 @@ typedef std::vector<std::vector<std::pair<std::string, std::string>>> GroupGradsAndParams; constexpr char kGroupGradsAndParams[] = "group_grads_params"; -constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@"; - } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index ef096c2b81..ea7f8c496a 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -70,7 +70,7 @@ Tensor& Tensor::ShareDataWith(const Tensor& src) { return *this; } -Tensor Tensor::Slice(int begin_idx, int end_idx) const { +Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const { check_memory_size(); PADDLE_ENFORCE_GE(begin_idx, 0, "The start row index must be greater than 0."); diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index a3c1063ce9..0fa76f943e 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -133,7 +133,7 @@ class Tensor { * @param[in] end_idx The index of the end row(exclusive) to slice. * The index number begins from 0. 
*/ - Tensor Slice(int begin_idx, int end_idx) const; + Tensor Slice(int64_t begin_idx, int64_t end_idx) const; platform::Place place() const { PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/operators/alloc_continuous_space_op.cc b/paddle/fluid/operators/alloc_continuous_space_op.cc index df0e9911cf..d4bdecff62 100644 --- a/paddle/fluid/operators/alloc_continuous_space_op.cc +++ b/paddle/fluid/operators/alloc_continuous_space_op.cc @@ -65,7 +65,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> { // Get numel and dtype size_t numel = 0; auto dtype = kDefaultDtype; - GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype); + GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype, + context.GetPlace()); // Alloc the continuous space auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput"); @@ -74,14 +75,18 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> { // Init the continuous space auto out_tensors = context.MultiOutput<framework::LoDTensor>("Output"); - int64_t offset = 0; + size_t offset = 0; + size_t size_of_dtype = framework::SizeOfType(dtype); if (context.Attr<bool>("copy_data")) { for (size_t i = 0; i < in_var_names.size(); ++i) { - int64_t len = out_tensors[i]->numel(); - auto sub_tensor = fused_tensor->Slice(offset, offset + len); - offset += len; - framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx, + size_t len = static_cast<size_t>(in_tensors[i]->numel()); + auto sub_tensor = fused_tensor->Slice( + static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)); + framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor); + + offset += + Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype; } } else if (context.Attr<bool>("set_constant")) { math::SetConstant<DeviceContext, T> set_constant; @@ -92,11 +97,13 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> { // Make the outputs point to the continuous space. offset = 0; for (size_t i = 0; i < out_tensors.size(); ++i) { - int64_t len = out_tensors[i]->numel(); + size_t len = static_cast<size_t>(out_tensors[i]->numel()); auto dim = out_tensors[i]->dims(); out_tensors[i] - ->ShareDataWith(fused_tensor->Slice(offset, offset + len)) + ->ShareDataWith(fused_tensor->Slice( + static_cast<int64_t>(offset), static_cast<int64_t>(offset + len))) .Resize(dim); + len = Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype; offset += len; VLOG(10) << "alloc_space_for_vars: output(" << out_var_names[i] << ") ,dim:(" << dim << ")" @@ -104,12 +111,28 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> { } } + private: + // Note(zcd): Addresses should be aligned, otherwise, the results may have + // diff. + size_t Alignment(size_t size, const platform::Place &place) const { + // Allow to allocate the minimum chunk size is 4 KB. + size_t alignment = 1 << 12; + if (platform::is_gpu_place(place)) { + // Allow to allocate the minimum chunk size is 256 B. + alignment = 1 << 8; + } + size_t remaining = size % alignment; + return remaining == 0 ? 
size : size + (alignment - remaining); + } + void GetMemSizeAndDtype( const std::vector<const framework::LoDTensor *> &lod_tensors, const std::vector<std::string> var_names, size_t *numel, - framework::proto::VarType::Type *dtype) const { + framework::proto::VarType::Type *dtype, + const platform::Place &place) const { PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size()); *numel = 0; + size_t size_of_dtype = 0; for (size_t i = 0; i < var_names.size(); ++i) { PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.", var_names[i]); @@ -119,6 +142,7 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> { PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.", var_names[i], kDefaultDtype); *dtype = p_dtype; + size_of_dtype = framework::SizeOfType(p_dtype); } PADDLE_ENFORCE_EQ(p_dtype, *dtype, "Input vars is not equal."); @@ -126,7 +150,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> { PADDLE_ENFORCE_GT(size, 0); VLOG(10) << "alloc_space_for_vars: input(" << var_names[i] << ") ,dim:(" << lod_tensors[i]->dims() << ")"; - *numel += size; + *numel += Alignment(static_cast<size_t>(size) * size_of_dtype, place) / + size_of_dtype; } } }; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f62531c7bb..fa978f1c99 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1282,6 +1282,15 @@ All parameter, weight, gradient are variables in Paddle. it will save GPU memory and may make the execution faster. This options is only available in GPU devices. Default False)DOC") + .def_property("fuse_all_optimizer_ops", + [](const BuildStrategy &self) { + return self.fuse_all_optimizer_ops_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), + "BuildStrategy is finlaized."); + self.fuse_all_optimizer_ops_ = b; + }) .def_property( "sync_batch_norm", [](const BuildStrategy &self) { return self.sync_batch_norm_; }, diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 61fd9af127..18ed02a722 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -43,6 +43,7 @@ class TestParallelExecutorBase(unittest.TestCase): use_ir_memory_optimize=True, enable_inplace=True, fuse_elewise_add_act_ops=False, + fuse_all_optimizer_ops=False, fuse_all_reduce_ops=False, fuse_relu_depthwise_conv=False, optimizer=fluid.optimizer.Adam, @@ -81,6 +82,7 @@ class TestParallelExecutorBase(unittest.TestCase): build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize + build_strategy.fuse_all_optimizer_ops = fuse_all_optimizer_ops build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops # python memory optimization is conflict with inplace pass. # Use ir graph memory optimization after inplace pass is the correct way. 
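On the Python side the new option is just another BuildStrategy switch, wired through parallel_executor_test_base.py above and exposed from pybind. A sketch of how a user program would turn it on, modeled on the simple_fc_net unit test below (the network layout, sizes, and the CPU_NUM value are arbitrary choices for this example, not required by the feature):

import os
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import compiler

os.environ['CPU_NUM'] = '1'

img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
prediction = fluid.layers.fc(img, size=10, act='softmax')
loss = fluid.layers.mean(fluid.layers.cross_entropy(input=prediction, label=label))
fluid.optimizer.Adam().minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

build_strategy = fluid.BuildStrategy()
build_strategy.fuse_all_optimizer_ops = True  # the flag added by this patch

train_cp = compiler.CompiledProgram(fluid.default_main_program()) \
    .with_data_parallel(loss_name=loss.name, build_strategy=build_strategy)
exe.run(train_cp,
        feed={'image': np.random.random([32, 784]).astype('float32'),
              'label': np.ones([32, 1], dtype='int64')},
        fetch_list=[loss.name])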
diff --git a/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py b/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py index 9d5fe114ba..29eb0166b7 100644 --- a/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py +++ b/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py @@ -16,8 +16,10 @@ from __future__ import print_function import unittest import numpy as np - from op_test import OpTest +from paddle.fluid import core + +alignment = 256 class TestAllocContinuousSpace(OpTest): @@ -29,11 +31,11 @@ class TestAllocContinuousSpace(OpTest): self.constant = attrs["constant"] self.set_constant = attrs["set_constant"] self.Inputs = self.init_input() - self.FusedOutput = self.init_output(self.Inputs, self.set_constant, - self.constant) + self.Outputs, self.FusedOutput = self.init_output( + self.Inputs, self.set_constant, self.constant) self.inputs = {'Input': self.Inputs} self.attrs = attrs - self.outputs = {'Output': self.Inputs, 'FusedOutput': self.FusedOutput} + self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput} def init_dtype(self): self.dtype = np.float32 @@ -52,14 +54,31 @@ class TestAllocContinuousSpace(OpTest): return {"copy_data": True, "set_constant": False, "constant": 0.0} def init_output(self, input_list, set_constant, constant): - inputs = [input[1].flatten() for input in input_list] - output = np.concatenate(inputs) + inputs = [] + outputs = input_list + + for input in input_list: + length = len(input[1].flatten()) + aligned_len = (length + alignment) / alignment * alignment + out = np.zeros(int(aligned_len)) + out[0:length] = input[1].flatten() + inputs.append(out) + + alloc_continuous_space_var = np.concatenate([input for input in inputs]) if set_constant: - output = np.ones((len(output))) * constant - return output + alloc_continuous_space_var = np.ones( + (len(alloc_continuous_space_var))) * constant + outputs = [(out[0], + np.ones(out[1].shape).astype(self.dtype) * constant) + for out in outputs] + return outputs, alloc_continuous_space_var def test_check_output(self): - self.check_output() + if core.is_compiled_with_cuda(): + self.check_output_with_place( + place=core.CUDAPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5) class TestAllocContinuousSpace2(TestAllocContinuousSpace): @@ -67,7 +86,11 @@ class TestAllocContinuousSpace2(TestAllocContinuousSpace): return {"copy_data": False, "set_constant": True, "constant": 0.5} def test_check_output(self): - self.check_output(no_check_set=["Output"]) + if core.is_compiled_with_cuda(): + self.check_output_with_place( + place=core.CUDAPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py new file mode 100644 index 0000000000..93e67deaf3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py @@ -0,0 +1,135 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from parallel_executor_test_base import TestParallelExecutorBase +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import paddle +import paddle.dataset.mnist as mnist +import unittest +import os + + +def simple_fc_net(use_feed): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + hidden = img + for _ in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='relu', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def fc_with_batchnorm(use_feed): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + hidden = img + for _ in range(2): + hidden = fluid.layers.fc( + hidden, + size=200, + act='relu', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + hidden = fluid.layers.batch_norm(input=hidden) + + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestFuseAdamOps(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + + def _init_data(self, random=True): + np.random.seed(5) + if random: + img = np.random.random(size=[32, 784]).astype(np.float32) + else: + img = np.ones(shape=[32, 784], dtype='float32') + label = np.ones(shape=[32, 1], dtype='int64') + return img, label + + def _compare_fused_optimizer_ops(self, + model, + use_cuda, + random_data=True, + optimizer=fluid.optimizer.Adam): + if use_cuda and not core.is_compiled_with_cuda(): + return + img, label = self._init_data(random_data) + not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + fuse_all_optimizer_ops=False, + memory_opt=False, # avoid the gradient's name changed in Python side. + optimizer=optimizer) + fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + fuse_all_optimizer_ops=True, + memory_opt=False, # avoid the gradient's name changed in Python side. 
+ optimizer=optimizer) + + for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + + def test_simple_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops(simple_fc_net, True) + self._compare_fused_optimizer_ops(simple_fc_net, False) + + def test_batchnorm_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops(fc_with_batchnorm, True) + # self._compare_fused_optimizer_ops(fc_with_batchnorm, False) + + +class TestFuseSGDOps(TestFuseAdamOps): + def sgd_optimizer(self, learning_rate=1e-4): + return fluid.optimizer.SGD(learning_rate=learning_rate) + + def test_simple_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops( + simple_fc_net, True, optimizer=self.sgd_optimizer) + self._compare_fused_optimizer_ops( + simple_fc_net, False, optimizer=self.sgd_optimizer) + + def test_batchnorm_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops( + fc_with_batchnorm, True, optimizer=self.sgd_optimizer) + self._compare_fused_optimizer_ops( + fc_with_batchnorm, False, optimizer=self.sgd_optimizer) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index ba63213a41..6671a2def3 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -61,6 +61,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, param_attr=fluid.ParamAttr( name=embedding_name, trainable=False)) for x in word_input ] + # TODO(zcd): if the parameter is not trainable, the + # parameter's gradient should not generated. 
+ for emb_layer in emb_layers: + emb_layer.stop_gradient = True + emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) @@ -113,60 +118,62 @@ class TestCRFModel(unittest.TestCase): os.environ['CPU_NUM'] = str(4) main = fluid.Program() startup = fluid.Program() - with fluid.program_guard(main, startup): - word = fluid.layers.data( - name='word_data', shape=[1], dtype='int64', lod_level=1) - predicate = fluid.layers.data( - name='verb_data', shape=[1], dtype='int64', lod_level=1) - ctx_n2 = fluid.layers.data( - name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) - ctx_n1 = fluid.layers.data( - name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) - ctx_0 = fluid.layers.data( - name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) - ctx_p1 = fluid.layers.data( - name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) - ctx_p2 = fluid.layers.data( - name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) - mark = fluid.layers.data( - name='mark_data', shape=[1], dtype='int64', lod_level=1) - - feature_out = db_lstm(**locals()) - target = fluid.layers.data( - name='target', shape=[1], dtype='int64', lod_level=1) - crf_cost = fluid.layers.linear_chain_crf( - input=feature_out, - label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=1e-1)) - avg_cost = fluid.layers.mean(crf_cost) - - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( - learning_rate=0.01, - decay_steps=100000, - decay_rate=0.5, - staircase=True)) - sgd_optimizer.minimize(avg_cost) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), - batch_size=16) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup) - - train_cp = compiler.CompiledProgram(main).with_data_parallel( - loss_name=avg_cost.name, build_strategy=build_strategy) - - feeder = fluid.DataFeeder( - feed_list=[ - word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, - mark, target - ], - place=fluid.CPUPlace()) + scope = fluid.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(main, startup): + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + ctx_n2 = fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) + ctx_n1 = fluid.layers.data( + name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) + ctx_0 = fluid.layers.data( + name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) + ctx_p1 = fluid.layers.data( + name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) + ctx_p2 = fluid.layers.data( + name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) + mark = fluid.layers.data( + name='mark_data', shape=[1], dtype='int64', lod_level=1) + + feature_out = db_lstm(**locals()) + target = fluid.layers.data( + name='target', shape=[1], dtype='int64', lod_level=1) + crf_cost = fluid.layers.linear_chain_crf( + input=feature_out, + label=target, + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=1e-1)) + avg_cost = fluid.layers.mean(crf_cost) + + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=0.01, + decay_steps=100000, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), + batch_size=16) + + place = 
fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup) + + train_cp = compiler.CompiledProgram(main).with_data_parallel( + loss_name=avg_cost.name, build_strategy=build_strategy) + + feeder = fluid.DataFeeder( + feed_list=[ + word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, + mark, target + ], + place=fluid.CPUPlace()) data = train_data() for i in range(10): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py index 17f8f5a0b4..d0eca7d6df 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py @@ -41,14 +41,15 @@ class TestBase(unittest.TestCase): fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()) exe.run(startup_prog) - for _ in six.moves.xrange(iter): - exe_strategy = fluid.ExecutionStrategy() - exe_strategy._dry_run = True - exe_strategy.use_experimental_executor = use_experimental_executor - train_cp = compiler.CompiledProgram(main_prog).with_data_parallel( - loss_name=loss.name, exec_strategy=exe_strategy) - for _ in six.moves.xrange(iter_per_pe): - exe.run(train_cp) + exe_strategy = fluid.ExecutionStrategy() + exe_strategy._dry_run = True + exe_strategy.use_experimental_executor = use_experimental_executor + train_cp = compiler.CompiledProgram( + main_prog).with_data_parallel( + loss_name=loss.name, exec_strategy=exe_strategy) + for _ in six.moves.xrange(iter): + for _ in six.moves.xrange(iter_per_pe): + exe.run(train_cp) class TestMNISTDryRun(TestBase):