Polish backward.py to prune more ops (#22246)

* polish backward prune, test=develop * fix control flow op bug, test=develop * add some unittests, test=develop * fix unittest args, test=develop * follow huihuang's comments, test=develop
6 years ago · 039bb5050c
parent 7b2c0993f0
commit 039bb5050c
3 changed files with 268 additions and 0 deletions
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@ -933,6 +933,31 @@ def _append_backward_ops_(block,
                cb(block=target_block, context=grad_to_var)
 def _is_grad_var_(var_name):
    return core.grad_var_suffix() in var_name
 # Find the op who holds the sub_block as its "sub_block" attr
 def _find_parent_op_(sub_block):
    sub_block_id = sub_block.idx
    if sub_block_id == 0:
        return None
    program = sub_block.program
    for block_id in six.moves.range(program.num_blocks):
        block_desc = program.block(block_id).desc
        for op_idx in six.moves.range(block_desc.op_size()):
            op = block_desc.op(op_idx)
            if op.has_attr("sub_block") and op._block_attr_id(
                    "sub_block") == sub_block_id:
                return op
    # NOTE(paddle-dev): When optimizer is added in conditional block, 
    # sub_block may not be found.
    return None
 def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
    """
    Create new variables required by backward pass.
@ -948,11 +973,73 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
            key(str): forward variable name
            val(tuple): a tuple of (str, Block), str is the corresponding grad name, Block is the block containing grad variable
    """
    ops_to_remove = []
    '''
    NOTE(paddle-dev): while_grad op may hold some inputs which are not found 
    in the parent/forward block, and they are also the outputs of while_grad 
    op. These kinds of inputs are the recursive outputs inside while_grad op. 
    They should be considered as "already created" when scanning the inner 
    ops of while_grad ops.  
    '''
    parent_op = _find_parent_op_(block)
    parent_op_vars = []
    if parent_op is not None:
        input_args = parent_op.input_arg_names()
        output_args = parent_op.output_arg_names()
        for in_arg in input_args:
            if in_arg in output_args:
                parent_op_vars.append(in_arg)
    for op_idx in range(start_op_idx, block.desc.op_size()):
        op_desc = block.desc.op(op_idx)
        if op_desc.has_attr("sub_block"):
            sub_block = block.program.block(op_desc._block_attr_id("sub_block"))
            _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
        grad_var_ins = [
            var for var in op_desc.input_arg_names() if _is_grad_var_(var)
        ]
        grad_var_outs = [
            var for var in op_desc.output_arg_names() if _is_grad_var_(var)
        ]
        inputs = [
            var for var in op_desc.input_arg_names()
            if var != core.empty_var_name()
        ]
        outputs = [
            var for var in op_desc.output_arg_names()
            if var != core.empty_var_name()
        ]
        # If the outputs of grad op is empty, just remove it 
        if not outputs:
            ops_to_remove.append(op_idx)
            continue
        else:
            '''
            If the output is not empty and there is any grad input, find 
            whether there is any existing input. If not, just remove it.
            '''
            if grad_var_ins:
                existing_grad_var_ins = [
                    var for var in grad_var_ins
                    if block.desc.has_var_recursive(cpt.to_bytes(var)) or var in
                    parent_op_vars
                ]
                if not existing_grad_var_ins:
                    '''
                    FIXME(paddle-dev, zengjinle): rnn_memory_helper_grad is used
                    in recurrent op. The input of this op does not even exist in 
                    the program! Therefore, any dependency analysis would not 
                    work to this op! If I do not add the following code, this op
                    would be pruned, and the calculation result would be wrong. 
                    Maybe we should re-design this op later...  
                    '''
                    if op_desc.type() not in ['rnn_memory_helper_grad']:
                        ops_to_remove.append(op_idx)
                    continue
        new_vars = set()
        # create new gradient variables
        for grad_var_name in op_desc.output_arg_names():
@ -972,6 +1059,9 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
            if arg in new_vars:
                _infer_var_data_type_shape_(arg, block)
    for op_idx in reversed(ops_to_remove):
        block.desc._remove_op(op_idx, op_idx + 1)
 def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
    var_map = copy.copy(target_grad_map)
--- a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py
@ -0,0 +1,91 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
 import numpy as np
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import unittest
 def build_and_run_program(place, batch_size, beam_size, stop_gradient=False):
    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1
    np.random.seed(2)
    x = layers.assign(
        np.random.rand(batch_size, beam_size, 32).astype("float32"))
    indices = fluid.data(shape=[None, beam_size], dtype="int64", name="indices")
    step_idx = layers.fill_constant(
        shape=[1], dtype="int64", value=0, force_cpu=True)
    max_len = layers.fill_constant(
        shape=[1], dtype="int64", value=10, force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)
    scores = layers.array_write(x, step_idx)
    with while_op.block():
        bs = layers.cast(layers.shape(x)[0], "int64")
        for _ in range(20):
            bs = layers.cast(bs, 'int64')
        bs.stop_gradient = stop_gradient
        batch_pos = layers.expand(
            layers.unsqueeze(
                layers.range(
                    0, bs, 1, dtype=bs.dtype), [1]), [1, beam_size])
        topk_coordinates = layers.stack([batch_pos, indices], axis=2)
        topk_coordinates.stop_gradient = stop_gradient
        score = layers.gather_nd(x, topk_coordinates)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.array_write(score, i=step_idx, array=scores)
        length_cond = layers.less_than(x=step_idx, y=max_len)
        layers.assign(length_cond, cond)
    out = layers.tensor_array_to_tensor(scores, axis=0, use_stack=True)[0]
    loss = layers.reduce_mean(out)
    opt = fluid.optimizer.Adam(0.01)
    opt.minimize(loss)
    exe = fluid.Executor(place)
    data = np.random.random_integers(
        low=0, high=beam_size - 1, size=(batch_size, beam_size)).astype("int64")
    loss_val, = exe.run(feed={"indices": data}, fetch_list=[loss])
    return loss_val
 class TestDynRNNStopGradient(unittest.TestCase):
    def setUp(self):
        self.batch_size = 20
        self.beam_size = 64
    def run_main(self, place):
        with fluid.program_guard(fluid.Program(), fluid.Program()):
            with fluid.scope_guard(fluid.Scope()):
                value1 = build_and_run_program(place, self.batch_size,
                                               self.beam_size, False)
                value2 = build_and_run_program(place, self.batch_size,
                                               self.beam_size, True)
                self.assertTrue(np.array_equal(value1, value2))
    def test_check_main(self):
        places = [fluid.CPUPlace()]
        if fluid.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))
        for p in places:
            self.run_main(p)
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py
@ -0,0 +1,87 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
 import paddle.fluid as fluid
 import six
 import unittest
 class TestEmbeddingIdStopGradientBase(unittest.TestCase):
    def setUp(self):
        self.reshape_times = 1
        self.iteration = 10
    def get_places(self):
        places = [fluid.CPUPlace()]
        if fluid.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))
        return places
    def test_check_grad(self):
        for p in self.get_places():
            grad_value1 = self.run_program(p, stop_gradient=False)
            grad_value2 = self.run_program(p, stop_gradient=True)
            self.assertTrue(np.array_equal(grad_value1, grad_value2))
    def run_program(self, place, stop_gradient=False):
        startup_program = fluid.Program()
        main_program = fluid.Program()
        np.random.seed(1)
        startup_program.random_seed = 1
        main_program.random_seed = 1
        scope = fluid.Scope()
        with fluid.program_guard(main_program, startup_program):
            with fluid.scope_guard(scope):
                x_1 = fluid.data(name='x1', shape=[4, 1], dtype='int64')
                x_2 = fluid.data(name='x2', shape=[4, 1], dtype='int64')
                x = fluid.layers.concat([x_1, x_2], axis=-1)
                for _ in six.moves.range(self.reshape_times):
                    x = fluid.layers.reshape(x, [-1, 1])
                x.stop_gradient = stop_gradient
                emb = fluid.embedding(x, size=[10, 32], dtype='float32')
                avg_cost = fluid.layers.mean(emb, name='mean_loss')
                optim = fluid.optimizer.SGD(learning_rate=0.001)
                optim.minimize(avg_cost)
                exe = fluid.Executor(place)
                exe.run(startup_program)
                x1_data = np.random.randint(0, 9, x_1.shape).astype('int64')
                x2_data = np.random.randint(0, 9, x_2.shape).astype('int64')
                fetch_val = None
                for _ in six.moves.range(self.iteration):
                    fetch_val = exe.run(
                        feed={x_1.name: x1_data,
                              x_2.name: x2_data},
                        fetch_list=[emb])[0]
                return fetch_val
 class TestEmbeddingIdStopGradient2(TestEmbeddingIdStopGradientBase):
    def setUp(self):
        self.reshape_times = 100
        self.iteration = 10
 if __name__ == '__main__':
    unittest.main()