make transpiler test reliable (#11848)

* make transpiler test reliable

* add more

* follow comments
analysis/code-clean
Wu Yi 7 years ago committed by GitHub
parent 58560622bc
commit adfaf9a665

File diff suppressed because it is too large.

@@ -1,80 +0,0 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.transpiler.distribute_transpiler import delete_ops

from transpiler_test import TranspilerTest


class TestSimpleDistTranspiler(TranspilerTest):
    def setUp(self):
        self.current_pserver_ep = "127.0.0.1:6175"

    def test_simple_transpiler(self):
        np.random.seed(1)

        trainer = self.get_trainer()
        pserver, startup = self.get_pserver(self.current_pserver_ep)
        self.assertEqual([op.type for op in trainer.global_block().ops],
                         self.get_expect_trainer_ops())

        self.assertEqual(len(pserver.blocks), 2)
        # block0: listen_and_serv
        self.assertEqual([op.type for op in pserver.blocks[0].ops],
                         ["listen_and_serv"])
        # block1: optimize pass
        self.assertEqual([op.type for op in pserver.blocks[1].ops],
                         ["sum", "scale", "sgd"])

        # confirm startup program
        self.assertEqual([op.type for op in startup.global_block().ops],
                         ["fill_constant", "uniform_random", "uniform_random"])

        # the variable #fc_w will NOT be split
        fc_w_var = startup.global_block().var("fc_w@GRAD")
        self.assertEqual(fc_w_var.shape, (1000, 1000))

        fc_w_var = startup.global_block().var("fc_w@GRAD.trainer_0")
        self.assertEqual(fc_w_var.shape, (1000, 1000))

    def get_expect_trainer_ops(self):
        trainer = fluid.Program()

        with fluid.program_guard(trainer):
            optimize_ops, params_grads = self.net_conf()

        delete_ops(trainer.global_block(), optimize_ops)
        ops = [op.type for op in trainer.global_block().ops] + [
            "send", "send_barrier", "recv", "recv", "fetch_barrier"
        ]
        ops.insert(ops.index("elementwise_add_grad") + 1, "send")
        return ops

    def _transpiler_instance(self):
        main = self.get_main_program()
        t = fluid.DistributeTranspiler()
        t.transpile(
            self.trainer_id,
            program=main,
            pservers=self.pserver_eps,
            trainers=self.trainers,
            slice_var_up=False)
        return t


if __name__ == "__main__":
    unittest.main()

@@ -1,73 +0,0 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers


class TranspilerTest(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        self.trainer_id = 0
        self.trainers = 2
        self.pservers = 2
        self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"

    def net_conf(self):
        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')

        y_predict = fluid.layers.fc(input=x,
                                    size=1000,
                                    act=None,
                                    param_attr=fluid.ParamAttr(name='fc_w'))

        y = fluid.layers.data(name='y', shape=[1], dtype='float32')

        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
        avg_cost = fluid.layers.mean(cost)
        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)

        optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
        return optimize_ops, params_grads

    def get_main_program(self):
        main = fluid.Program()

        with fluid.program_guard(main):
            self.net_conf()

        return main

    def get_trainer(self):
        return self._transpiler_instance().get_trainer_program()

    def get_pserver(self, ep):
        t = self._transpiler_instance()
        pserver = t.get_pserver_program(ep)
        startup = t.get_startup_program(ep, pserver)
        return pserver, startup

    def _transpiler_instance(self):
        main = self.get_main_program()
        t = fluid.DistributeTranspiler()
        t.transpile(
            self.trainer_id,
            program=main,
            pservers=self.pserver_eps,
            trainers=self.trainers)
        return t
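
For orientation, the helpers above wrap the usual DistributeTranspiler sequence: build a program, transpile it, then fetch the trainer, pserver and startup programs. A minimal sketch of that flow outside the test harness, mirroring net_conf and the endpoints from setUpClass (variable names here are illustrative), might look like:

import paddle.fluid as fluid

# Sketch only: the network mirrors net_conf above.
main = fluid.Program()
with fluid.program_guard(main):
    x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1000, act=None,
                                param_attr=fluid.ParamAttr(name='fc_w'))
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(cost)
    fluid.optimizer.SGD(learning_rate=0.1).minimize(avg_cost)

# Same transpile call as _transpiler_instance above, then fetch the programs.
t = fluid.DistributeTranspiler()
t.transpile(0, program=main,
            pservers="127.0.0.1:6174,127.0.0.1:6175", trainers=2)
trainer_prog = t.get_trainer_program()
pserver_prog = t.get_pserver_program("127.0.0.1:6175")
startup_prog = t.get_startup_program("127.0.0.1:6175", pserver_prog)

This is the same sequence that _transpiler_instance, get_trainer and get_pserver perform above.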

@@ -455,6 +455,8 @@ class DistributeTranspiler(object):
                 __append_optimize_op__(op, per_opt_block, grad_to_block_id,
                                        merged_var, lr_ops)

+        # dedup grad to ids list
+        grad_to_block_id = list(set(grad_to_block_id))
         # append global ops
         if global_ops:
             opt_state_block = pserver_program.create_block(
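
The two added lines above deduplicate grad_to_block_id before the global ops are appended. A minimal sketch of that step, assuming the entries are "grad_name:block_id" strings (the exact entry format is an assumption here), is:

# Minimal sketch of the dedup above; the entry format is assumed for illustration.
grad_to_block_id = ["fc_w@GRAD:1", "fc_b@GRAD:2", "fc_w@GRAD:1"]
grad_to_block_id = list(set(grad_to_block_id))  # duplicates collapse; order is not preserved
print(sorted(grad_to_block_id))  # ['fc_b@GRAD:2', 'fc_w@GRAD:1']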
@@ -960,8 +962,6 @@ class DistributeTranspiler(object):
             if not block_map.has_key(varname):
                 block_map[varname] = []
             block_map[varname].append((long(offset), long(size)))
-        # Do not remove this important debug message:
-        print("block map: %s" % block_map)

         for varname, splited in block_map.iteritems():
             orig_var = program.global_block().var(varname)
@@ -1401,6 +1401,16 @@ class DistributeTranspiler(object):
                 break
         return lr_ops

+    def _is_opt_role_op(self, op):
+        # NOTE: depend on oprole to find out whether this op is for
+        # optimize
+        op_maker = core.op_proto_and_checker_maker
+        optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
+        if op_maker.kOpRoleAttrName() in op.attrs and \
+            int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role):
+            return True
+        return False
+
     def _get_optimize_pass(self):
         """
         Get optimizer operators, paramters and gradients from origin_program
@@ -1413,10 +1423,7 @@ class DistributeTranspiler(object):
         params_grads = []
         origin_var_dict = self.origin_program.global_block().vars
         for op in block.ops:
-            # NOTE(Yancey1989): we can not use op role to distinguish an optimizer op
-            # or not, because all ops in optimizer sub-graph would
-            # sign the optimizer op role
-            if self._is_optimizer_op(op):
+            if self._is_opt_role_op(op):
                 opt_ops.append(op)
                 # HACK(wuyi): if we find grad vars from input of optimize
                 # ops, we may get the output of clip op. Use syntax "@GRAD"
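
With this change, _get_optimize_pass identifies optimizer ops through the op-role attribute via the new _is_opt_role_op helper. A standalone sketch of that check, using a stand-in op object and placeholder role constants rather than the real core.op_proto_and_checker_maker types, is:

# Stand-in sketch only: OP_ROLE_ATTR, OPTIMIZE_ROLE and FakeOp are placeholders
# for core.op_proto_and_checker_maker and a real operator.
OP_ROLE_ATTR = "op_role"
OPTIMIZE_ROLE = 2  # placeholder enum value

class FakeOp(object):
    def __init__(self, attrs):
        self.attrs = attrs

def is_opt_role_op(op):
    # An op is treated as an optimizer op when it carries the op-role
    # attribute and that attribute equals the Optimize role.
    return (OP_ROLE_ATTR in op.attrs and
            int(op.attrs[OP_ROLE_ATTR]) == int(OPTIMIZE_ROLE))

print(is_opt_role_op(FakeOp({OP_ROLE_ATTR: 2})))  # True
print(is_opt_role_op(FakeOp({OP_ROLE_ATTR: 0})))  # False
print(is_opt_role_op(FakeOp({})))                 # False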
