Merge pull request #15926 from dzhwinter/test/add_ir_mem_opt_tests

add ir memory optimize test base
6 years ago · 15de2dff00
parent e00c7a2e26 48d9fd08e5
commit 15de2dff00
4 changed files with 207 additions and 3 deletions
--- a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
+++ b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
@ -0,0 +1,150 @@
 # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import six
 import unittest
 import time
 import math
 import multiprocessing
 import numpy as np
 import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler
 # Open eager delete mode: these flags are exported into the process
 # environment at import time, before any test in this module runs.
 # NOTE(review): presumably a 0.0 GB threshold means tensors are released as
 # soon as they are no longer needed -- confirm against the flag documentation.
 os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
 os.environ['FLAGS_fast_eager_deletion_mode'] = 'true'
 # CPU_NUM is read back in check_network_convergence to scale the batch size
 # when running on CPU.
 os.environ['CPU_NUM'] = '2'
 class BuildIrMemOptBase(unittest.TestCase):
    """Test harness that trains a network on the IMDB reader for a few steps.

    Subclasses (or callers) pass a network builder to
    ``check_network_convergence`` and compare the losses it returns.
    """

    def check_network_convergence(self,
                                  network,
                                  use_cuda=True,
                                  memory_opt=True,
                                  use_ir_memory_optimize=True,
                                  enable_inplace=True,
                                  iter=5):
        """Build ``network``, train it for up to ``iter`` steps, return losses.

        Args:
            network: callable invoked as ``network(data, label, dict_dim)``;
                its return value is minimized with Adam and fetched by name.
            use_cuda: run on ``CUDAPlace(0)`` when True, else on ``CPUPlace``.
            memory_opt: when True, apply ``fluid.memory_optimize`` to the
                default main program before execution.
            use_ir_memory_optimize: accepted for interface compatibility.
                NOTE(review): currently not read anywhere in this method.
            enable_inplace: accepted for interface compatibility.
                NOTE(review): currently not read anywhere in this method.
            iter: number of training steps to run. If the instance has a
                non-None ``iter`` attribute, that attribute takes precedence.

        Returns:
            ``None`` when the run is skipped (CUDA requested but unavailable,
            or running on Windows); otherwise a ``(first_loss, last_loss)``
            tuple holding the fetched loss of the first and the last executed
            step.
        """
        # Imported locally: the NaN check below calls sys.exit, and the
        # module-level import block of this file does not import sys.
        import sys

        if use_cuda and not core.is_compiled_with_cuda():
            print('Skip use_cuda=True because Paddle is not compiled with cuda')
            return

        if os.name == 'nt':
            print(
                'Skip use_parallel_executor=True because Paddle comes without parallel support on windows'
            )
            return

        # Fixed seeds so that separate runs can be compared against each other.
        fluid.default_startup_program().random_seed = 100
        fluid.default_main_program().random_seed = 100

        # Scale the batch by the number of devices the data is split across.
        batch_size = 32
        batch_size *= fluid.core.get_cuda_device_count() if use_cuda else int(
            os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

        # build network
        word_dict = paddle.dataset.imdb.word_dict()
        train_reader = paddle.batch(
            paddle.dataset.imdb.train(word_dict), batch_size=batch_size)

        data = fluid.layers.data(
            name="words", shape=[1], dtype="int64", lod_level=1)
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")

        cost = network(data, label, len(word_dict))
        optimizer = fluid.optimizer.Adam(learning_rate=0.001)
        optimizer.minimize(cost)
        if memory_opt:
            fluid.memory_optimize(fluid.default_main_program())

        # execution
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
        reader = feeder.decorate_reader(train_reader, multi_devices=True)
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())

        train_cp = compiler.CompiledProgram(fluid.default_main_program())
        train_cp = train_cp.with_data_parallel(loss_name=cost.name)
        fetch_list = [cost.name]

        # An ``iter`` attribute set on the test instance overrides the argument.
        custom_iter = getattr(self, "iter", None)
        if custom_iter is not None:
            iter = custom_iter

        begin = time.time()
        first_loss, last_loss = None, None
        step_id = 0
        # The loop variable is deliberately not called ``data`` so it does not
        # shadow the input layer variable defined above.
        for feed_data in reader():
            ret = exe.run(train_cp, feed=feed_data, fetch_list=fetch_list)
            print(ret)
            step_id += 1
            if step_id == 1:
                first_loss = ret[0]
            # Track the most recent loss on every step so that last_loss is
            # still valid if the reader ends before ``iter`` steps.
            last_loss = ret[0]
            if step_id == iter:
                break
        end = time.time()

        if step_id == 0:
            self.fail("the reader produced no batches, nothing was trained.")

        # Report throughput from the number of steps actually executed.
        print("%.4f Instance per second" % (
            (batch_size * step_id) / (end - begin)))

        print(first_loss, last_loss)
        avg_last_loss_val = np.array(last_loss).mean()
        avg_first_loss_val = np.array(first_loss).mean()
        if math.isnan(float(avg_last_loss_val)) or math.isnan(
                float(avg_first_loss_val)):
            sys.exit("got NaN loss, training failed.")

        return first_loss, last_loss
 class TestIrMemOptBase(BuildIrMemOptBase):
    """Compare losses of a network trained with and without ``memory_opt``.

    Subclasses override ``setUp`` and assign ``self.network`` to the network
    builder under test; with ``self.network`` left as ``None`` the test is a
    no-op.
    """

    def setUp(self):
        # No network by default, which makes test_network return immediately.
        self.network = None

    def test_network(self):
        """Train twice and check that first/last losses agree within 1e-2.

        The first run (``memory_opt=True``) provides the baseline; the second
        run (``memory_opt=False``) is compared against it. The test returns
        without asserting when no network is set, when Paddle is not compiled
        with CUDA, or when ``check_network_convergence`` skips the run.
        """
        if self.network is None or not core.is_compiled_with_cuda():
            return

        baseline_first_loss, baseline_last_loss = None, None
        for use_cuda in [True]:
            for use_python_mem_opt in [True, False]:
                print(
                    'network: {}, use_cuda: {}, use_python_mem_opt: {}, use_ir_mem_opt : {}'.
                    format(self.network.__name__, use_cuda, use_python_mem_opt,
                           not use_python_mem_opt))
                # Fresh programs and a fresh scope for every configuration so
                # the runs do not share state.
                with fluid.program_guard(fluid.Program(), fluid.Program()):
                    with fluid.scope_guard(core.Scope()):
                        losses = self.check_network_convergence(
                            self.network,
                            use_cuda=use_cuda,
                            memory_opt=use_python_mem_opt)
                        # check_network_convergence returns None on its skip
                        # paths; unpacking that would raise a TypeError.
                        if losses is None:
                            return

                        if use_cuda is True and use_python_mem_opt is True:
                            baseline_first_loss, baseline_last_loss = losses
                        else:
                            cur_first_loss, cur_last_loss = losses
                            # assertAlmostEqual replaces the deprecated
                            # assertAlmostEquals alias.
                            self.assertAlmostEqual(
                                np.mean(baseline_last_loss),
                                np.mean(cur_last_loss),
                                delta=1e-2)
                            self.assertAlmostEqual(
                                np.mean(baseline_first_loss),
                                np.mean(cur_first_loss),
                                delta=1e-2)
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@ -56,6 +56,8 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
        train_reader, multi_devices=use_parallel_executor)
    exe = fluid.Executor(place)
    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1
    exe.run(fluid.default_startup_program())
    train_cp = compiler.CompiledProgram(fluid.default_main_program())
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py
@ -0,0 +1,55 @@
 # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # The NLP model is a stack of ops that operate on LoD tensors. It is a classic test case for optimization passes.
 from __future__ import print_function
 import paddle.fluid as fluid
 import unittest
 from ir_memory_optimize_net_base import TestIrMemOptBase
 def lstm_net(data,
             label,
             dict_dim,
             emb_dim=128,
             hid_dim=128,
             hid_dim2=96,
             class_dim=2,
             emb_lr=30.0):
    """Assemble an embedding -> fc -> dynamic LSTM -> fc classifier.

    Args:
        data: input variable fed to the embedding layer.
        label: label variable passed to the cross-entropy layer.
        dict_dim: first dimension of the embedding table.
        emb_dim: second dimension of the embedding table.
        hid_dim: hidden size; the fc and LSTM layers use ``hid_dim * 4``.
        hid_dim2: size of the tanh fc layer that follows the pooled LSTM output.
        class_dim: size of the final softmax fc layer.
        emb_lr: learning rate set on the embedding parameter attribute.

    Returns:
        The mean of the cross-entropy cost between the prediction and ``label``.
    """
    # Both the projection layer and the LSTM are sized to four times hid_dim.
    gate_width = hid_dim * 4

    embedding_attr = fluid.ParamAttr(learning_rate=emb_lr)
    embedded = fluid.layers.embedding(
        input=data, size=[dict_dim, emb_dim], param_attr=embedding_attr)

    projected = fluid.layers.fc(input=embedded, size=gate_width)
    # Only the hidden output is used; the second output is discarded.
    hidden_seq, _ = fluid.layers.dynamic_lstm(
        input=projected, size=gate_width, is_reverse=False)

    pooled = fluid.layers.sequence_pool(input=hidden_seq, pool_type='max')
    pooled_tanh = fluid.layers.tanh(pooled)

    features = fluid.layers.fc(input=pooled_tanh, size=hid_dim2, act='tanh')
    probabilities = fluid.layers.fc(
        input=features, size=class_dim, act='softmax')

    per_sample_cost = fluid.layers.cross_entropy(
        input=probabilities, label=label)
    return fluid.layers.mean(x=per_sample_cost)
 class TestIrMemOptRNN(TestIrMemOptBase):
    # Runs the inherited test_network against the lstm_net builder.
    def setUp(self):
        # Select the network that the base class test trains and compares.
        self.network = lstm_net
 # Run the unittest runner when this file is executed as a script.
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
@ -28,9 +28,6 @@ os.environ[
 from test_parallel_executor_transformer import transformer, ModelHyperParams, transformer_model, transformer, prepare_batch_input
 from parallel_executor_test_base import TestParallelExecutorBase
 # disable temporarily because of timeout.
 sys.exit(0)
 # NOTE(dzhwinter): test different strategy collisions.
 # open the eager delete tensor strategy by default.