From fac8702269b2e91891ffccdd684be9d5f91ff31c Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Thu, 13 Dec 2018 22:39:40 +0800
Subject: [PATCH 01/17] adam support multithread

---
 paddle/fluid/framework/operator.cc          |  2 ++
 paddle/fluid/framework/operator.h           |  3 +++
 paddle/fluid/operators/optimizers/adam_op.h | 30 ++++++++++++++++++---
 python/paddle/fluid/__init__.py             |  3 ++-
 4 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 66055e6f1d..c4ff97948a 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -30,6 +30,8 @@ DECLARE_bool(benchmark);
 DEFINE_bool(check_nan_inf, false,
             "Checking whether operator produce NAN/INF or not. It will be "
             "extremely slow so please use this flag wisely.");
+DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
+DEFINE_int32(min_param_size_to_use_multithread, 0, "");

 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 0a6a28a5bc..175f7975a3 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -34,6 +34,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/variant.h"

+DECLARE_int32(inner_op_parallelism);
+DECLARE_int32(min_param_size_to_use_multithread);
+
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 3455d1ee54..aabb71c556 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include
 #include
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/algorithm.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
@@ -352,10 +353,31 @@ class AdamOpKernel : public framework::OpKernel {
           lr.template data(), grad_data, param.template data(),
           param_out.template mutable_data(ctx.GetPlace()), rows, row_numel,
           grad_merge.rows().size());
-      platform::ForRange for_range(
-          static_cast(ctx.device_context()),
-          param.numel());
-      for_range(functor);
+      int inner_op_parallelism = FLAGS_inner_op_parallelism;
+      if (inner_op_parallelism > 1 &&
+          FLAGS_min_param_size_to_use_multithread > 0 &&
+          param.numel() > FLAGS_min_param_size_to_use_multithread) {
+        std::vector> fs;
+        int64_t block_size = param.numel() / inner_op_parallelism;
+        for (int i = 0; i < inner_op_parallelism; ++i) {
+          int64_t start = i * block_size;
+          int64_t end = (i + 1) * block_size;
+          if (end > param.numel()) {
+            end = param.numel();
+          }
+          fs.push_back(framework::Async([&functor, start, end]() {
+            for (int64_t i = start; i < end; ++i) {
+              functor(i);
+            }
+          }));
+        }
+        for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
+      } else {
+        platform::ForRange for_range(
+            static_cast(ctx.device_context()),
+            param.numel());
+        for_range(functor);
+      }
     } else {
       PADDLE_THROW("Variable type not supported by adam_op");
     }
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index e0bb0d1152..1b24e01c22 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -128,7 +128,8 @@ def __bootstrap__():
         'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir', 'pe_profile_fname'
+        'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism',
+        'min_param_size_to_use_multithread'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')

From 59cf96ec18ed73ae97b91ab233d2270cbb42a905 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 18 Dec 2018 09:33:10 +0800
Subject: [PATCH 02/17] add log

---
 paddle/fluid/operators/optimizers/adam_op.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index aabb71c556..7dd5a8783a 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -357,6 +357,9 @@ class AdamOpKernel : public framework::OpKernel {
       if (inner_op_parallelism > 1 &&
          FLAGS_min_param_size_to_use_multithread > 0 &&
          param.numel() > FLAGS_min_param_size_to_use_multithread) {
+        VLOG(3) << "use multi thread, inner_op_parallelism="
+                << inner_op_parallelism << " min_param_size_to_use_multithread"
+                << FLAGS_min_param_size_to_use_multithread;
         std::vector> fs;
         int64_t block_size = param.numel() / inner_op_parallelism;

From 8936c7913b7b25a536470ac2a20999b8744cca5f Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 18 Dec 2018 09:58:54 +0800
Subject: [PATCH 03/17] add log test=develop

---
 paddle/fluid/operators/optimizers/adam_op.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 7dd5a8783a..5ba5639fd5 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -358,7 +358,7 @@ class AdamOpKernel : public framework::OpKernel {
          FLAGS_min_param_size_to_use_multithread > 0 &&
          param.numel() > FLAGS_min_param_size_to_use_multithread) {
         VLOG(3) << "use multi thread, inner_op_parallelism="
-                << inner_op_parallelism << " min_param_size_to_use_multithread"
+                << inner_op_parallelism << " min_param_size_to_use_multithread="
                 << FLAGS_min_param_size_to_use_multithread;
         std::vector> fs;
         int64_t block_size = param.numel() / inner_op_parallelism;

From 0820d369f2a18e5eb5f906f43a5f525245f3fba1 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Sat, 22 Dec 2018 22:11:28 +0800
Subject: [PATCH 04/17] fix typo test=develop

---
 python/paddle/fluid/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 4082af438c..745a14af86 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -136,8 +136,7 @@ def __bootstrap__():
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
         'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
-        'inner_op_parallelism'
-        'min_param_size_to_use_multithread'
+        'inner_op_parallelism', 'min_param_size_to_use_multithread'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
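[PATCH 01]-[PATCH 04] above introduce two global flags, FLAGS_inner_op_parallelism and FLAGS_min_param_size_to_use_multithread, and split the flat parameter range [0, param.numel()) into contiguous blocks that are submitted to the framework thread pool via framework::Async, with the kernel then waiting on the returned futures. The sketch below is a self-contained Python/NumPy illustration of that partitioning idea only (all names are illustrative, and threads are used to mirror the structure, not for speed); note that the (i + 1) * block_size split in the patch leaves the last numel % inner_op_parallelism elements unvisited, which the sketch instead hands to the final worker.

    # Standalone NumPy sketch of the block-partitioned dense update used above.
    # Names and defaults are illustrative; the real kernel operates on raw
    # tensor buffers in C++ via framework::Async.
    import numpy as np
    from concurrent.futures import ThreadPoolExecutor

    def adam_update_range(start, end, p, m1, m2, g, lr, beta1, beta2, eps,
                          beta1_pow, beta2_pow):
        """Element-wise Adam on the half-open range [start, end)."""
        lr_t = lr * np.sqrt(1.0 - beta2_pow) / (1.0 - beta1_pow)
        for i in range(start, end):
            m1[i] = beta1 * m1[i] + (1.0 - beta1) * g[i]
            m2[i] = beta2 * m2[i] + (1.0 - beta2) * g[i] * g[i]
            p[i] -= lr_t * m1[i] / (np.sqrt(m2[i]) + eps)

    def parallel_adam(p, m1, m2, g, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8,
                      beta1_pow=0.9, beta2_pow=0.999, num_threads=4):
        numel = p.size
        block = numel // num_threads  # same integer split as the patch
        with ThreadPoolExecutor(max_workers=num_threads) as pool:
            futures = []
            for t in range(num_threads):
                start = t * block
                # unlike the patch, hand the remainder to the last worker
                end = numel if t == num_threads - 1 else (t + 1) * block
                futures.append(pool.submit(adam_update_range, start, end, p,
                                           m1, m2, g, lr, beta1, beta2, eps,
                                           beta1_pow, beta2_pow))
            for f in futures:
                f.result()  # analogous to fs[i].wait()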
From 1177b0bc84b42fb6608568073ba096bc10d3865e Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Thu, 27 Dec 2018 10:20:27 +0800
Subject: [PATCH 05/17] update multi thread adam

---
 paddle/fluid/operators/optimizers/adam_op.h | 32 ++++++++++++---------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index e9fbe15cbe..f8c7b82053 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -465,14 +465,14 @@ class AdamOpKernel : public framework::OpKernel {
     if (platform::is_cpu_place(ctx.GetPlace())) {
       SparseAdamFunctor functor(
-        beta1, beta2, epsilon, beta1_pow.template data(),
-        beta2_pow.template data(), mom1.template data(),
-        mom1_out.template mutable_data(ctx.GetPlace()),
-        mom2.template data(),
-        mom2_out.template mutable_data(ctx.GetPlace()),
-        lr.template data(), grad_data, param.template data(),
-        param_out.template mutable_data(ctx.GetPlace()), rows, row_numel,
-        grad_merge.rows().size(), lazy_mode);
+          beta1, beta2, epsilon, beta1_pow.template data(),
+          beta2_pow.template data(), mom1.template data(),
+          mom1_out.template mutable_data(ctx.GetPlace()),
+          mom2.template data(),
+          mom2_out.template mutable_data(ctx.GetPlace()),
+          lr.template data(), grad_data, param.template data(),
+          param_out.template mutable_data(ctx.GetPlace()), rows, row_numel,
+          grad_merge.rows().size(), lazy_mode);
       // multi thread speedup
       if (FLAGS_inner_op_parallelism > 1 &&
           FLAGS_min_param_size_to_use_multithread > 0 &&
          param.numel() > FLAGS_min_param_size_to_use_multithread) {
@@ -491,17 +491,20 @@ class AdamOpKernel : public framework::OpKernel {
         row_id_to_grad_row_offset[grad_rows[i]] = i;
       }
       std::vector> fs;
-      int64_t line_in_each_thread = param_row_count / FLAGS_inner_op_parallelism;
+      int64_t line_in_each_thread =
+          param_row_count / FLAGS_inner_op_parallelism;
       for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
         int64_t start = i * line_in_each_thread;
         int64_t end = (i + 1) * line_in_each_thread;
         if (end > param_row_count) {
           end = param_row_count;
         }
-        fs.push_back(framework::Async([&functor, &row_id_to_grad_row_offset, start, end]() {
-          for (int64_t i = start; i < end; ++i) {
-            functor.update_row(i, row_id_to_grad_row_offset[i]);
-          }}));
+        fs.push_back(framework::Async(
+            [&functor, &row_id_to_grad_row_offset, start, end]() {
+              for (int64_t i = start; i < end; ++i) {
+                functor.update_row(i, row_id_to_grad_row_offset[i]);
+              }
+            }));
       }
       for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
     } else {
@@ -511,7 +514,8 @@ class AdamOpKernel : public framework::OpKernel {
         for (size_t row_index = 0; row_index < row_count; ++row_index) {
           for (size_t offset = 0; offset < row_numel; ++offset) {
             size_t i = cpu_rows[row_index] * row_numel + offset;
-            functor.adam_update(i, grad_data[row_index * row_numel + offset]);
+            functor.adam_update(i,
+                                grad_data[row_index * row_numel + offset]);
           }
         }
       } else {

From d0572bf02ede9110719462861d445e104e391715 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Thu, 27 Dec 2018 10:46:55 +0800
Subject: [PATCH 06/17] add log for lazy mode test=develop

---
 paddle/fluid/operators/optimizers/adam_op.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index f8c7b82053..6b794e0d3e 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -509,6 +509,7 @@ class AdamOpKernel : public framework::OpKernel {
       for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
     } else {
       if (lazy_mode) {
+        VLOG(3) << "run cpu lazy mode";
         size_t row_count = grad_merge.rows().size();
         std::vector cpu_rows(grad_merge.rows());
         for (size_t row_index = 0; row_index < row_count; ++row_index) {

From 7a58ad5c7921f1038f8d2c0436939864ed6c8d67 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Thu, 27 Dec 2018 11:23:10 +0800
Subject: [PATCH 07/17] lazy mode have higher priority then multithread
 test=develop

---
 paddle/fluid/operators/optimizers/adam_op.h | 32 +++++++++------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 6b794e0d3e..6ff2a2bb6f 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -473,10 +473,19 @@ class AdamOpKernel : public framework::OpKernel {
           lr.template data(), grad_data, param.template data(),
           param_out.template mutable_data(ctx.GetPlace()), rows, row_numel,
           grad_merge.rows().size(), lazy_mode);
-      // multi thread speedup
-      if (FLAGS_inner_op_parallelism > 1 &&
-          FLAGS_min_param_size_to_use_multithread > 0 &&
-          param.numel() > FLAGS_min_param_size_to_use_multithread) {
+      if (lazy_mode) {
+        VLOG(3) << "run cpu lazy mode";
+        size_t row_count = grad_merge.rows().size();
+        std::vector cpu_rows(grad_merge.rows());
+        for (size_t row_index = 0; row_index < row_count; ++row_index) {
+          for (size_t offset = 0; offset < row_numel; ++offset) {
+            size_t i = cpu_rows[row_index] * row_numel + offset;
+            functor.adam_update(i, grad_data[row_index * row_numel + offset]);
+          }
+        }
+      } else if (FLAGS_inner_op_parallelism > 1 &&
+                 FLAGS_min_param_size_to_use_multithread > 0 &&
+                 param.numel() > FLAGS_min_param_size_to_use_multithread) {
         VLOG(3) << "use multi thread, inner_op_parallelism="
                 << FLAGS_inner_op_parallelism
                 << " min_param_size_to_use_multithread="
@@ -508,20 +517,7 @@ class AdamOpKernel : public framework::OpKernel {
         }
         for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
       } else {
-        if (lazy_mode) {
-          VLOG(3) << "run cpu lazy mode";
-          size_t row_count = grad_merge.rows().size();
-          std::vector cpu_rows(grad_merge.rows());
-          for (size_t row_index = 0; row_index < row_count; ++row_index) {
-            for (size_t offset = 0; offset < row_numel; ++offset) {
-              size_t i = cpu_rows[row_index] * row_numel + offset;
-              functor.adam_update(i,
-                                  grad_data[row_index * row_numel + offset]);
-            }
-          }
-        } else {
-          functor(param.numel());
-        }
+        functor(param.numel());
       }
     } else if (platform::is_gpu_place(ctx.GetPlace())) {
       SparseAdamFunctor functor(
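[PATCH 05] switches the multi-threaded path from element blocks to whole parameter rows: it builds a map from parameter row id to that row's offset in the merged sparse gradient (offset -1 meaning "no gradient this step") and calls functor.update_row(row, offset), so rows without a gradient still receive a zero-gradient update and their moments keep decaying. [PATCH 07] then gives lazy_mode priority, since lazy mode only touches rows that actually appear in the gradient and is incompatible with a visit-every-row threading scheme. A rough single-threaded NumPy sketch of the row-wise rule, with illustrative names:

    # Row-wise sparse Adam as used by the multi-threaded branch: every
    # parameter row is visited; rows absent from the merged sparse gradient
    # get g = 0 so their moments still decay.  Illustrative sketch only.
    import numpy as np

    def sparse_adam_rows(param, mom1, mom2, grad_rows, grad_values, lr=0.001,
                         beta1=0.9, beta2=0.999, eps=1e-8,
                         beta1_pow=0.9, beta2_pow=0.999):
        row_numel = param.shape[1]
        # parameter row id -> row offset inside the sparse gradient values
        row_to_offset = {int(r): i for i, r in enumerate(grad_rows)}
        lr_t = lr * np.sqrt(1.0 - beta2_pow) / (1.0 - beta1_pow)
        zero = np.zeros(row_numel, dtype=param.dtype)
        for row in range(param.shape[0]):
            offset = row_to_offset.get(row)
            g = grad_values[offset] if offset is not None else zero
            mom1[row] = beta1 * mom1[row] + (1.0 - beta1) * g
            mom2[row] = beta2 * mom2[row] + (1.0 - beta2) * g * g
            param[row] -= lr_t * mom1[row] / (np.sqrt(mom2[row]) + eps)

In lazy mode the loop would run over grad_rows only, which is why the two branches are mutually exclusive after [PATCH 07].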
From d16121533295c04e407c6e25dc0a9aaf3079fe2d Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Thu, 27 Dec 2018 13:37:29 +0800
Subject: [PATCH 08/17] optimize adam multi thread

---
 paddle/fluid/operators/optimizers/adam_op.h          | 13 ++++++++++++-
 python/paddle/fluid/tests/unittests/test_adam_op.py  | 10 +++++-----
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 6ff2a2bb6f..f907522d5a 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -490,9 +490,17 @@ class AdamOpKernel : public framework::OpKernel {
                 << FLAGS_inner_op_parallelism
                 << " min_param_size_to_use_multithread="
                 << FLAGS_min_param_size_to_use_multithread;
+        PADDLE_ENFORCE_LE(
+            FLAGS_inner_op_parallelism, 8,
+            "FLAGS_inner_op_parallelism should not be larger then 8");
         auto& grad_rows = grad_merge.rows();
         std::unordered_map row_id_to_grad_row_offset;
         size_t param_row_count = param.numel() / row_numel;
+        if (param_row_count < 1000) {
+          LOG(WARNING) << "param_row_count should be larger then 1000 to use "
+                          "multi thread, currently "
+                       << param_row_count;
+        }
         for (size_t i = 0; i < param_row_count; ++i) {
           row_id_to_grad_row_offset[i] = -1;
         }
@@ -501,10 +509,13 @@ class AdamOpKernel : public framework::OpKernel {
         }
         std::vector> fs;
         int64_t line_in_each_thread =
-            param_row_count / FLAGS_inner_op_parallelism;
+            param_row_count / FLAGS_inner_op_parallelism + 1;
         for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
           int64_t start = i * line_in_each_thread;
           int64_t end = (i + 1) * line_in_each_thread;
+          if (start >= param_row_count) {
+            break;
+          }
           if (end > param_row_count) {
             end = param_row_count;
           }
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index ff7fc5100e..463a0655a8 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -253,11 +253,11 @@ class TestSparseAdamOp(unittest.TestCase):
         row_numel = 12
         self.row_numel = row_numel
         self.dense_inputs = {
-            "Param": np.full((height, row_numel), 5.0).astype("float32"),
-            "Moment1": np.full((height, row_numel), 5.0).astype("float32"),
-            "Moment2": np.full((height, row_numel), 5.0).astype("float32"),
-            'Beta1Pow': np.array([beta1**10]).astype("float32"),
-            'Beta2Pow': np.array([beta2**10]).astype("float32"),
+            "Param": np.full((height, row_numel), 1.0).astype("float32"),
+            "Moment1": np.full((height, row_numel), 1.0).astype("float32"),
+            "Moment2": np.full((height, row_numel), 1.0).astype("float32"),
+            'Beta1Pow': np.array([beta1**3]).astype("float32"),
+            'Beta2Pow': np.array([beta2**3]).astype("float32"),
             "LearningRate": np.full((1), 2.0).astype("float32")
         }
         self.init_output = np.full((height, row_numel), 0.0).astype("float32")

From 39a400345e76acc2e6fd04940dc64684ed2c19b0 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Thu, 27 Dec 2018 14:17:26 +0800
Subject: [PATCH 09/17] add unit test for test_adam_op_multi_thread
 test=develop

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 6d6fe245d8..bc3e03b53c 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -86,6 +86,7 @@ list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
+py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4 FLAGS_min_param_size_to_use_multithread=2)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
 py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL)
 py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL)

From 0e747e8d020bba36943824550556260b9bc5d7d3 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Thu, 3 Jan 2019 14:45:57 +0800
Subject: [PATCH 10/17] change the limit of thead num

---
 paddle/fluid/operators/optimizers/adam_op.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index f907522d5a..1f0dbedcfb 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -490,9 +490,10 @@ class AdamOpKernel : public framework::OpKernel {
                 << FLAGS_inner_op_parallelism
                 << " min_param_size_to_use_multithread="
                 << FLAGS_min_param_size_to_use_multithread;
-        PADDLE_ENFORCE_LE(
-            FLAGS_inner_op_parallelism, 8,
-            "FLAGS_inner_op_parallelism should not be larger then 8");
+        if (FLAGS_inner_op_parallelism > 10) {
+          LOG(WARNING) << "FLAGS_inner_op_parallelism "
+                       << FLAGS_inner_op_parallelism << " is two large!";
+        }
         auto& grad_rows = grad_merge.rows();
         std::unordered_map row_id_to_grad_row_offset;
         size_t param_row_count = param.numel() / row_numel;

From c15270c5b20d31bff04bd66bbc8f37f188213d72 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Sun, 6 Jan 2019 15:50:26 +0800
Subject: [PATCH 11/17] optimize multi thread adam

---
 paddle/fluid/operators/optimizers/adam_op.h | 32 ++++++++++++---------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 1f0dbedcfb..b84d63f51a 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -305,13 +305,6 @@ struct SparseAdamFunctor {
     param_out_[i] = p;
   }

-  inline void update_row(size_t row_id, int grad_row_offset) const {
-    for (size_t i = 0U; i < row_numel_; ++i) {
-      T g = grad_row_offset >= 0 ? grad_[grad_row_offset * row_numel_ + i] : 0;
-      adam_update(row_id * row_numel_ + i, g);
-    }
-  }
-
   inline void operator()(size_t numel) const {  // lr could be reuse
     T lr = *lr_;
@@ -502,9 +495,6 @@ class AdamOpKernel : public framework::OpKernel {
                      "multi thread, currently "
                   << param_row_count;
         }
-        for (size_t i = 0; i < param_row_count; ++i) {
-          row_id_to_grad_row_offset[i] = -1;
-        }
         for (size_t i = 0; i < grad_rows.size(); ++i) {
           row_id_to_grad_row_offset[grad_rows[i]] = i;
         }
@@ -520,10 +510,24 @@ class AdamOpKernel : public framework::OpKernel {
           if (end > param_row_count) {
             end = param_row_count;
           }
-          fs.push_back(framework::Async(
-              [&functor, &row_id_to_grad_row_offset, start, end]() {
-                for (int64_t i = start; i < end; ++i) {
-                  functor.update_row(i, row_id_to_grad_row_offset[i]);
+          fs.push_back(
+              framework::Async([&functor, &row_id_to_grad_row_offset,
+                                &grad_data, row_numel, start, end]() {
+                for (int64_t row_id = start; row_id < end; ++row_id) {
+                  auto iter = row_id_to_grad_row_offset.find(row_id);
+                  if (iter != row_id_to_grad_row_offset.end()) {
+                    for (size_t row_offset = 0U; row_offset < row_numel;
+                         ++row_offset) {
+                      functor.adam_update(
+                          row_id * row_numel + row_offset,
+                          grad_data[iter->second * row_numel + row_offset]);
+                    }
+                  } else {
+                    for (size_t row_offset = 0U; row_offset < row_numel;
+                         ++row_offset) {
+                      functor.adam_update(row_id * row_numel + row_offset, 0);
+                    }
+                  }
                 }
               }));
         }
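[PATCH 08] repairs the row split so that every row is covered: rows per thread become param_row_count / FLAGS_inner_op_parallelism + 1, and a worker whose start falls past the last row simply breaks out. [PATCH 10] relaxes the hard PADDLE_ENFORCE_LE(..., 8, ...) limit to a warning, and [PATCH 11] drops update_row in favour of an unordered_map::find lookup inside the lambda, so the map no longer has to be pre-filled with -1 for every row. The chunking arithmetic alone, as a small self-contained sketch:

    # Row-range chunking after the "+ 1" fix: ceiling-style split so every
    # row is covered, plus a guard for workers that would start past the end.
    def row_chunks(param_row_count, num_threads):
        lines_per_thread = param_row_count // num_threads + 1
        chunks = []
        for t in range(num_threads):
            start = t * lines_per_thread
            if start >= param_row_count:
                break  # more workers than rows: later workers get nothing
            end = min((t + 1) * lines_per_thread, param_row_count)
            chunks.append((start, end))
        return chunks

    print(row_chunks(10, 4))  # [(0, 3), (3, 6), (6, 9), (9, 10)]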
From 87b4eb1da497c1ac4cc1a3d50a1f317b839c954d Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Mon, 7 Jan 2019 17:13:47 +0800
Subject: [PATCH 12/17] change min_param_size_to_use_multithread to
 min_row_size_to_use_multithread

---
 paddle/fluid/framework/operator.cc                 | 2 +-
 paddle/fluid/framework/operator.h                  | 2 +-
 paddle/fluid/operators/optimizers/adam_op.h        | 8 ++++----
 python/paddle/fluid/__init__.py                    | 2 +-
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 4c4fb03c22..9cb2b5ee71 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -30,7 +30,7 @@ DEFINE_bool(check_nan_inf, false,
             "Checking whether operator produce NAN/INF or not. It will be "
             "extremely slow so please use this flag wisely.");
 DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
-DEFINE_int32(min_param_size_to_use_multithread, 0, "");
+DEFINE_int32(min_row_size_to_use_multithread, 0, "");

 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index eea3db6577..2962dff122 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -35,7 +35,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/variant.h"

 DECLARE_int32(inner_op_parallelism);
-DECLARE_int32(min_param_size_to_use_multithread);
+DECLARE_int32(min_row_size_to_use_multithread);

 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index e69ede6239..9cd7906877 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -478,12 +478,12 @@ class AdamOpKernel : public framework::OpKernel {
           }
         }
       } else if (FLAGS_inner_op_parallelism > 1 &&
-                 FLAGS_min_param_size_to_use_multithread > 0 &&
-                 param.numel() > FLAGS_min_param_size_to_use_multithread) {
+                 FLAGS_min_row_size_to_use_multithread > 0 &&
+                 param.dims()[0] > FLAGS_min_row_size_to_use_multithread) {
         VLOG(3) << "use multi thread, inner_op_parallelism="
                 << FLAGS_inner_op_parallelism
-                << " min_param_size_to_use_multithread="
-                << FLAGS_min_param_size_to_use_multithread;
+                << " min_row_size_to_use_multithread="
+                << FLAGS_min_row_size_to_use_multithread;
         if (FLAGS_inner_op_parallelism > 10) {
           LOG(WARNING) << "FLAGS_inner_op_parallelism "
                        << FLAGS_inner_op_parallelism << " is two large!";
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 691b49130b..b577dfc3e1 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -129,7 +129,7 @@ def __bootstrap__():
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
         'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
-        'inner_op_parallelism', 'min_param_size_to_use_multithread',
+        'inner_op_parallelism', 'min_row_size_to_use_multithread',
         'enable_parallel_graph'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 79edc92055..ac092e19b4 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -87,7 +87,7 @@ list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
-py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4 FLAGS_min_param_size_to_use_multithread=2)
+py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4 FLAGS_min_row_size_to_use_multithread=2)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
 py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL)
 py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL)

From 44b300556dcdf26aa159bc31107355e8b3853d86 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Mon, 7 Jan 2019 17:34:52 +0800
Subject: [PATCH 13/17] change min_row_size_to_use_multithread to parameter of
 adam test=develop

---
 paddle/fluid/framework/operator.cc                   |  1 -
 paddle/fluid/framework/operator.h                    |  1 -
 paddle/fluid/operators/optimizers/adam_op.cc         |  7 +++++++
 paddle/fluid/operators/optimizers/adam_op.h          |  8 +++++---
 python/paddle/fluid/__init__.py                      |  3 +--
 python/paddle/fluid/optimizer.py                     | 10 ++++++++--
 python/paddle/fluid/tests/unittests/CMakeLists.txt   |  2 +-
 python/paddle/fluid/tests/unittests/test_adam_op.py  |  7 ++++++-
 8 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 9cb2b5ee71..afece8e3d2 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -30,7 +30,6 @@ DEFINE_bool(check_nan_inf, false,
             "Checking whether operator produce NAN/INF or not. It will be "
             "extremely slow so please use this flag wisely.");
 DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
-DEFINE_int32(min_row_size_to_use_multithread, 0, "");

 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 2962dff122..dd672c4795 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -35,7 +35,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/variant.h"

 DECLARE_int32(inner_op_parallelism);
-DECLARE_int32(min_row_size_to_use_multithread);

 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index e9c395a931..955f9f455f 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -114,6 +114,13 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
                 "(bool, default false) "
                 "only update the parameter that has gradient in sparse update")
       .SetDefault(false);
+  AddAttr("min_row_size_to_use_multithread",
+          "(int64_t, default 0) "
+          "when not zero, if param row size is larger then "
+          "min_row_size_to_use_multithread and "
+          "inner_op_parallelism is larger then 0, sparse update "
+          "will run in multithread mode")
+      .SetDefault(0);
   AddComment(R"DOC(
 Adam Optimizer.
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 9cd7906877..2c16a02f6a 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -354,6 +354,8 @@ class AdamOpKernel : public framework::OpKernel {
     using paddle::framework::LoDTensor;
     using paddle::operators::detail::Ref;

+    int64_t min_row_size_to_use_multithread =
+        ctx.Attr("min_row_size_to_use_multithread");
     bool lazy_mode = ctx.Attr("lazy_mode");
     T beta1 = static_cast(ctx.Attr("beta1"));
     T beta2 = static_cast(ctx.Attr("beta2"));
@@ -478,12 +480,12 @@ class AdamOpKernel : public framework::OpKernel {
           }
         }
       } else if (FLAGS_inner_op_parallelism > 1 &&
-                 FLAGS_min_row_size_to_use_multithread > 0 &&
-                 param.dims()[0] > FLAGS_min_row_size_to_use_multithread) {
+                 min_row_size_to_use_multithread > 0 &&
+                 param.dims()[0] > min_row_size_to_use_multithread) {
         VLOG(3) << "use multi thread, inner_op_parallelism="
                 << FLAGS_inner_op_parallelism
-                << " min_row_size_to_use_multithread="
-                << FLAGS_min_row_size_to_use_multithread;
+                << " min_row_size_to_use_multithread="
+                << min_row_size_to_use_multithread;
         if (FLAGS_inner_op_parallelism > 10) {
           LOG(WARNING) << "FLAGS_inner_op_parallelism "
                        << FLAGS_inner_op_parallelism << " is two large!";
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index b577dfc3e1..812694d99a 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -129,8 +129,7 @@ def __bootstrap__():
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
         'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
-        'inner_op_parallelism', 'min_row_size_to_use_multithread',
-        'enable_parallel_graph'
+        'inner_op_parallelism', 'enable_parallel_graph'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 779cb5f961..64d7fd0822 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -674,6 +674,8 @@ class AdamOptimizer(Optimizer):
             may be very slow. The lazy mode only update the element that has gradient is the current
             mini-batch, so it will be much more faster. But this mode has different semantics with the
             original Adam algorithm and may lead to different result.
+        min_row_size_to_use_multithread: if adam use sparse update and the param rows is very large,
+            you can use FLAGS_inner_op_parallelism and this flag to enable multi thread optimize.

     Examples:
         .. code-block:: python
@@ -694,7 +696,8 @@ class AdamOptimizer(Optimizer):
                  epsilon=1e-8,
                  regularization=None,
                  name=None,
-                 lazy_mode=False):
+                 lazy_mode=False,
+                 min_row_size_to_use_multithread=0):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -708,6 +711,7 @@ class AdamOptimizer(Optimizer):
         self._beta2 = beta2
         self._epsilon = epsilon
         self._lazy_mode = lazy_mode
+        self._min_row_size_to_use_multithread = min_row_size_to_use_multithread

     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -762,7 +766,9 @@ class AdamOptimizer(Optimizer):
                 "beta1": self._beta1,
                 "beta2": self._beta2,
                 "epsilon": self._epsilon,
-                "lazy_mode": self._lazy_mode
+                "lazy_mode": self._lazy_mode,
+                "min_row_size_to_use_multithread":
+                self._min_row_size_to_use_multithread
             },
             stop_gradient=True)
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index ac092e19b4..4f7111df44 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -87,7 +87,7 @@ list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
-py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4 FLAGS_min_row_size_to_use_multithread=2)
+py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
 py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL)
 py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL)
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 463a0655a8..2f4fc57724 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -261,7 +261,12 @@ class TestSparseAdamOp(unittest.TestCase):
             "LearningRate": np.full((1), 2.0).astype("float32")
         }
         self.init_output = np.full((height, row_numel), 0.0).astype("float32")
-        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
+        self.attrs = {
+            'epsilon': epsilon,
+            'beta1': beta1,
+            'beta2': beta2,
+            'min_row_size_to_use_multithread': 2
+        }

         grad_selected_rows = scope.var('Grad').get_selected_rows()
         grad_selected_rows.set_height(height)
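[PATCH 12] and [PATCH 13] rename the threshold to min_row_size_to_use_multithread, compare it against the row count param.dims()[0] instead of numel(), and move it from a global flag onto the operator itself: the kernel reads it with ctx.Attr(...), AdamOpMaker declares the attribute, and the Python AdamOptimizer forwards it when building the op. A condensed illustration of the resulting gate (plain Python mirroring the C++ condition; not Paddle code):

    # The multi-threaded sparse path is taken only when both the global
    # thread flag and the per-op row threshold allow it; mirrored for clarity.
    def use_multithread_update(inner_op_parallelism,
                               min_row_size_to_use_multithread,
                               param_rows, lazy_mode):
        if lazy_mode:  # lazy mode still takes priority (see [PATCH 07])
            return False
        return (inner_op_parallelism > 1 and
                min_row_size_to_use_multithread > 0 and
                param_rows > min_row_size_to_use_multithread)

Note that the min_row_size_to_use_multithread argument added to AdamOptimizer.__init__ here is removed again in [PATCH 15]; only the op attribute survives.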
From 4d169ad9818d8a8ed3681e4fab9733fc40a77e8c Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 8 Jan 2019 15:17:21 +0800
Subject: [PATCH 14/17] update api spec test=develop

---
 paddle/fluid/API.spec | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 9872631553..6b92ccf1f0 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -410,7 +410,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin
 paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode', 'min_row_size_to_use_multithread'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False, 0))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))

From 8c516a24e5d670dea5982bdfb6a07a79c03cd31d Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 15 Jan 2019 09:56:40 +0800
Subject: [PATCH 15/17] remote min_row_size_to_use_multithread in adam
 interface test=develop

---
 paddle/fluid/API.spec                        |  2 +-
 paddle/fluid/operators/optimizers/adam_op.cc |  2 +-
 paddle/fluid/operators/optimizers/adam_op.h  | 10 +++++-----
 python/paddle/fluid/optimizer.py             | 10 ++--------
 4 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index aec60166a1..50ffef72ba 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -418,7 +418,7 @@ paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning
 paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode', 'min_row_size_to_use_multithread'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False, 0))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
 paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index 955f9f455f..54e0f5146d 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -120,7 +120,7 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
           "min_row_size_to_use_multithread and "
           "inner_op_parallelism is larger then 0, sparse update "
           "will run in multithread mode")
-      .SetDefault(0);
+      .SetDefault(1000);
   AddComment(R"DOC(
 Adam Optimizer.
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index f3c9be63d1..db44cd6ec9 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -494,16 +494,16 @@ class AdamOpKernel : public framework::OpKernel {
                 << " min_row_size_to_use_multithread="
                 << min_row_size_to_use_multithread;
         if (FLAGS_inner_op_parallelism > 10) {
-          LOG(WARNING) << "FLAGS_inner_op_parallelism "
-                       << FLAGS_inner_op_parallelism << " is two large!";
+          VLOG(1) << "FLAGS_inner_op_parallelism "
+                  << FLAGS_inner_op_parallelism << " is two large!";
         }
         auto& grad_rows = grad_merge.rows();
         std::unordered_map row_id_to_grad_row_offset;
         size_t param_row_count = param.numel() / row_numel;
         if (param_row_count < 1000) {
-          LOG(WARNING) << "param_row_count should be larger then 1000 to use "
-                          "multi thread, currently "
-                       << param_row_count;
+          VLOG(1) << "param_row_count should be larger then 1000 to use "
+                     "multi thread, currently "
+                  << param_row_count;
         }
         for (size_t i = 0; i < grad_rows.size(); ++i) {
           row_id_to_grad_row_offset[grad_rows[i]] = i;
         }
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 906d64ffdd..f01a0eda9a 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -734,8 +734,6 @@ class AdamOptimizer(Optimizer):
             may be very slow. The lazy mode only update the element that has gradient is the current
             mini-batch, so it will be much more faster. But this mode has different semantics with the
             original Adam algorithm and may lead to different result.
-        min_row_size_to_use_multithread: if adam use sparse update and the param rows is very large,
-            you can use FLAGS_inner_op_parallelism and this flag to enable multi thread optimize.

     Examples:
         .. code-block:: python
@@ -756,8 +754,7 @@ class AdamOptimizer(Optimizer):
                  epsilon=1e-8,
                  regularization=None,
                  name=None,
-                 lazy_mode=False,
-                 min_row_size_to_use_multithread=0):
+                 lazy_mode=False):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -771,7 +768,6 @@ class AdamOptimizer(Optimizer):
         self._beta2 = beta2
         self._epsilon = epsilon
         self._lazy_mode = lazy_mode
-        self._min_row_size_to_use_multithread = min_row_size_to_use_multithread

     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -826,9 +822,7 @@ class AdamOptimizer(Optimizer):
                 "beta1": self._beta1,
                 "beta2": self._beta2,
                 "epsilon": self._epsilon,
-                "lazy_mode": self._lazy_mode,
-                "min_row_size_to_use_multithread":
-                self._min_row_size_to_use_multithread
+                "lazy_mode": self._lazy_mode
             },
             stop_gradient=True)

From a6b3bf606925b7a124b56f282f74619e7362bc1a Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 15 Jan 2019 10:07:40 +0800
Subject: [PATCH 16/17] add attr min_row_size_to_use_multithread in op config
 test=develop

---
 python/paddle/fluid/optimizer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index f01a0eda9a..b72b900d3b 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -822,7 +822,8 @@ class AdamOptimizer(Optimizer):
                 "beta1": self._beta1,
                 "beta2": self._beta2,
                 "epsilon": self._epsilon,
-                "lazy_mode": self._lazy_mode
+                "lazy_mode": self._lazy_mode,
+                "min_row_size_to_use_multithread": 1000
             },
             stop_gradient=True)

From a2f2cde0314f698a935dcbaa3d038cfc2bfc6355 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 15 Jan 2019 10:28:09 +0800
Subject: [PATCH 17/17] revert test_adam_op test=develop

---
 python/paddle/fluid/tests/unittests/test_adam_op.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 2f4fc57724..15f277cdc0 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -253,11 +253,11 @@ class TestSparseAdamOp(unittest.TestCase):
         row_numel = 12
         self.row_numel = row_numel
         self.dense_inputs = {
-            "Param": np.full((height, row_numel), 1.0).astype("float32"),
-            "Moment1": np.full((height, row_numel), 1.0).astype("float32"),
-            "Moment2": np.full((height, row_numel), 1.0).astype("float32"),
-            'Beta1Pow': np.array([beta1**3]).astype("float32"),
-            'Beta2Pow': np.array([beta2**3]).astype("float32"),
+            "Param": np.full((height, row_numel), 5.0).astype("float32"),
+            "Moment1": np.full((height, row_numel), 5.0).astype("float32"),
+            "Moment2": np.full((height, row_numel), 5.0).astype("float32"),
+            'Beta1Pow': np.array([beta1**10]).astype("float32"),
+            'Beta2Pow': np.array([beta2**10]).astype("float32"),
             "LearningRate": np.full((1), 2.0).astype("float32")
         }
         self.init_output = np.full((height, row_numel), 0.0).astype("float32")
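After [PATCH 15]-[PATCH 17] the series settles on FLAGS_inner_op_parallelism as a global flag (readable from the environment, since __init__.py lists it in read_env_flags) and min_row_size_to_use_multithread as an op attribute that the Python wrapper pins to 1000. The sketch below is an assumption-laden usage illustration against the fluid API of that era; flag plumbing and layer names may differ on other Paddle versions.

    # Hedged usage sketch: enable the multi-threaded sparse Adam update by
    # setting the environment flag before paddle.fluid is imported, then
    # build a sparse (SelectedRows) model as usual.  Flag names come from
    # the patches above; behaviour on other Paddle versions may differ.
    import os
    os.environ['FLAGS_inner_op_parallelism'] = '4'  # read at fluid bootstrap

    import paddle.fluid as fluid

    ids = fluid.layers.data(name='ids', shape=[1], dtype='int64')
    emb = fluid.layers.embedding(input=ids, size=[100000, 64], is_sparse=True)
    loss = fluid.layers.reduce_mean(emb)

    # lazy_mode=False so the row-partitioned multi-thread path can be taken;
    # the op attribute min_row_size_to_use_multithread is fixed to 1000 by
    # the Python wrapper in [PATCH 16].
    optimizer = fluid.optimizer.Adam(learning_rate=0.001, lazy_mode=False)
    optimizer.minimize(loss)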