From 9fe938cb2aefcbced1e60fa459c943fa2ea245e6 Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 03:48:26 +0000 Subject: [PATCH 1/8] Changing network configuration, avoid nan --- .../fluid/tests/book/test_label_semantic_roles.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index c0a6df831a..5fc64ea958 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -77,7 +77,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, emb_layers.append(mark_embedding) hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') for emb in emb_layers ] hidden_0 = fluid.layers.sums(input=hidden_0_layers) @@ -94,8 +94,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, for i in range(1, depth): mix_hidden = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=hidden_dim), - fluid.layers.fc(input=input_tmp[1], size=hidden_dim) + fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') ]) lstm = fluid.layers.dynamic_lstm( @@ -109,8 +109,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, input_tmp = [mix_hidden, lstm] feature_out = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=label_dict_len), - fluid.layers.fc(input=input_tmp[1], size=label_dict_len) + fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') ]) return feature_out @@ -171,7 +171,7 @@ def train(use_cuda, save_dirname=None, is_local=True): # check other optimizers and check why out will be NAN sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.exponential_decay( - learning_rate=0.0001, + learning_rate=0.01, decay_steps=100000, decay_rate=0.5, staircase=True)) From d9a52223852a92d532ff2522cb648758511abe26 Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 04:57:30 +0000 Subject: [PATCH 2/8] code style --- .../tests/book/test_label_semantic_roles.py | 67 ++++++++++--------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index 5fc64ea958..4f5d30ac00 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -70,14 +70,15 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, fluid.layers.embedding( size=[word_dict_len, word_dim], input=x, - param_attr=fluid.ParamAttr( - name=embedding_name, trainable=False)) for x in word_input + param_attr=fluid.ParamAttr(name=embedding_name, trainable=False)) + for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') for emb in emb_layers + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') + for emb in emb_layers ] hidden_0 = fluid.layers.sums(input=hidden_0_layers) @@ -163,8 +164,7 @@ def train(use_cuda, save_dirname=None, is_local=True): crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - 
param_attr=fluid.ParamAttr( - name='crfw', learning_rate=mix_hidden_lr)) + param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(crf_cost) # TODO(qiao) @@ -189,8 +189,7 @@ def train(use_cuda, save_dirname=None, is_local=True): num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), + paddle.reader.shuffle(paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() @@ -223,24 +222,25 @@ def train(use_cuda, save_dirname=None, is_local=True): exe) if batch_id % 10 == 0: - print("avg_cost:" + str(cost) + " precision:" + str( - precision) + " recall:" + str(recall) + " f1_score:" + - str(f1_score) + " pass_precision:" + str( - pass_precision) + " pass_recall:" + str( - pass_recall) + " pass_f1_score:" + str( - pass_f1_score)) + print( + "avg_cost:" + str(cost) + " precision:" + + str(precision) + " recall:" + str(recall) + + " f1_score:" + str(f1_score) + " pass_precision:" + str( + pass_precision) + " pass_recall:" + str(pass_recall) + + " pass_f1_score:" + str(pass_f1_score)) if batch_id != 0: - print("second per batch: " + str((time.time( - ) - start_time) / batch_id)) + print("second per batch: " + str( + (time.time() - start_time) / batch_id)) # Set the threshold low to speed up the CI test if float(pass_precision) > 0.05: if save_dirname is not None: # TODO(liuyiqun): Change the target to crf_decode - fluid.io.save_inference_model(save_dirname, [ - 'word_data', 'verb_data', 'ctx_n2_data', - 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', - 'ctx_p2_data', 'mark_data' - ], [feature_out], exe) + fluid.io.save_inference_model( + save_dirname, [ + 'word_data', 'verb_data', 'ctx_n2_data', + 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', + 'ctx_p2_data', 'mark_data' + ], [feature_out], exe) return batch_id = batch_id + 1 @@ -320,19 +320,20 @@ def infer(use_cuda, save_dirname=None): assert feed_target_names[6] == 'ctx_p2_data' assert feed_target_names[7] == 'mark_data' - results = exe.run(inference_program, - feed={ - feed_target_names[0]: word, - feed_target_names[1]: pred, - feed_target_names[2]: ctx_n2, - feed_target_names[3]: ctx_n1, - feed_target_names[4]: ctx_0, - feed_target_names[5]: ctx_p1, - feed_target_names[6]: ctx_p2, - feed_target_names[7]: mark - }, - fetch_list=fetch_targets, - return_numpy=False) + results = exe.run( + inference_program, + feed={ + feed_target_names[0]: word, + feed_target_names[1]: pred, + feed_target_names[2]: ctx_n2, + feed_target_names[3]: ctx_n1, + feed_target_names[4]: ctx_0, + feed_target_names[5]: ctx_p1, + feed_target_names[6]: ctx_p2, + feed_target_names[7]: mark + }, + fetch_list=fetch_targets, + return_numpy=False) print(results[0].lod()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) From 7c1434dd73d367932e98ae569093183d33b7e5fb Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 07:36:15 +0000 Subject: [PATCH 3/8] code style --- .../tests/book/test_label_semantic_roles.py | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index 4f5d30ac00..ace2e39ba4 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -70,8 +70,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, 
ctx_0, ctx_p1, ctx_p2, mark, fluid.layers.embedding( size=[word_dict_len, word_dim], input=x, - param_attr=fluid.ParamAttr(name=embedding_name, trainable=False)) - for x in word_input + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) @@ -164,7 +164,8 @@ def train(use_cuda, save_dirname=None, is_local=True): crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr)) + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(crf_cost) # TODO(qiao) @@ -189,7 +190,8 @@ def train(use_cuda, save_dirname=None, is_local=True): num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) train_data = paddle.batch( - paddle.reader.shuffle(paddle.dataset.conll05.test(), buf_size=8192), + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() @@ -222,25 +224,24 @@ def train(use_cuda, save_dirname=None, is_local=True): exe) if batch_id % 10 == 0: - print( - "avg_cost:" + str(cost) + " precision:" + - str(precision) + " recall:" + str(recall) + - " f1_score:" + str(f1_score) + " pass_precision:" + str( - pass_precision) + " pass_recall:" + str(pass_recall) - + " pass_f1_score:" + str(pass_f1_score)) + print("avg_cost:" + str(cost) + " precision:" + str( + precision) + " recall:" + str(recall) + " f1_score:" + + str(f1_score) + " pass_precision:" + str( + pass_precision) + " pass_recall:" + str( + pass_recall) + " pass_f1_score:" + str( + pass_f1_score)) if batch_id != 0: - print("second per batch: " + str( - (time.time() - start_time) / batch_id)) + print("second per batch: " + str((time.time( + ) - start_time) / batch_id)) # Set the threshold low to speed up the CI test if float(pass_precision) > 0.05: if save_dirname is not None: # TODO(liuyiqun): Change the target to crf_decode - fluid.io.save_inference_model( - save_dirname, [ - 'word_data', 'verb_data', 'ctx_n2_data', - 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', - 'ctx_p2_data', 'mark_data' - ], [feature_out], exe) + fluid.io.save_inference_model(save_dirname, [ + 'word_data', 'verb_data', 'ctx_n2_data', + 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', + 'ctx_p2_data', 'mark_data' + ], [feature_out], exe) return batch_id = batch_id + 1 @@ -320,20 +321,19 @@ def infer(use_cuda, save_dirname=None): assert feed_target_names[6] == 'ctx_p2_data' assert feed_target_names[7] == 'mark_data' - results = exe.run( - inference_program, - feed={ - feed_target_names[0]: word, - feed_target_names[1]: pred, - feed_target_names[2]: ctx_n2, - feed_target_names[3]: ctx_n1, - feed_target_names[4]: ctx_0, - feed_target_names[5]: ctx_p1, - feed_target_names[6]: ctx_p2, - feed_target_names[7]: mark - }, - fetch_list=fetch_targets, - return_numpy=False) + results = exe.run(inference_program, + feed={ + feed_target_names[0]: word, + feed_target_names[1]: pred, + feed_target_names[2]: ctx_n2, + feed_target_names[3]: ctx_n1, + feed_target_names[4]: ctx_0, + feed_target_names[5]: ctx_p1, + feed_target_names[6]: ctx_p2, + feed_target_names[7]: mark + }, + fetch_list=fetch_targets, + return_numpy=False) print(results[0].lod()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) From ad6ddf533cfb1542283f741cddb78835fb3b8658 Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 09:23:11 +0000 Subject: [PATCH 
4/8] for ci --- python/paddle/fluid/tests/book/test_label_semantic_roles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index ace2e39ba4..4d8bca4d24 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -37,7 +37,7 @@ depth = 8 mix_hidden_lr = 1e-3 IS_SPARSE = True -PASS_NUM = 10 +PASS_NUM = 100 BATCH_SIZE = 10 embedding_name = 'emb' @@ -234,7 +234,7 @@ def train(use_cuda, save_dirname=None, is_local=True): print("second per batch: " + str((time.time( ) - start_time) / batch_id)) # Set the threshold low to speed up the CI test - if float(pass_precision) > 0.05: + if float(pass_precision) > 0.01: if save_dirname is not None: # TODO(liuyiqun): Change the target to crf_decode fluid.io.save_inference_model(save_dirname, [ From 1204d9f3d1b76de8d3fce594634134bcfb653c8e Mon Sep 17 00:00:00 2001 From: Dang Qingqing Date: Thu, 12 Apr 2018 13:12:05 +0800 Subject: [PATCH 5/8] Refine batch_norm_op. --- paddle/fluid/operators/batch_norm_op.cu.cc | 27 ++++++++++++---------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc index eecb58e11e..cb1927bc0f 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -114,23 +114,11 @@ class BatchNormKernel const auto *bias = ctx.Input("Bias"); auto *y = ctx.Output("Y"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); // alloc memory y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data>(ctx.GetPlace()); - variance_out->mutable_data>(ctx.GetPlace()); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); - math::SetConstant> - functor; - functor(dev_ctx, saved_mean, static_cast>(0)); - functor(dev_ctx, saved_variance, static_cast>(0)); auto handle = dev_ctx.cudnn_handle(); @@ -159,6 +147,21 @@ class BatchNormKernel // Run training mode. // obtain running mean and running inv var, and see if we need to // initialize them. + + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + mean_out->mutable_data>(ctx.GetPlace()); + variance_out->mutable_data>(ctx.GetPlace()); + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + math::SetConstant> + functor; + functor(dev_ctx, saved_mean, static_cast>(0)); + functor(dev_ctx, saved_variance, static_cast>(0)); + double this_factor = 1. 
- momentum; CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( From 617e790a596ccd3f2eb940fcfe76803c01ee6cc8 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 12 Apr 2018 11:48:17 -0700 Subject: [PATCH 6/8] fix cuda 7.5 compile error (#9885) --- paddle/fluid/operators/math/math_function.cu | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index e53183603f..c28047e6e9 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -288,9 +288,14 @@ void batched_gemm( // TODO(kexinzhao): add processing code for compute capability < 53 case PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, "cublas Hgemm requires GPU compute capability >= 53"); + +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> @@ -310,9 +315,13 @@ void batched_gemm( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int strideC = M * N; +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> @@ -332,9 +341,13 @@ void batched_gemm( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int strideC = M * N; +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> From 59234b7287980ef0fec0a064f524e6c25697b7c7 Mon Sep 17 00:00:00 2001 From: redrayqll Date: Fri, 13 Apr 2018 03:25:44 +0800 Subject: [PATCH 7/8] =?UTF-8?q?modify=20=E2=80=9Cif-then-else=E2=80=9D=20m?= =?UTF-8?q?d=20path=20(#9876)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/fluid/design/motivation/fluid.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md index 5e147f8263..4b7696cc1b 100644 --- a/doc/fluid/design/motivation/fluid.md +++ b/doc/fluid/design/motivation/fluid.md @@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop. -We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid. +We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid. 
## Turing Completeness From c241959e489053259274edb2614381d7058463a4 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Thu, 12 Apr 2018 16:45:40 -0700 Subject: [PATCH 8/8] Fix CPPLint errors in operators (#9828) * Fix CPPLint errors in operators * Fix prior box op * Fix Prior Box op * Fix top_k_op.cu * Fix pool mkmldnn * Fix pool mkmldnn --- paddle/fluid/operators/pad_op.h | 2 + paddle/fluid/operators/pool_mkldnn_op.cc | 12 ++- paddle/fluid/operators/pool_op.h | 2 + paddle/fluid/operators/pool_with_index_op.h | 1 + paddle/fluid/operators/prelu_op.cc | 1 - paddle/fluid/operators/prior_box_op.cc | 2 +- paddle/fluid/operators/prior_box_op.cu | 2 +- paddle/fluid/operators/prior_box_op.h | 18 +++-- paddle/fluid/operators/rank_loss_op.cc | 1 + paddle/fluid/operators/recv_op.cc | 2 +- paddle/fluid/operators/roi_pool_op.h | 2 + paddle/fluid/operators/strided_memcpy.h | 4 +- paddle/fluid/operators/top_k_op.cu | 83 +++++++++++---------- 13 files changed, 73 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h index a36abe3789..c93c096575 100644 --- a/paddle/fluid/operators/pad_op.h +++ b/paddle/fluid/operators/pad_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index c88578570c..63eaaedcd5 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); + mkldnn::memory({src_md, mkldnn_engine}, + static_cast(const_cast(input_data))); auto dst_memory = - mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data); + mkldnn::memory({dst_md, mkldnn_engine}, + static_cast(const_cast(output_data))); auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory, *workspace_memory); @@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { pool_bwd_desc, mkldnn_engine, *pool_pd); auto diff_src_memory = - mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data); + mkldnn::memory({diff_src_md, mkldnn_engine}, + static_cast(const_cast(in_x_grad_data))); auto diff_dst_memory = - mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data); + mkldnn::memory({diff_dst_md, mkldnn_engine}, + static_cast(const_cast(out_grad_data))); auto bwd_prim = mkldnn::pooling_backward( pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 2fec50ef25..a48127ea69 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h index 83e7bd138a..b55fa76eae 100644 --- a/paddle/fluid/operators/pool_with_index_op.h +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 7fb45bd19d..8eaa12a4a6 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/prelu_op.h" - #include namespace paddle { diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc index 82e54139c8..058b13eeb8 100644 --- a/paddle/fluid/operators/prior_box_op.cc +++ b/paddle/fluid/operators/prior_box_op.cc @@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel { bool flip = ctx->Attrs().Get("flip"); std::vector aspect_ratios_vec; - ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec); + ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec); size_t num_priors = aspect_ratios_vec.size() * min_sizes.size(); if (max_sizes.size() > 0) { diff --git a/paddle/fluid/operators/prior_box_op.cu b/paddle/fluid/operators/prior_box_op.cu index 76bf2b3b7d..0ea8909296 100644 --- a/paddle/fluid/operators/prior_box_op.cu +++ b/paddle/fluid/operators/prior_box_op.cu @@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel { auto clip = ctx.Attr("clip"); std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); T step_w = static_cast(ctx.Attr("step_w")); T step_h = static_cast(ctx.Attr("step_h")); diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h index 1e4a12aac1..1c62fd8d2c 100644 --- a/paddle/fluid/operators/prior_box_op.h +++ b/paddle/fluid/operators/prior_box_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/transform.h" @@ -22,23 +24,23 @@ namespace operators { inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, bool flip, - std::vector& output_aspect_ratior) { + std::vector* output_aspect_ratior) { constexpr float epsilon = 1e-6; - output_aspect_ratior.clear(); - output_aspect_ratior.push_back(1.0f); + output_aspect_ratior->clear(); + output_aspect_ratior->push_back(1.0f); for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { float ar = input_aspect_ratior[i]; bool already_exist = false; - for (size_t j = 0; j < output_aspect_ratior.size(); ++j) { - if (fabs(ar - output_aspect_ratior[j]) < epsilon) { + for (size_t j = 0; j < output_aspect_ratior->size(); ++j) { + if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) { already_exist = true; break; } } if (!already_exist) { - output_aspect_ratior.push_back(ar); + output_aspect_ratior->push_back(ar); if (flip) { - output_aspect_ratior.push_back(1.0f / ar); + output_aspect_ratior->push_back(1.0f / ar); } } } @@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel { auto clip = ctx.Attr("clip"); std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); T step_w = static_cast(ctx.Attr("step_w")); T step_h = static_cast(ctx.Attr("step_h")); diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 767eef5686..a1127f11a7 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/rank_loss_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 083c1fae5e..a4dcf704a6 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include // NOLINT #include #include "paddle/fluid/framework/data_type.h" @@ -19,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include #include "paddle/fluid/operators/detail/grpc_client.h" namespace paddle { diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h index f38c5a3c0c..54e0749031 100644 --- a/paddle/fluid/operators/roi_pool_op.h +++ b/paddle/fluid/operators/roi_pool_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index 22c1db82e9..7a10218e15 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& src_stride, const framework::DDim& dst_dim, const framework::DDim& dst_stride, T* dst) { - using namespace detail; - StridedCopyDimVisitor func(dev_ctx, src, src_stride, dst_stride, dst); + paddle::operators::detail::StridedCopyDimVisitor func( + dev_ctx, src, src_stride, dst_stride, dst); boost::apply_visitor(func, dst_dim); } diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index bfd26c2f22..d7f4d383ce 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/assert.h" namespace paddle { @@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, } template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, int beam_size, const T* src, - bool& firstStep, bool& is_empty, - Pair& max, int dim, + bool* firstStep, bool* is_empty, + Pair* max, int dim, const int tid) { - if (beam > 0) { - int length = beam < beam_size ? beam : beam_size; - if (firstStep) { - firstStep = false; + if (*beam > 0) { + int length = (*beam) < beam_size ? *beam : beam_size; + if (*firstStep) { + *firstStep = false; GetTopK(topk, src, tid, dim, length); } else { for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - beam) { - topk[k] = topk[k + beam]; + if (k < MaxLength - (*beam)) { + topk[k] = topk[k + *beam]; } else { topk[k].set(-INFINITY, -1); } } - if (!is_empty) { - GetTopK(topk + MaxLength - beam, src, tid, dim, max, + if (!(*is_empty)) { + GetTopK(topk + MaxLength - *beam, src, tid, dim, *max, length); } } - max = topk[MaxLength - 1]; - if (max.v == -1) is_empty = true; - beam = 0; + *max = topk[MaxLength - 1]; + if ((*max).v == -1) *is_empty = true; + *beam = 0; } } template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, int beam_size, const T* val, - int* col, bool& firstStep, - bool& is_empty, Pair& max, + int* col, bool* firstStep, + bool* is_empty, Pair* max, int dim, const int tid) { - if (beam > 0) { - int length = beam < beam_size ? beam : beam_size; - if (firstStep) { - firstStep = false; + if (*beam > 0) { + int length = (*beam) < beam_size ? 
*beam : beam_size; + if (*firstStep) { + *firstStep = false; GetTopK(topk, val, col, tid, dim, length); } else { for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - beam) { - topk[k] = topk[k + beam]; + if (k < MaxLength - *beam) { + topk[k] = topk[k + *beam]; } else { topk[k].set(-INFINITY, -1); } } - if (!is_empty) { - GetTopK(topk + MaxLength - beam, val, col, tid, dim, max, + if (!(*is_empty)) { + GetTopK(topk + MaxLength - *beam, val, col, tid, dim, max, length); } } - max = topk[MaxLength - 1]; - if (max.v == -1) is_empty = true; - beam = 0; + *max = topk[MaxLength - 1]; + if ((*max).v == -1) *is_empty = true; + *beam = 0; } } template __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, Pair topk[], T** topVal, - int64_t** topIds, int& beam, int& k, + int64_t** topIds, int* beam, int* k, const int tid, const int warp) { while (true) { __syncthreads(); @@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, (*topVal)++; (*topIds)++; } - if (tid == maxid[0]) beam++; - if (--k == 0) break; + if (tid == maxid[0]) (*beam)++; + if (--(*k) == 0) break; __syncthreads(); if (tid == maxid[0]) { - if (beam < MaxLength) { - sh_topk[tid] = topk[beam]; + if (*beam < MaxLength) { + sh_topk[tid] = topk[*beam]; } } if (maxid[0] / 32 == warp) { - if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break; + if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break; } } } @@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, topk[k].set(-INFINITY, -1); } while (k) { - ThreadGetTopK(topk, beam, k, - src + blockIdx.x * lds, firststep, - is_empty, max, dim, tid); + ThreadGetTopK(topk, &beam, k, + src + blockIdx.x * lds, &firststep, + &is_empty, &max, dim, tid); sh_topk[tid] = topk[0]; BlockReduce(sh_topk, maxid, topk, &output, - &indices, beam, k, tid, warp); + &indices, &beam, &k, tid, warp); } } @@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel { KeMatrixTopK<<< grid, threads, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(output_data, output->dims()[1], - indices_data, input_data, - input_width, input_width, int(k)); + .stream()>>>( + output_data, output->dims()[1], indices_data, input_data, input_width, + input_width, static_cast(k)); } };
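
Two recurring C++ patterns in this series are worth spelling out. The first is patch 8's cpplint `runtime/references` fix: an output parameter becomes a pointer rather than a non-const reference, so every call site carries an explicit `&` and the mutation is visible where it happens. Below is a standalone sketch of the refactored `ExpandAspectRatios` from prior_box_op.h; the `main` and the sample ratios are added here for illustration and are not part of the patch.

#include <cmath>
#include <cstdio>
#include <vector>

// Post-patch shape: the output parameter is a pointer, so callers must
// write &ratios explicitly and cannot pass a temporary by accident.
void ExpandAspectRatios(const std::vector<float>& input_aspect_ratios,
                        bool flip,
                        std::vector<float>* output_aspect_ratios) {
  constexpr float epsilon = 1e-6;
  output_aspect_ratios->clear();
  output_aspect_ratios->push_back(1.0f);  // the 1:1 ratio is always kept
  for (size_t i = 0; i < input_aspect_ratios.size(); ++i) {
    float ar = input_aspect_ratios[i];
    bool already_exists = false;
    // Skip ratios that are already present (within epsilon).
    for (size_t j = 0; j < output_aspect_ratios->size(); ++j) {
      if (std::fabs(ar - output_aspect_ratios->at(j)) < epsilon) {
        already_exists = true;
        break;
      }
    }
    if (!already_exists) {
      output_aspect_ratios->push_back(ar);
      if (flip) {
        output_aspect_ratios->push_back(1.0f / ar);  // add the flipped ratio
      }
    }
  }
}

int main() {
  std::vector<float> ratios;
  ExpandAspectRatios({2.0f, 3.0f}, /*flip=*/true, &ratios);
  for (float r : ratios) std::printf("%g ", r);  // prints: 1 2 0.5 3 0.333333
  std::printf("\n");
  return 0;
}

The same rule drives the `ThreadGetTopK` and `BlockReduce` signature changes in top_k_op.cu: `int& beam` becomes `int* beam`, and every read or write inside the kernel dereferences the pointer.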
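
The second pattern is patch 6's version guard: the `cublas{H,S,D}gemmStridedBatched` routines first shipped with CUDA 8.0, so each call is wrapped in `#if CUDA_VERSION >= 8000` and the `#else` branch degrades to a run-time error instead of a build break on CUDA 7.5. What follows is a minimal sketch of the guard outside Paddle; the function name is illustrative, and a plain exception stands in for the `PADDLE_ENFORCE(false, ...)` used in the patch.

#include <cstdio>
#include <stdexcept>

// cuda.h defines CUDA_VERSION as major*1000 + minor*10 (8000 for CUDA 8.0).
// Guarding the call rather than the whole function keeps a single entry
// point that either works or reports exactly why it cannot.
void BatchedHalfGemm(/* handle, transposes, shapes, buffers... */) {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 8000
  // The real call from the patch would go here:
  // cublasHgemmStridedBatched(handle, ...);
#else
  throw std::runtime_error(
      "HgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}

int main() {
  try {
    BatchedHalfGemm();
  } catch (const std::exception& e) {
    std::fprintf(stderr, "%s\n", e.what());  // reached on CUDA < 8.0 builds
  }
  return 0;
}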