diff --git a/CMakeLists.txt b/CMakeLists.txt
index b35290e12f..4117f07721 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,6 +61,7 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
 option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
 option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
 option(WITH_CONTRIB "Compile the third-party contribution" OFF)
+option(WITH_ANAKIN "Compile with Anakin library" OFF)
 option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
 
 # CMAKE_BUILD_TYPE
@@ -193,7 +194,10 @@ set(EXTERNAL_LIBS
 if(WITH_GPU)
     include(cuda)
     include(tensorrt)
-endif(WITH_GPU)
+    include(external/anakin)
+else()
+    set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
+endif()
 
 if(WITH_AMD_GPU)
     find_package(HIP)
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index a3b81d8205..12edcde148 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -188,7 +188,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
         print_train_time(start_time, time.time(), num_samples)
         print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
         # evaluation
-        if not args.no_test and batch_acc:
+        if not args.no_test and batch_acc and not args.use_reader_op:
             pass_test_acc = test(exe, infer_prog, test_reader, feeder, batch_acc)
             print(", Test Accuracy: %f" % pass_test_acc)
 
@@ -285,11 +285,12 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
             batch_id += 1
 
         print_train_time(start_time, time.time(), num_samples)
-        if not args.no_test and batch_acc:
+        if not args.no_test and batch_acc and not args.use_reader_op:
+            # We have not implemented RecordIO for the test data,
+            # so skip testing when args.use_reader_op is set.
            test_acc = test(startup_exe, infer_prog, test_reader, feeder, batch_acc)
             print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
-    exit(0)
 
 
 def print_arguments(args):
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
index 2ee2b5be09..9ed1093c54 100644
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -199,7 +199,10 @@ def get_model(args):
     batched_train_reader = paddle.batch(
         paddle.reader.shuffle(
             train_reader, buf_size=5120),
-        batch_size=args.batch_size * args.gpus)
-    batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus,
+        drop_last=True)
+    batched_test_reader = paddle.batch(
+        train_reader, batch_size=args.batch_size, drop_last=True)
 
-    return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc
+    return avg_cost, inference_program, optimizer, batched_train_reader,\
+           batched_test_reader, batch_acc
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 89d7a62fe9..6a8b15a6b6 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -118,6 +118,10 @@ endif()
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
 
+if(WITH_DISTRIBUTE)
+    add_definitions(-DPADDLE_WITH_DISTRIBUTE)
+endif()
+
 if(WITH_GOLANG)
     # we need to symlink Paddle directory into GOPATH. If we
    # don't do it and we have code that depends on Paddle, go
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
new file mode 100644
index 0000000000..f1cd9c99eb
--- /dev/null
+++ b/cmake/external/anakin.cmake
@@ -0,0 +1,42 @@
+if (NOT WITH_ANAKIN)
+  return()
+endif()
+
+set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH
+  "Anakin install path." FORCE)
+set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files")
+set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
+
+set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp)
+
+set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
+
+# A helper function for Anakin. Currently, to use the library, one needs to
+# recursively include nearly all of its header directories.
+function(fetch_include_recursively root_dir)
+    if (IS_DIRECTORY ${root_dir})
+        include_directories(${root_dir})
+    endif()
+
+    file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
+    foreach(sub ${ALL_SUB})
+        if (IS_DIRECTORY ${root_dir}/${sub})
+            fetch_include_recursively(${root_dir}/${sub})
+        endif()
+    endforeach()
+endfunction()
+
+# Download the library.
+message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
+execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
+execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
+execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
+execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
+execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+
+if (WITH_ANAKIN)
+    message(STATUS "Anakin for inference is enabled")
+    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
+    fetch_include_recursively(${ANAKIN_INCLUDE})
+    link_directories(${ANAKIN_LIBRARY})
+endif()
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 8af2765f58..4a49a92f2b 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND})
         "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
         CACHE FILEPATH "openblas library." FORCE)
 
+    ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
+
     SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
     SET(OPENBLAS_COMMIT "v0.2.20")
diff --git a/doc/fluid/api/detection.rst b/doc/fluid/api/detection.rst
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh
index 0f05393555..27f2419c06 100755
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler > layers.rst
 
 for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
 do
diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst
index 3e956f8302..dd9d88b669 100644
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -59,21 +59,3 @@ get_inference_program
 ..
autofunction:: paddle.fluid.io.get_inference_program :noindex: -save_checkpoint ---------------- - -.. autofunction:: paddle.fluid.io.save_checkpoint - :noindex: - -load_checkpoint ---------------- - -.. autofunction:: paddle.fluid.io.load_checkpoint - :noindex: - -clean_checkpoint ----------------- - -.. autofunction:: paddle.fluid.io.clean_checkpoint - :noindex: - diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index f78e6db326..e5ced9c04c 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -181,12 +181,6 @@ Print .. autofunction:: paddle.fluid.layers.Print :noindex: -is_empty --------- - -.. autofunction:: paddle.fluid.layers.is_empty - :noindex: - device ====== @@ -261,19 +255,6 @@ double_buffer .. autofunction:: paddle.fluid.layers.double_buffer :noindex: -random_data_generator ---------------------- - -.. autofunction:: paddle.fluid.layers.random_data_generator - :noindex: - -Preprocessor ------------- - -.. autoclass:: paddle.fluid.layers.Preprocessor - :members: - :noindex: - nn == @@ -613,30 +594,6 @@ roi_pool .. autofunction:: paddle.fluid.layers.roi_pool :noindex: -dice_loss ---------- - -.. autofunction:: paddle.fluid.layers.dice_loss - :noindex: - -resize_bilinear ---------------- - -.. autofunction:: paddle.fluid.layers.resize_bilinear - :noindex: - -gather ------- - -.. autofunction:: paddle.fluid.layers.gather - :noindex: - -random_crop ------------ - -.. autofunction:: paddle.fluid.layers.random_crop - :noindex: - ops === @@ -784,12 +741,6 @@ sum .. autofunction:: paddle.fluid.layers.sum :noindex: -shape ------ - -.. autofunction:: paddle.fluid.layers.shape - :noindex: - sigmoid ------- @@ -1039,3 +990,93 @@ zeros .. autofunction:: paddle.fluid.layers.zeros :noindex: +detection +========= + +multi_box_head +-------------- + +.. autofunction:: paddle.fluid.layers.multi_box_head + :noindex: + +bipartite_match +--------------- + +.. autofunction:: paddle.fluid.layers.bipartite_match + :noindex: + +target_assign +------------- + +.. autofunction:: paddle.fluid.layers.target_assign + :noindex: + +detection_output +---------------- + +.. autofunction:: paddle.fluid.layers.detection_output + :noindex: + +ssd_loss +-------- + +.. autofunction:: paddle.fluid.layers.ssd_loss + :noindex: + +detection_map +------------- + +.. autofunction:: paddle.fluid.layers.detection_map + :noindex: + +iou_similarity +-------------- + +.. autofunction:: paddle.fluid.layers.iou_similarity + :noindex: + +box_coder +--------- + +.. autofunction:: paddle.fluid.layers.box_coder + :noindex: + +learning_rate_scheduler +======================= + +exponential_decay +----------------- + +.. autofunction:: paddle.fluid.layers.exponential_decay + :noindex: + +natural_exp_decay +----------------- + +.. autofunction:: paddle.fluid.layers.natural_exp_decay + :noindex: + +inverse_time_decay +------------------ + +.. autofunction:: paddle.fluid.layers.inverse_time_decay + :noindex: + +polynomial_decay +---------------- + +.. autofunction:: paddle.fluid.layers.polynomial_decay + :noindex: + +piecewise_decay +--------------- + +.. autofunction:: paddle.fluid.layers.piecewise_decay + :noindex: + +noam_decay +---------- + +.. autofunction:: paddle.fluid.layers.noam_decay + :noindex: + diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst index 6ad44bb690..79a0995fce 100644 --- a/doc/fluid/api/optimizer.rst +++ b/doc/fluid/api/optimizer.rst @@ -89,13 +89,6 @@ DecayedAdagradOptimizer :members: :noindex: -RMSPropOptimizer ----------------- - -.. 
autoclass:: paddle.fluid.optimizer.RMSPropOptimizer - :members: - :noindex: - Adadelta -------- diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst index 39fda65863..74d102dcb0 100644 --- a/doc/fluid/api/profiler.rst +++ b/doc/fluid/api/profiler.rst @@ -23,15 +23,3 @@ profiler .. autofunction:: paddle.fluid.profiler.profiler :noindex: -start_profiler --------------- - -.. autofunction:: paddle.fluid.profiler.start_profiler - :noindex: - -stop_profiler -------------- - -.. autofunction:: paddle.fluid.profiler.stop_profiler - :noindex: - diff --git a/doc/survey/dynamic_graph.md b/doc/survey/dynamic_graph.md index 553a9dbe15..6b80b014b1 100644 --- a/doc/survey/dynamic_graph.md +++ b/doc/survey/dynamic_graph.md @@ -171,7 +171,7 @@ Pytorch chooses immediate evaluation. It avoids ever materializing a "forward gr ## What can fluid learn from them? -TBD +Please refer to `paddle/contrib/dynamic/`. # Appendix diff --git a/doc/v2/api/config/evaluators.rst b/doc/v2/api/config/evaluators.rst index 9ac972fb19..458d892e82 100644 --- a/doc/v2/api/config/evaluators.rst +++ b/doc/v2/api/config/evaluators.rst @@ -101,7 +101,7 @@ value_printer :noindex: Detection -===== +========== detection_map ------------- diff --git a/doc/v2/api/config/layer.rst b/doc/v2/api/config/layer.rst index 1a6496968c..5a0cfadfce 100644 --- a/doc/v2/api/config/layer.rst +++ b/doc/v2/api/config/layer.rst @@ -11,7 +11,7 @@ Data layer data ---- -.. autoclass:: paddle.v2.layer.data +.. autofunction:: paddle.v2.layer.data :noindex: Fully Connected Layers @@ -21,12 +21,12 @@ Fully Connected Layers fc -- -.. autoclass:: paddle.v2.layer.fc +.. autofunction:: paddle.v2.layer.fc :noindex: selective_fc ------------ -.. autoclass:: paddle.v2.layer.selective_fc +.. autofunction:: paddle.v2.layer.selective_fc :noindex: Conv Layers @@ -34,34 +34,34 @@ Conv Layers conv_operator ------------- -.. autoclass:: paddle.v2.layer.conv_operator +.. autofunction:: paddle.v2.layer.conv_operator :noindex: conv_projection --------------- -.. autoclass:: paddle.v2.layer.conv_projection +.. autofunction:: paddle.v2.layer.conv_projection :noindex: conv_shift ---------- -.. autoclass:: paddle.v2.layer.conv_shift +.. autofunction:: paddle.v2.layer.conv_shift :noindex: img_conv -------- -.. autoclass:: paddle.v2.layer.img_conv +.. autofunction:: paddle.v2.layer.img_conv :noindex: .. _api_v2.layer_context_projection: context_projection ------------------ -.. autoclass:: paddle.v2.layer.context_projection +.. autofunction:: paddle.v2.layer.context_projection :noindex: row_conv -------- -.. autoclass:: paddle.v2.layer.row_conv +.. autofunction:: paddle.v2.layer.row_conv :noindex: Image Pooling Layer @@ -69,27 +69,27 @@ Image Pooling Layer img_pool -------- -.. autoclass:: paddle.v2.layer.img_pool +.. autofunction:: paddle.v2.layer.img_pool :noindex: spp --- -.. autoclass:: paddle.v2.layer.spp +.. autofunction:: paddle.v2.layer.spp :noindex: maxout ------ -.. autoclass:: paddle.v2.layer.maxout +.. autofunction:: paddle.v2.layer.maxout :noindex: roi_pool -------- -.. autoclass:: paddle.v2.layer.roi_pool +.. autofunction:: paddle.v2.layer.roi_pool :noindex: pad ---- -.. autoclass:: paddle.v2.layer.pad +.. autofunction:: paddle.v2.layer.pad :noindex: Norm Layer @@ -97,27 +97,27 @@ Norm Layer img_cmrnorm ----------- -.. autoclass:: paddle.v2.layer.img_cmrnorm +.. autofunction:: paddle.v2.layer.img_cmrnorm :noindex: batch_norm ---------- -.. autoclass:: paddle.v2.layer.batch_norm +.. 
autofunction:: paddle.v2.layer.batch_norm :noindex: sum_to_one_norm --------------- -.. autoclass:: paddle.v2.layer.sum_to_one_norm +.. autofunction:: paddle.v2.layer.sum_to_one_norm :noindex: cross_channel_norm ------------------ -.. autoclass:: paddle.v2.layer.cross_channel_norm +.. autofunction:: paddle.v2.layer.cross_channel_norm :noindex: row_l2_norm ----------- -.. autoclass:: paddle.v2.layer.row_l2_norm +.. autofunction:: paddle.v2.layer.row_l2_norm :noindex: Recurrent Layers @@ -125,22 +125,22 @@ Recurrent Layers recurrent --------- -.. autoclass:: paddle.v2.layer.recurrent +.. autofunction:: paddle.v2.layer.recurrent :noindex: lstmemory --------- -.. autoclass:: paddle.v2.layer.lstmemory +.. autofunction:: paddle.v2.layer.lstmemory :noindex: grumemory --------- -.. autoclass:: paddle.v2.layer.grumemory +.. autofunction:: paddle.v2.layer.grumemory :noindex: gated_unit ----------- -.. autoclass:: paddle.v2.layer.gated_unit +.. autofunction:: paddle.v2.layer.gated_unit :noindex: Recurrent Layer Group @@ -148,32 +148,32 @@ Recurrent Layer Group memory ------ -.. autoclass:: paddle.v2.layer.memory +.. autofunction:: paddle.v2.layer.memory :noindex: recurrent_group --------------- -.. autoclass:: paddle.v2.layer.recurrent_group +.. autofunction:: paddle.v2.layer.recurrent_group :noindex: lstm_step --------- -.. autoclass:: paddle.v2.layer.lstm_step +.. autofunction:: paddle.v2.layer.lstm_step :noindex: gru_step -------- -.. autoclass:: paddle.v2.layer.gru_step +.. autofunction:: paddle.v2.layer.gru_step :noindex: beam_search ------------ -.. autoclass:: paddle.v2.layer.beam_search +.. autofunction:: paddle.v2.layer.beam_search :noindex: get_output ---------- -.. autoclass:: paddle.v2.layer.get_output +.. autofunction:: paddle.v2.layer.get_output :noindex: Mixed Layer @@ -183,54 +183,54 @@ Mixed Layer mixed ----- -.. autoclass:: paddle.v2.layer.mixed +.. autofunction:: paddle.v2.layer.mixed :noindex: .. _api_v2.layer_embedding: embedding --------- -.. autoclass:: paddle.v2.layer.embedding +.. autofunction:: paddle.v2.layer.embedding :noindex: scaling_projection ------------------ -.. autoclass:: paddle.v2.layer.scaling_projection +.. autofunction:: paddle.v2.layer.scaling_projection :noindex: dotmul_projection ----------------- -.. autoclass:: paddle.v2.layer.dotmul_projection +.. autofunction:: paddle.v2.layer.dotmul_projection :noindex: dotmul_operator --------------- -.. autoclass:: paddle.v2.layer.dotmul_operator +.. autofunction:: paddle.v2.layer.dotmul_operator :noindex: full_matrix_projection ---------------------- -.. autoclass:: paddle.v2.layer.full_matrix_projection +.. autofunction:: paddle.v2.layer.full_matrix_projection :noindex: identity_projection ------------------- -.. autoclass:: paddle.v2.layer.identity_projection +.. autofunction:: paddle.v2.layer.identity_projection :noindex: slice_projection ------------------- -.. autoclass:: paddle.v2.layer.slice_projection +.. autofunction:: paddle.v2.layer.slice_projection :noindex: table_projection ---------------- -.. autoclass:: paddle.v2.layer.table_projection +.. autofunction:: paddle.v2.layer.table_projection :noindex: trans_full_matrix_projection ---------------------------- -.. autoclass:: paddle.v2.layer.trans_full_matrix_projection +.. autofunction:: paddle.v2.layer.trans_full_matrix_projection :noindex: Aggregate Layers @@ -245,51 +245,46 @@ AggregateLevel pooling ------- -.. autoclass:: paddle.v2.layer.pooling +.. autofunction:: paddle.v2.layer.pooling :noindex: .. _api_v2.layer_last_seq: last_seq -------- -.. 
autoclass:: paddle.v2.layer.last_seq +.. autofunction:: paddle.v2.layer.last_seq :noindex: .. _api_v2.layer_first_seq: first_seq --------- -.. autoclass:: paddle.v2.layer.first_seq +.. autofunction:: paddle.v2.layer.first_seq :noindex: sub_seq --------- -.. autoclass:: paddle.v2.layer.sub_seq +.. autofunction:: paddle.v2.layer.sub_seq :noindex: concat ------ -.. autoclass:: paddle.v2.layer.concat +.. autofunction:: paddle.v2.layer.concat :noindex: seq_concat ---------- -.. autoclass:: paddle.v2.layer.seq_concat +.. autofunction:: paddle.v2.layer.seq_concat :noindex: seq_slice --------- -.. autoclass:: paddle.v2.layer.seq_slice - :noindex: - -kmax_sequence_score -------------------- -.. autoclass:: paddle.v2.layer.kmax_sequence_score +.. autofunction:: paddle.v2.layer.seq_slice :noindex: sub_nested_seq -------------- -.. autoclass:: paddle.v2.layer.sub_nested_seq +.. autofunction:: paddle.v2.layer.sub_nested_seq :noindex: Reshaping Layers @@ -297,7 +292,7 @@ Reshaping Layers block_expand ------------ -.. autoclass:: paddle.v2.layer.block_expand +.. autofunction:: paddle.v2.layer.block_expand :noindex: .. _api_v2.layer_expand: @@ -309,22 +304,22 @@ ExpandLevel expand ------ -.. autoclass:: paddle.v2.layer.expand +.. autofunction:: paddle.v2.layer.expand :noindex: repeat ------ -.. autoclass:: paddle.v2.layer.repeat +.. autofunction:: paddle.v2.layer.repeat :noindex: rotate ------ -.. autoclass:: paddle.v2.layer.rotate +.. autofunction:: paddle.v2.layer.rotate :noindex: seq_reshape ----------- -.. autoclass:: paddle.v2.layer.seq_reshape +.. autofunction:: paddle.v2.layer.seq_reshape :noindex: Math Layers @@ -332,94 +327,94 @@ Math Layers addto ----- -.. autoclass:: paddle.v2.layer.addto +.. autofunction:: paddle.v2.layer.addto :noindex: linear_comb ----------- -.. autoclass:: paddle.v2.layer.linear_comb +.. autofunction:: paddle.v2.layer.linear_comb :noindex: interpolation ------------- -.. autoclass:: paddle.v2.layer.interpolation +.. autofunction:: paddle.v2.layer.interpolation :noindex: bilinear_interp --------------- -.. autoclass:: paddle.v2.layer.bilinear_interp +.. autofunction:: paddle.v2.layer.bilinear_interp :noindex: dropout -------- -.. autoclass:: paddle.v2.layer.dropout +.. autofunction:: paddle.v2.layer.dropout :noindex: dot_prod --------- -.. autoclass:: paddle.v2.layer.dot_prod +.. autofunction:: paddle.v2.layer.dot_prod :noindex: out_prod -------- -.. autoclass:: paddle.v2.layer.out_prod +.. autofunction:: paddle.v2.layer.out_prod :noindex: power ----- -.. autoclass:: paddle.v2.layer.power +.. autofunction:: paddle.v2.layer.power :noindex: scaling ------- -.. autoclass:: paddle.v2.layer.scaling +.. autofunction:: paddle.v2.layer.scaling :noindex: clip ---- -.. autoclass:: paddle.v2.layer.clip +.. autofunction:: paddle.v2.layer.clip :noindex: resize ------ -.. autoclass:: paddle.v2.layer.resize +.. autofunction:: paddle.v2.layer.resize :noindex: slope_intercept --------------- -.. autoclass:: paddle.v2.layer.slope_intercept +.. autofunction:: paddle.v2.layer.slope_intercept :noindex: tensor ------ -.. autoclass:: paddle.v2.layer.tensor +.. autofunction:: paddle.v2.layer.tensor :noindex: .. _api_v2.layer_cos_sim: cos_sim ------- -.. autoclass:: paddle.v2.layer.cos_sim +.. autofunction:: paddle.v2.layer.cos_sim :noindex: l2_distance ----------- -.. autoclass:: paddle.v2.layer.l2_distance +.. autofunction:: paddle.v2.layer.l2_distance :noindex: trans ----- -.. autoclass:: paddle.v2.layer.trans +.. autofunction:: paddle.v2.layer.trans :noindex: scale_shift ----------- -.. 
autoclass:: paddle.v2.layer.scale_shift +.. autofunction:: paddle.v2.layer.scale_shift :noindex: factorization_machine --------------------- -.. autoclass:: paddle.v2.layer.factorization_machine +.. autofunction:: paddle.v2.layer.factorization_machine :noindex: Sampling Layers @@ -427,17 +422,17 @@ Sampling Layers maxid ----- -.. autoclass:: paddle.v2.layer.max_id +.. autofunction:: paddle.v2.layer.max_id :noindex: sampling_id ----------- -.. autoclass:: paddle.v2.layer.sampling_id +.. autofunction:: paddle.v2.layer.sampling_id :noindex: multiplex --------- -.. autoclass:: paddle.v2.layer.multiplex +.. autofunction:: paddle.v2.layer.multiplex :noindex: .. _api_v2.layer_costs: @@ -447,97 +442,97 @@ Cost Layers cross_entropy_cost ------------------ -.. autoclass:: paddle.v2.layer.cross_entropy_cost +.. autofunction:: paddle.v2.layer.cross_entropy_cost :noindex: cross_entropy_with_selfnorm_cost -------------------------------- -.. autoclass:: paddle.v2.layer.cross_entropy_with_selfnorm_cost +.. autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost :noindex: multi_binary_label_cross_entropy_cost ------------------------------------- -.. autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost +.. autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost :noindex: classification_cost ------------------- -.. autoclass:: paddle.v2.layer.classification_cost +.. autofunction:: paddle.v2.layer.classification_cost :noindex: huber_regression_cost ------------------------- -.. autoclass:: paddle.v2.layer.huber_regression_cost +.. autofunction:: paddle.v2.layer.huber_regression_cost :noindex: huber_classification_cost ------------------------- -.. autoclass:: paddle.v2.layer.huber_classification_cost +.. autofunction:: paddle.v2.layer.huber_classification_cost :noindex: lambda_cost ----------- -.. autoclass:: paddle.v2.layer.lambda_cost +.. autofunction:: paddle.v2.layer.lambda_cost :noindex: square_error_cost ----------------- -.. autoclass:: paddle.v2.layer.square_error_cost +.. autofunction:: paddle.v2.layer.square_error_cost :noindex: rank_cost --------- -.. autoclass:: paddle.v2.layer.rank_cost +.. autofunction:: paddle.v2.layer.rank_cost :noindex: sum_cost --------- -.. autoclass:: paddle.v2.layer.sum_cost +.. autofunction:: paddle.v2.layer.sum_cost :noindex: crf --- -.. autoclass:: paddle.v2.layer.crf +.. autofunction:: paddle.v2.layer.crf :noindex: crf_decoding ------------ -.. autoclass:: paddle.v2.layer.crf_decoding +.. autofunction:: paddle.v2.layer.crf_decoding :noindex: ctc --- -.. autoclass:: paddle.v2.layer.ctc +.. autofunction:: paddle.v2.layer.ctc :noindex: warp_ctc -------- -.. autoclass:: paddle.v2.layer.warp_ctc +.. autofunction:: paddle.v2.layer.warp_ctc :noindex: nce --- -.. autoclass:: paddle.v2.layer.nce +.. autofunction:: paddle.v2.layer.nce :noindex: hsigmoid --------- -.. autoclass:: paddle.v2.layer.hsigmoid +.. autofunction:: paddle.v2.layer.hsigmoid :noindex: smooth_l1_cost -------------- -.. autoclass:: paddle.v2.layer.smooth_l1_cost +.. autofunction:: paddle.v2.layer.smooth_l1_cost :noindex: multibox_loss -------------- -.. autoclass:: paddle.v2.layer.multibox_loss +.. autofunction:: paddle.v2.layer.multibox_loss :noindex: detection_output ---------------- -.. autoclass:: paddle.v2.layer.detection_output +.. autofunction:: paddle.v2.layer.detection_output :noindex: Check Layer @@ -545,7 +540,7 @@ Check Layer eos --- -.. autoclass:: paddle.v2.layer.eos +.. 
autofunction:: paddle.v2.layer.eos :noindex: Activation @@ -553,5 +548,5 @@ Activation prelu -------- -.. autoclass:: paddle.v2.layer.prelu +.. autofunction:: paddle.v2.layer.prelu :noindex: diff --git a/doc/v2/api/index_en.rst b/doc/v2/api/index_en.rst index b11cd449af..70c5c524aa 100644 --- a/doc/v2/api/index_en.rst +++ b/doc/v2/api/index_en.rst @@ -8,4 +8,3 @@ API model_configs.rst data.rst run_logic.rst - fluid/index.rst diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst index 853bdb21bb..095da19cd4 100644 --- a/doc/v2/build_and_install/pip_install_cn.rst +++ b/doc/v2/build_and_install/pip_install_cn.rst @@ -60,6 +60,7 @@ paddlepaddle-gpu==0.11.0 使用CUDA 7.5和cuDNN 5编译的0.11.0版 "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" .. _pip_dependency: diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst index fecf6d3712..8406e4aa1f 100644 --- a/doc/v2/build_and_install/pip_install_en.rst +++ b/doc/v2/build_and_install/pip_install_en.rst @@ -63,6 +63,7 @@ If the links below shows up the login form, just click "Log in as guest" to star "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" .. 
_pip_dependency:
diff --git a/paddle/contrib/CMakeLists.txt b/paddle/contrib/CMakeLists.txt
index 4b19256ef4..70e3a0583d 100644
--- a/paddle/contrib/CMakeLists.txt
+++ b/paddle/contrib/CMakeLists.txt
@@ -14,3 +14,4 @@ #
 
 add_subdirectory(inference)
+add_subdirectory(tape)
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index f279020e93..277b0b175b 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -17,48 +17,9 @@ if(APPLE)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)
 
-set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files")
-set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
-
 set(inference_deps paddle_inference_api paddle_fluid_api)
 
-# if anakin is set enable anakin api implementation
-if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
-    set(ANAKIN_FOUND ON)
-else()
-    set(ANAKIN_FOUND OFF)
-endif()
-
-function(fetch_include_recursively root_dir)
-    if (IS_DIRECTORY ${root_dir})
-        include_directories(${root_dir})
-    endif()
-
-    file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
-    foreach(sub ${ALL_SUB})
-        if (IS_DIRECTORY ${root_dir}/${sub})
-            fetch_include_recursively(${root_dir}/${sub})
-        endif()
-    endforeach()
-endfunction()
-
-if (ANAKIN_FOUND)
-    # Anakin's code style doesn't follow google c style.
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
-
-    message(STATUS "Anakin for inference is enabled")
-    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
-    fetch_include_recursively(${ANAKIN_INCLUDE})
-
-    link_directories(${ANAKIN_LIBRARY})
-
-    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
-    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
-    list(APPEND inference_deps inference_anakin_api)
-endif()
-
-
 function(inference_api_test TARGET_NAME)
   if (WITH_TESTING)
     set(options "")
@@ -79,7 +40,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)
 
 cc_library(paddle_inference_api
-  SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
+  SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
   DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 
 cc_test(test_paddle_inference_api
@@ -89,9 +50,17 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_paddle_inference_api_impl
                    ARGS test_word2vec test_image_classification)
 
-if (ANAKIN_FOUND)
+if (WITH_ANAKIN)
+  # Anakin has no official library releases yet, and its protobuf and CUDA versions
+  # do not match Paddle's, so the Anakin library is not merged into our official
+  # inference library. To use the Anakin prediction API, build
+  # libinference_anakin_api.a and link it together with anakin.so.
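+  # As an illustrative invocation (assumed, not part of these build files):
+  # Anakin support is only valid together with GPU support -- the top-level
+  # CMakeLists.txt forces WITH_ANAKIN OFF otherwise -- so one might configure:
+  #   cmake -DWITH_GPU=ON -DWITH_ANAKIN=ON ..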
+  nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+  target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+  target_link_libraries(inference_anakin_api anakin anakin_saber_common)
   cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
-          DEPS ${inference_deps})
+          ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
+          DEPS inference_anakin_api)
+  target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
 endif()
 
 if(WITH_TESTING)
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
index ea7781f691..5bafc58fa5 100644
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include
-
 #include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
+#include
 
 namespace paddle {
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
index 181784cbdf..212ba41cdf 100644
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
@@ -19,10 +19,9 @@ limitations under the License. */
 
 #pragma once
 
-// NOTE This header file do not have namespace.
-//#include
 #include "paddle/contrib/inference/paddle_inference_api.h"
+// from anakin
 #include "framework/core/net/net.h"
 #include "saber/saber_types.h"
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
index 47b9c6fa28..1d41a5c73e 100644
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include
 #include
 #include
-#include "gflags/gflags.h"
 #include "paddle/contrib/inference/paddle_inference_api.h"
 
+DEFINE_string(model, "", "Directory of the inference model.");
+
 namespace paddle {
 
 AnakinConfig GetConfig() {
   AnakinConfig config;
-  config.model_file = "./mobilenet_v2.anakin.bin";
+  config.model_file = FLAGS_model;
   config.device = 0;
   config.max_batch_size = 1;
   return config;
diff --git a/paddle/contrib/tape/CMakeLists.txt b/paddle/contrib/tape/CMakeLists.txt
new file mode 100644
index 0000000000..0acef17d6a
--- /dev/null
+++ b/paddle/contrib/tape/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+if(APPLE)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
+endif(APPLE)
+
+cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES})
+cc_library(tape SRCS tape.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable)
+
+cc_test(test_tape
+  SRCS test_tape.cc
+  DEPS tape tape_variable)
diff --git a/paddle/contrib/tape/README.md b/paddle/contrib/tape/README.md
new file mode 100644
index 0000000000..16c22a45d5
--- /dev/null
+++ b/paddle/contrib/tape/README.md
@@ -0,0 +1,252 @@
+# Dynamic Graph on Fluid
+
+PaddlePaddle Fluid is targeting autodiff without a tape, which, however, is very
+challenging and we are still a long way from there. DyNet and PyTorch provide a
+good design idea, the *tape*, that significantly eases the challenge. Also, DyNet
+provides a C++ API that is as convenient as Python but with higher efficiency, and
+it can conveniently integrate with industrial/production systems. This package,
+`tape`, combines the best of
+
+1. the tape from PyTorch and DyNet
+2. the C++ API and core from DyNet
+3. the rich set of operators from PaddlePaddle
+
+## Overview
+
+We can implement a DyNet-like tape (see this [survey](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/survey/dynamic_graph.md))
+by wrapping Paddle Fluid's `Operator` and `Variable`.
+
+The user API is straightforward, since
+
+1. it is imperative and uses the host language's control-flow logic.
+1. it avoids extra concepts such as `Scope` and `Executor`.
+
+All of these benefits come at the cost of just adding one line, `reset_global_tape`,
+at every iteration.
+
+## Code Structure
+
+In short, the `Tape` contains a vector of `OpHandle`s, and an `OpHandle` contains
+its `type`, the pointers to the `Variable`s, and the necessary attributes.
+
+```c++
+class Variable {
+public:
+  VariableHandle Grad(); // returns its gradient variable
+private:
+  framework::VarDesc desc_; // compile time infershape, necessary for lazy execution
+  framework::Variable var_; // run time variable, holds data memory
+};
+
+using VariableHandle = shared_ptr<Variable>;
+
+struct OpHandle {
+  string type_;
+  map<string, vector<VariableHandle>> inputs_;
+  map<string, vector<VariableHandle>> outputs_;
+  AttributeMap attrs_;
+};
+
+class Tape {
+public:
+  void AddOp(OpHandle);  // add op
+  void Forward();        // execute the tape_
+  void Backward();       // execute the backward of the tape_
+private:
+  vector<OpHandle> tape_;
+};
+```
+
+We use `Function` to represent layers. A `Function` takes care of parameter
+initialization and of adding ops (`AddOp`) to the Tape when it is called.
+
+```c++
+class Linear {
+ public:
+  Linear(int in_dim, int out_dim, const std::string &act)
+      : w_(new Variable("LinearWeight")),
+        b_(new Variable("LinearBias")),
+        act_(act) {
+    Tape init_tape;
+
+    std::string initializer = "fill_constant";
+    framework::AttributeMap attrs;
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector{in_dim, out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
+
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector{out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
+
+    init_tape.Forward();
+  }
+
+  VariableHandle operator()(VariableHandle input) {
+    VariableHandle pre_bias(new Variable("linear"));
+    get_global_tape().AddOp("mul",
+                            {{"X", {input}}, {"Y", {w_}}},
+                            {{"Out", {pre_bias}}},
+                            {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
+    VariableHandle pre_act(new Variable("linear"));
+    get_global_tape().AddOp("elementwise_add",
+                            {{"X", {pre_bias}}, {"Y", {b_}}},
+                            {{"Out", {pre_act}}},
+                            {{"axis", 1}});
+    VariableHandle post_act(new Variable("linear"));
+    get_global_tape().AddOp(act_,
+                            {{"X", {pre_act}}},
+                            {{"Out", {post_act}}},
+                            {});
+    return post_act;
+  }
+
+  std::vector<VariableHandle> Params() { return {w_, b_}; }
+
+ private:
+  VariableHandle w_;
+  VariableHandle b_;
+  std::string act_;
+};
+```
+
+## User API
+
+```c++
+// Model function
+paddle::tape::Linear linear1(3, 3, "relu"); // init weight and bias
+paddle::tape::Linear linear2(3, 3, "relu"); // init weight and bias
+paddle::tape::Mean mean;
+
+// Optimizer
+paddle::tape::SGD sgd(0.001);
+
+// Data Feeder
+paddle::tape::Fill data_feeder(...);
+VariableHandle input(new paddle::tape::Variable("input"));
+VariableHandle label(new paddle::tape::Variable("label"));
+
+for (int i = 0; i < 2; ++i) {
+  reset_global_tape();
+
+  data_feeder(input, label);
+
+  auto loss = softmax(linear2(linear1(input)), label); // compile time InferShape & InferVarType
+  LOG(INFO) << loss.value(); // Run forward up to loss
+
+  // Run backward, store gradient of w at w->Grad()
+  get_global_tape().Backward(loss);
+
+  // Update w
+  sgd(linear1.Params());
+  sgd(linear2.Params());
+}
+```
+
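+The following is a rough sketch of how little code a new unary `Function`
+wrapper needs; it mirrors the `Mean` wrapper in `function.h`. The operator name
+`relu` is assumed to be registered in Fluid, and this class is illustrative
+rather than part of the package:
+
+```c++
+class Relu {
+ public:
+  VariableHandle operator()(VariableHandle x) {
+    VariableHandle out(new Variable("relu"));
+    // Record the op on the global tape; InferShape/InferVarType run at AddOp time.
+    get_global_tape().AddOp("relu", {{"X", {x}}}, {{"Out", {out}}}, {});
+    return out;
+  }
+};
+```
+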
+
+digraph G {
+
+  subgraph cluster_0 {
+    node [shape=record,style=filled];
+    style=filled;
+    color=lightgrey;
+    linear1 [label="{type: mul | {input | {X: before_mul1 | Y: weight1}} | {output | Out: before_bias1}}"];
+    elementwise_add1 [label="{type: elementwise_add | {input | {X: before_bias1 | Y: bias1}} | {output | Out: before_act1}}"];
+    relu1 [label="{type: relu | {input | {X: before_act1 }} | {output | Out: after_act1}}"];
+
+    linear1 -> elementwise_add1 -> relu1;
+    label = "forward tape";
+  }
+
+  linear1:before_mul1->before_mul1
+  linear1:weight1->weight1
+  linear1:before_bias1->before_bias1
+
+  elementwise_add1:bias1->bias1
+  elementwise_add1:before_bias1->before_bias1
+  elementwise_add1:before_act1->before_act1
+
+  relu1:before_act1->before_act1
+  relu1:after_act1->after_act1
+
+  subgraph cluster_1 {
+    node [shape=record,style=filled];
+    style=filled;
+    color=lightgrey;
+    linear1_grad [label="{type: mul_grad | {input | {X: before_mul1 | Y: weight1| Out_grad: before_bias1_grad}} | {output |{X_grad: before_mul1_grad | Y_grad: weight1_grad}}}"];
+
+    elementwise_add1_grad [label="{type: elementwise_add_grad | {input | Out_grad: before_act1_grad} | {output |{X_grad: before_bias1_grad | Y_grad: bias1_grad}}}"];
+
+    relu1_grad [label="{type: relu_grad | {input | Out_grad: after_act1_grad} | {output | {X_grad: before_act1_grad }}}"];
+
+    linear1_grad -> elementwise_add1_grad -> relu1_grad [dir=back];
+    label = "backward tape";
+  }
+
+  relu1_grad:after_act1_grad->after_act1_grad
+  relu1_grad:before_act1_grad->before_act1_grad
+
+  elementwise_add1_grad:before_act1_grad->before_act1_grad
+  elementwise_add1_grad:before_bias1_grad->before_bias1_grad
+  elementwise_add1_grad:bias1_grad->bias1_grad
+
+  linear1_grad:before_mul1->before_mul1
+  linear1_grad:weight1->weight1
+  linear1_grad:before_bias1_grad->before_bias1_grad
+  linear1_grad:before_mul1_grad->before_mul1_grad
+  linear1_grad:weight1_grad->weight1_grad
+
+  subgraph cluster_2 {
+    node [shape=record];
+    label = "Linear1";
+    weight1
+    bias1
+  }
+
+  weight1 -> weight1_grad [ label="Grad()", style="dashed" ];
+  bias1 -> bias1_grad [ label="Grad()", style="dashed"];
+
+}
+
+
+![Image](https://github.com/tonyyang-svail/Paddle/blob/cpp_tap/paddle/contrib/tape/computation_graph.png)
+
+## Code Reuse
+
+We want to stay as close to Paddle Fluid as possible.
+
+### Reuse All Operators
+
+As all Ops are registered at `OpInfoMap`, the effort of adding a new `Function`
+is about 10 lines of code, similar to exposing an operator to Python.
+
+### Reuse Compile Time InferShape and InferVarType
+
+Note that all the symbolic information is stored at `tape::Variable::desc_` instead
+of `ProgramDesc.block.vars`, so we create a temporary `BlockDesc` to do `InferShape`
+and `InferVarType` every time we `AddOp` to the tape.
+
+### Reuse Operator::Run
+
+We use smart pointers, instead of a `Scope`, to manage memory, so we create a
+temporary `Scope` for every `Operator::Run()`.
+
+## Possible Features
+
+### Release Memory on Backward
+
+We can release memory aggressively. During backward, we can delete an `OpHandle` once
+we have finished its backward pass. Since all the variables are managed by smart
+pointers, the memory is automatically released when the `ref_count` goes to 0.
+
+### Kernel Fusion
+
+As a symbolic representation of the tape is constructed before the actual
+execution, it is possible to perform graph optimizations on it. One use case is
+kernel fusion.
diff --git a/paddle/contrib/tape/computation_graph.png b/paddle/contrib/tape/computation_graph.png
new file mode 100644
index 0000000000..6cf5ead735
Binary files /dev/null and b/paddle/contrib/tape/computation_graph.png differ
diff --git a/paddle/contrib/tape/function.h b/paddle/contrib/tape/function.h
new file mode 100644
index 0000000000..8c9694d9a2
--- /dev/null
+++ b/paddle/contrib/tape/function.h
@@ -0,0 +1,131 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/contrib/tape/tape.h"
+#include "paddle/contrib/tape/variable.h"
+#include "paddle/fluid/framework/type_defs.h"
+
+namespace paddle {
+namespace tape {
+
+class Function {};
+
+class Fill {
+ public:
+  Fill(const std::string &initializer, const framework::AttributeMap &attrs)
+      : initializer_(initializer), attrs_(attrs) {}
+
+  void operator()(VariableHandle var) {
+    get_global_tape().AddOp(initializer_, {}, {{"Out", {var}}}, attrs_);
+  }
+
+ private:
+  const std::string initializer_;
+  const framework::AttributeMap attrs_;
+};
+
+class Mean {
+ public:
+  VariableHandle operator()(VariableHandle var) {
+    VariableHandle out(new Variable("mean"));
+    get_global_tape().AddOp("mean", {{"X", {var}}}, {{"Out", {out}}}, {});
+    return out;
+  }
+};
+
+class Linear {
+ public:
+  Linear(int in_dim, int out_dim, const std::string &act)
+      : w_(new Variable("LinearWeight")),
+        b_(new Variable("LinearBias")),
+        act_(act) {
+    Tape init_tape;
+
+    std::string initializer = "fill_constant";
+    framework::AttributeMap attrs;
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector{in_dim, out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
+
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector{out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
+
+    init_tape.Forward();
+  }
+
+  VariableHandle operator()(VariableHandle input) {
+    VariableHandle pre_bias(new Variable("linear"));
+    get_global_tape().AddOp("mul",
+                            {{"X", {input}}, {"Y", {w_}}},
+                            {{"Out", {pre_bias}}},
+                            {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
+    VariableHandle pre_act(new Variable("linear"));
+    get_global_tape().AddOp("elementwise_add",
+                            {{"X", {pre_bias}}, {"Y", {b_}}},
+                            {{"Out", {pre_act}}},
+                            {{"axis", 1}});
+    VariableHandle post_act(new Variable("linear"));
+    get_global_tape().AddOp(
+        act_, {{"X", {pre_act}}}, {{"Out", {post_act}}}, {});
+    return post_act;
+  }
+
+  std::vector<VariableHandle> Params() { return {w_, b_}; }
+
+ private:
+  VariableHandle w_;
+  VariableHandle b_;
+  std::string act_;
+};
+
+class SGD {
+ public:
+  SGD(float learning_rate) : learning_rate_(new Variable("sgd")) {
+    Tape init_tape;
+
+    std::string initializer = "fill_constant";
+    framework::AttributeMap attrs;
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector{1};
+    attrs["value"] = learning_rate;
+    init_tape.AddOp(initializer, {}, {{"Out", {learning_rate_}}}, attrs);
+
+    init_tape.Forward();
+  }
+
+  void operator()(VariableHandle input) {
+    PADDLE_ENFORCE(get_global_tape().HasBeenBackwarded(),
+                   "optimization must happen after the backward");
+    Tape temp_tape;
+    temp_tape.AddOp("sgd",
+                    {{"Param", {input}},
+                     {"LearningRate", {learning_rate_}},
+                     {"Grad", {input->Grad()}}},
+                    {{"ParamOut", {input}}},
+                    {});
+    temp_tape.Forward();
+  }
+
+ private:
+  VariableHandle learning_rate_;
+};
+}
+}
diff --git a/paddle/contrib/tape/tape.cc b/paddle/contrib/tape/tape.cc
new file mode 100644
index 0000000000..531499b6fe
--- /dev/null
+++ b/paddle/contrib/tape/tape.cc
@@ -0,0 +1,265 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/contrib/tape/tape.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/dim.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/pybind/pybind.h"
+
+namespace paddle {
+namespace tape {
+
+// borrowed from
+// https://stackoverflow.com/questions/874134/find-if-string-ends-with-another-string-in-c
+inline bool ends_with(std::string const &value, std::string const &ending) {
+  if (ending.size() > value.size()) return false;
+  return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
+}
+
+std::ostream &operator<<(std::ostream &os, const framework::VarDesc &var_desc) {
+  os << var_desc.Name();
+  os << "[" << var_desc.GetType() << "]";
+  os << "[" << var_desc.GetDataType() << "]";
+  os << "{";
+  for (auto &i : var_desc.GetShape()) {
+    os << i << ",";
+  }
+  os << "}";
+  return os;
+}
+
+std::string to_string(const std::string &type,
+                      const VariableHandleMap &in_vars,
+                      const VariableHandleMap &out_vars,
+                      const framework::AttributeMap &attrs) {
+  std::stringstream ss;
+  ss << type << " ";
+  for (auto &param_name : in_vars) {
+    for (auto &var : param_name.second) {
+      ss << param_name.first << ":(" << var->Desc() << ") ";
+    }
+  }
+  for (auto &param_name : out_vars) {
+    for (auto &var : param_name.second) {
+      ss << param_name.first << ":(" << var->Desc() << ") ";
+    }
+  }
+  return ss.str();
+}
+
+framework::OpDesc CreateOpDesc(const std::string &type,
+                               const VariableHandleMap &in_vars,
+                               const VariableHandleMap &out_vars,
+                               const framework::AttributeMap &attrs) {
+  framework::VariableNameMap inputs;
+  for (auto &param_name : in_vars) {
+    for (auto &var : param_name.second) {
+      inputs[param_name.first].emplace_back(var->Name());
+    }
+  }
+  framework::VariableNameMap outputs;
+  for (auto &param_name : out_vars) {
+    for (auto &var : param_name.second) {
+      outputs[param_name.first].emplace_back(var->Name());
+    }
+  }
+  return framework::OpDesc(type, inputs, outputs, attrs);
+}
+
+void InferShapeAndVarType(const std::string &type,
+                          const VariableHandleMap &in_vars,
+                          VariableHandleMap *out_vars,
+                          const framework::AttributeMap &attrs) {
+  framework::OpDesc op_desc = CreateOpDesc(type, in_vars, *out_vars, attrs);
+
+  // Create a temporary block for compile-time
+  framework::ProgramDesc program_desc;
+  framework::BlockDesc *block_desc = program_desc.MutableBlock(0);
+  PADDLE_ENFORCE(block_desc);
+
+  for (auto &param_name : in_vars) {
+    for (auto &var : param_name.second) {
+      *block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto();
+    }
+  }
+  for (auto &param_name : *out_vars) {
+    for (auto &var : param_name.second) {
+      *block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto();
+    }
+  }
+
+  LOG(INFO) << "- " << to_string(type, in_vars, *out_vars, attrs);
+  op_desc.InferShape(*block_desc);
+  op_desc.InferVarType(block_desc);
+  for (auto &param_name : *out_vars) {
+    for (auto &var : param_name.second) {
+      *var->MutableDesc()->Proto() = *block_desc->Var(var->Name())->Proto();
+    }
+  }
+  LOG(INFO) << "+ " << to_string(type, in_vars, *out_vars, attrs);
+}
+
+void Tape::AddOp(const std::string &type,
+                 const VariableHandleMap &in_vars,
+                 VariableHandleMap out_vars,
+                 const framework::AttributeMap &attrs) {
+  InferShapeAndVarType(type, in_vars, &out_vars, attrs);
+  tape_.emplace_back(type, in_vars, out_vars, attrs);
+}
+
+// Temporary Scope for Operator::Run()
+class ScopeWrapper : public framework::Scope {
+ public:
+  ScopeWrapper(const VariableHandleMap &in_vars,
+               const VariableHandleMap &out_vars) {
+    for (auto &v : in_vars) {
+      for (auto &vv : v.second) {
+        if (!vars_.count(vv->Name())) {
+          vars_[vv->Name()].reset(vv->Var());
+        }
+      }
+    }
+    for (auto &v : out_vars) {
+      for (auto &vv : v.second) {
+        if (!vars_.count(vv->Name())) {
+          vars_[vv->Name()].reset(vv->Var());
+        }
+      }
+    }
+  }
+
+  ~ScopeWrapper() {
+    for (auto &pair : vars_) {
+      pair.second.release();
+    }
+  }
+};
+
+void Tape::Forward() {
+  LOG(INFO) << "Starting forward -------------------------";
+  PADDLE_ENFORCE(!has_been_backwarded_);
+  while (current_position_ < tape_.size()) {
+    OpHandle &op = tape_[current_position_];
+
+    // Create Output Tensor, this is only necessary for OpWithKernel
+    for (auto &param2var : op.outputs_) {
+      for (auto &var : param2var.second) {
+        var->InitializeVariable();
+      }
+    }
+
+    framework::OpDesc op_desc =
+        CreateOpDesc(op.type_, op.inputs_, op.outputs_, op.attrs_);
+    ScopeWrapper scope(op.inputs_, op.outputs_);
+    framework::OpRegistry::CreateOp(op_desc)->Run(scope, platform::CPUPlace());
+    current_position_++;
+  }
+
+  LOG(INFO) << "Finishing forward -------------------------";
+}
+
+void Tape::Backward(VariableHandle target) {
+  PADDLE_ENFORCE(!has_been_backwarded_);
+
+  Forward();
+
+  // TODO(tonyyang-svail): check output of last op is target
+  backward_tape_.reset(new Tape());
+
+  framework::AttributeMap attrs;
+
+  // FIXME(tonyyang-svail): Need to infer_data_type
+  attrs["dtype"] = framework::proto::VarType::Type::VarType_Type_FP32;
+  attrs["shape"] = std::vector{1};
+  attrs["value"] = 1.0f;
+  backward_tape_->AddOp(
+      "fill_constant", {}, {{"Out", {target->Grad()}}}, attrs);
+
+  for (auto it = tape_.rbegin(); it != tape_.rend(); ++it) {
+    framework::OpDesc op_desc =
+        CreateOpDesc(it->type_, it->inputs_, it->outputs_, it->attrs_);
+    std::unordered_map<std::string, std::string> grad_to_var;
+    std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
+        framework::OpInfoMap::Instance()
+            .Get(op_desc.Type())
+            .GradOpMaker()(op_desc, {}, &grad_to_var, {});
+
+    for (auto &op_desc : grad_op_descs) {
+      std::unordered_map<std::string, VariableHandle> name2var;
+      for (auto &param2vars : it->inputs_) {
+        for (auto &a : param2vars.second) {
+          name2var[a->Name()] = a;
+        }
+      }
+      for (auto &param2vars : it->outputs_) {
+        for (auto &a : param2vars.second) {
+          name2var[a->Name()] = a;
+        }
+      }
+
+      VariableHandleMap in_vars;
+      VariableHandleMap out_vars;
+      std::map<const framework::VariableNameMap *, VariableHandleMap *>
+          loop_over{{&op_desc->Inputs(), &in_vars},
+                    {&op_desc->Outputs(), &out_vars}};
+      for (auto &each : loop_over) {
+        auto &vmp = *each.first;
+        auto &vhm = *each.second;
+        for (auto &p2a : vmp) {
+          for (auto &argu : p2a.second) {
+            if (name2var.count(argu)) {
+              vhm[p2a.first].push_back(name2var[argu]);
+            } else {
+              PADDLE_ENFORCE(ends_with(argu, framework::kGradVarSuffix),
+                             argu.c_str());
+              std::string name = argu.substr(
+                  0, argu.size() - std::strlen(framework::kGradVarSuffix));
+              PADDLE_ENFORCE(name2var.count(name), name.c_str());
+              vhm[p2a.first].push_back(name2var[name]->Grad());
+            }
+          }
+        }
+      }
+
+      backward_tape_->AddOp(
+          op_desc->Type(),
+          in_vars, out_vars, op_desc->GetAttrMap());
+    }
+
+    // TODO(tonyyang-svail): how to fill empty grad?
+    // TODO(tonyyang-svail): Sum var grad is necessary
+  }
+
+  backward_tape_->Forward();
+  has_been_backwarded_ = true;
+}
+
+Tape &get_global_tape() {
+  static Tape T;
+  return T;
+}
+
+void reset_global_tape() { get_global_tape() = Tape(); }
+}
+}
diff --git a/paddle/contrib/tape/tape.h b/paddle/contrib/tape/tape.h
new file mode 100644
index 0000000000..ed79de17a7
--- /dev/null
+++ b/paddle/contrib/tape/tape.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include "paddle/contrib/tape/variable.h"
+
+namespace paddle {
+namespace tape {
+
+using VariableHandleMap = std::map<std::string, std::vector<VariableHandle>>;
+
+struct OpHandle {
+  OpHandle(const std::string &type,
+           const VariableHandleMap &in_vars,
+           const VariableHandleMap &out_vars,
+           const framework::AttributeMap &attrs)
+      : type_(type), inputs_(in_vars), outputs_(out_vars), attrs_(attrs) {}
+
+  std::string type_;
+  VariableHandleMap inputs_;
+  VariableHandleMap outputs_;
+  framework::AttributeMap attrs_;
+};
+
+class Tape {
+ public:
+  void AddOp(const std::string &type,
+             const VariableHandleMap &in_vars,
+             VariableHandleMap out_vars,
+             const framework::AttributeMap &attrs);
+  void Forward();
+  void Backward(VariableHandle target);
+
+  bool HasBeenBackwarded() { return has_been_backwarded_; }
+
+ private:
+  bool has_been_backwarded_ = false;
+  size_t current_position_ = 0;
+
+  std::vector<OpHandle> tape_;
+  std::shared_ptr<Tape> backward_tape_;
+};
+
+Tape &get_global_tape();
+
+void reset_global_tape();
+}
+}
diff --git a/paddle/contrib/tape/test_tape.cc b/paddle/contrib/tape/test_tape.cc
new file mode 100644
index 0000000000..e9bfd21a71
--- /dev/null
+++ b/paddle/contrib/tape/test_tape.cc
@@ -0,0 +1,61 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+#include "paddle/contrib/tape/function.h"
+
+using namespace paddle::tape;
+
+TEST(Tape, TestMLP) {
+  LOG(INFO) << "TestMLP";
+  Linear linear1(3, 3, "relu");
+  Linear linear2(3, 3, "relu");
+  Mean mean;
+
+  SGD sgd(0.001);
+
+  std::string initializer = "fill_constant";
+  paddle::framework::AttributeMap attrs;
+  attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+  attrs["shape"] = std::vector{3, 3};
+  attrs["value"] = 1.0f;
+  Fill filler(initializer, attrs);
+
+  for (int i = 0; i < 2; ++i) {
+    reset_global_tape();
+
+    VariableHandle input(new Variable("input"));
+    filler(input);
+
+    auto loss = mean(linear2(linear1(input)));
+
+    get_global_tape().Backward(loss);
+
+    for (auto w : linear1.Params()) {
+      sgd(w);
+    }
+    for (auto w : linear2.Params()) {
+      sgd(w);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  std::vector<paddle::platform::Place> places;
+  places.emplace_back(paddle::platform::CPUPlace());
+  paddle::platform::DeviceContextPool::Init(places);
+
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/contrib/tape/variable.cc b/paddle/contrib/tape/variable.cc
new file mode 100644
index 0000000000..5ec1612909
--- /dev/null
+++ b/paddle/contrib/tape/variable.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/contrib/tape/variable.h"
+
+namespace paddle {
+namespace tape {
+
+void Variable::InitializeVariable() {
+  LOG(INFO) << "Initializing " << desc_.Name() << " as " << desc_.GetType();
+  framework::proto::VarType::Type var_type = desc_.GetType();
+  if (var_type == framework::proto::VarType::LOD_TENSOR) {
+    var_.GetMutable<framework::LoDTensor>();
+  } else if (var_type == framework::proto::VarType::SELECTED_ROWS) {
+    var_.GetMutable<framework::SelectedRows>();
+  } else {
+    PADDLE_THROW("Variable type %d is not in [LOD_TENSOR, SELECTED_ROWS]",
+                 var_type);
+  }
+}
+}
+}
diff --git a/paddle/contrib/tape/variable.h b/paddle/contrib/tape/variable.h
new file mode 100644
index 0000000000..35c328e69c
--- /dev/null
+++ b/paddle/contrib/tape/variable.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
diff --git a/paddle/contrib/tape/variable.h b/paddle/contrib/tape/variable.h
new file mode 100644
index 0000000000..35c328e69c
--- /dev/null
+++ b/paddle/contrib/tape/variable.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <memory>
+
+#include "paddle/fluid/framework/operator.h" // framework::kGradVarSuffix
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/variable.h"
+
+namespace paddle {
+namespace tape {
+
+class Variable;
+using VariableHandle = std::shared_ptr<Variable>;
+
+/*
+ * Combination of
+ *     framework::VarDesc desc_;
+ *     framework::Variable var_;
+ */
+class Variable {
+ public:
+  Variable(const std::string pre_fix)
+      : desc_(pre_fix + std::to_string(count())) {}
+
+  Variable(const std::string pre_fix, bool is_grad)
+      : desc_(pre_fix + (is_grad ? framework::kGradVarSuffix
+                                 : std::to_string(count()))) {}
+
+  ~Variable() { LOG(INFO) << "Deleting " << Name(); }
+
+  // Instantiate LoDTensor/SelectedRows
+  void InitializeVariable();
+
+  VariableHandle Grad() {
+    if (grad_.expired()) {
+      VariableHandle new_grad(new Variable(desc_.Name(), true));
+      grad_ = new_grad;
+      return new_grad;
+    } else {
+      return VariableHandle(grad_);
+    }
+  }
+
+  // Stochastic Gradient Descent with Momentum
+  //  VariableHandle Momentum ();
+
+  //  void init(const std::string& initializer,
+  //            const framework::AttributeMap& attrs);
+
+  //  void value() {};
+
+  const framework::VarDesc& Desc() const { return desc_; }
+  framework::VarDesc* MutableDesc() { return &desc_; }
+
+  // TODO(tonyyang-svail): No need to expose name
+  std::string Name() const { return desc_.Name(); }
+
+  framework::Variable* Var() { return &var_; }
+
+ private:
+  int count() {
+    static int counter = 0;
+    return counter++;
+  }
+
+  framework::VarDesc desc_;
+  framework::Variable var_;
+
+  std::weak_ptr<Variable> grad_;
+};
+}
+}
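Note the caching contract of Grad() above: the variable keeps only a weak_ptr, so a gradient lives exactly as long as the tape ops that hold it. A small sketch of the resulting behavior:

VariableHandle w(new Variable("w"));
{
  VariableHandle g1 = w->Grad();  // creates the "<name>@GRAD" variable
  VariableHandle g2 = w->Grad();  // same object while g1 is still alive
  // g1.get() == g2.get()
}
// Both handles dropped, so the weak_ptr has expired; the next call
// to w->Grad() allocates a fresh gradient variable.
VariableHandle g3 = w->Grad();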
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 4271e4c1bb..6286dda4a5 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -83,8 +83,13 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)

 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)

-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto glog lod_rank_table feed_fetch_method)
+if(WITH_DISTRIBUTE)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+else()
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
+endif()

 cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h
index 542c4a1728..304b221e7e 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -19,7 +19,7 @@ namespace paddle {
 namespace framework {
 namespace details {
-class SSAGraph;
+struct SSAGraph;

 class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
  public:
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index d4d6c34108..e15232a77b 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -20,6 +20,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#endif
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -44,6 +47,14 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {

 Executor::Executor(const platform::Place& place) : place_(place) {}

+#ifdef PADDLE_WITH_DISTRIBUTE
+void Executor::Complete() {
+  ::paddle::operators::detail::RPCClient::GetInstance<
+      ::paddle::operators::detail::GRPCClient>()
+      ->SendComplete();
+}
+#endif
+
 void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
   if (var_type == proto::VarType::LOD_TENSOR) {
     var->GetMutable<LoDTensor>();
@@ -319,8 +330,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }

   for (auto& op : ctx->ops_) {
-    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
+    VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
     op->Run(*local_scope, place_);
+    // NOTE! Please do not delete this line: it is useful because the debug
+    // string differs before and after op->Run(); after the run, the outputs
+    // have their actual shapes, which is useful for debugging.
+    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);

     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index e6f9c3d31c..67a0761dac 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -44,6 +44,13 @@ class Executor {

   explicit Executor(const platform::Place& place);

+#ifdef PADDLE_WITH_DISTRIBUTE
+  /*
+   * Sends a signal to the pserver to mark the current trainer as stopped.
+   */
+  void Complete();
+#endif
+
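A trainer-side sketch of how Complete() is meant to be used (the training loop around it is illustrative; the message flow is the one wired up in grpc_client.cc and rpc_server.cc below):

paddle::framework::Executor exe(paddle::platform::CPUPlace());
// ... run all training passes ...
#ifdef PADDLE_WITH_DISTRIBUTE
// Sends COMPLETE@RECV on every pserver channel; each pserver then calls
// DecreaseClientNum() so the remaining trainers' barriers can still close.
exe.Complete();
#endif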
   /* @Brief
    * Runtime evaluation of the given ProgramDesc under certain Scope
    *
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index c633a2f847..122ee1dab3 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -69,6 +69,19 @@ static DDim GetDims(const Scope& scope, const std::string& name,
   }
 }

+static int GetRowSize(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  if (var == nullptr) {
+    return -1;
+  }
+
+  if (var->IsType<SelectedRows>()) {
+    return var->Get<SelectedRows>().rows().size();
+  }
+
+  return -1;
+}
+
 static LoD GetLoD(const Scope& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   auto default_lod = LoD({{}});
@@ -85,6 +98,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }

 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
+  VLOG(10) << "- " << DebugStringEx(&scope);
   if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
     PADDLE_THROW("Cannot run operator on place %s", place);
@@ -94,6 +108,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #endif
   }
   RunImpl(scope, place);
+  VLOG(10) << "+ " << DebugStringEx(&scope);
 }

 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -153,6 +168,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < input.second.size(); ++i) {
       ss << input.second[i];
       if (scope) {
+        int row_size = GetRowSize(*scope, input.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
         ss << "[" << GetDims(*scope, input.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
@@ -173,6 +192,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < output.second.size(); ++i) {
       ss << output.second[i];
       if (scope) {
+        int row_size = GetRowSize(*scope, output.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
         ss << "[" << GetDims(*scope, output.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, output.second[i]) << ")";
       }
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index ac4d1f58a5..9406c6155d 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
     auto &dims = main_tensor.dims();
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
 #ifdef PADDLE_WITH_CUDA
+      std::vector<void *> buffers;
       size_t numel = main_tensor.numel();
       ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
-      platform::NCCLGroupGuard guard;
       for (size_t i = 0; i < member_->places_.size(); ++i) {
         auto place = member_->places_[i];
         void *buffer;
@@ -159,11 +159,21 @@ void ParallelExecutor::BCastParamsToGPUs(
           t->Resize(dims);
           buffer = t->mutable_data(place, main_tensor.type());
         }
-        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
-        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
-                                     nccl_ctx.comm_, nccl_ctx.stream());
+        buffers.push_back(buffer);
       }
-      member_->nccl_ctxs_->WaitAll();
+
+      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
+                        "variables' buffer size to bcast NOT equal to places");
+      {
+        platform::NCCLGroupGuard guard;
+        for (size_t i = 0; i < member_->places_.size(); ++i) {
+          auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
+          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
+                                       nccl_ctx.comm_, nccl_ctx.stream());
+        }
+        member_->nccl_ctxs_->WaitAll();
+      }
+
 #else
       PADDLE_THROW("Not compiled with CUDA");
 #endif
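The reordering above is the point of the change: every destination buffer must exist before any ncclBcast is issued, and all broadcasts must sit inside a single NCCLGroupGuard. A condensed sketch of the pattern (AllocOn and the local names are illustrative):

std::vector<void *> buffers;
for (auto &place : places) buffers.push_back(AllocOn(place));  // allocate first
{
  platform::NCCLGroupGuard guard;  // RAII around ncclGroupStart()/ncclGroupEnd()
  for (size_t i = 0; i < places.size(); ++i) {
    auto &nccl_ctx = nccl_ctxs->at(places[i]);
    platform::dynload::ncclBcast(buffers[i], numel, data_type, 0 /*root*/,
                                 nccl_ctx.comm_, nccl_ctx.stream());
  }
}
nccl_ctxs->WaitAll();  // only wait after the whole group has been issued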
diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h
index 3a413941df..64d4ceab62 100644
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -35,14 +35,15 @@ class ReaderBase {

 class DecoratedReader : public ReaderBase {
  public:
-  explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
+  explicit DecoratedReader(const std::shared_ptr<ReaderBase>& reader)
+      : ReaderBase(), reader_(reader) {
     PADDLE_ENFORCE_NOT_NULL(reader_);
   }

   void ReInit() override { reader_->ReInit(); }

  protected:
-  ReaderBase* reader_;
+  std::shared_ptr<ReaderBase> reader_;
 };

 class FileReader : public ReaderBase {
@@ -64,7 +65,7 @@ class ReaderHolder {
  public:
   void Reset(ReaderBase* reader) { reader_.reset(reader); }

-  ReaderBase* Get() const { return reader_.get(); }
+  std::shared_ptr<ReaderBase> Get() const { return reader_; }

   void ReadNext(std::vector<LoDTensor>* out) {
     PADDLE_ENFORCE_NOT_NULL(reader_);
@@ -76,7 +77,7 @@ class ReaderHolder {
   }

  private:
-  std::unique_ptr<ReaderBase> reader_;
+  std::shared_ptr<ReaderBase> reader_;
 };

 }  // namespace framework
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index 98d103d867..95b4f7c5f6 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -81,6 +81,9 @@ class Scope {
   // Rename variable to a new name and return the new name
   std::string Rename(const std::string& origin_name) const;

+ protected:
+  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+
  private:
   // Call Scope::NewScope for a sub-scope.
   explicit Scope(Scope const* parent) : parent_(parent) {}
@@ -93,8 +96,6 @@ class Scope {
   // Caller doesn't own the returned Variable.
   Variable* FindVarLocally(const std::string& name) const;

-  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
-
   // Scope in `kids_` are owned by this class.
   mutable std::list<Scope*> kids_;
   Scope const* parent_{nullptr};
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 65db7c7b50..6b03ac7119 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -20,16 +20,20 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/pybind/pybind.h"

 DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
 DEFINE_bool(init_p2p, false, "Whether to init p2p.");
+DEFINE_int32(math_num_threads, 1,
+             "Number of threads used to run math functions.");

 namespace paddle {
 namespace inference {

 void Init(const std::vector<std::string> argv) {
   framework::InitGflags(argv);
+  operators::math::SetNumThreads(FLAGS_math_num_threads);
   // init devices
   std::vector<int> devices;
   std::string token;
diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
index 0e4a56d4a4..8206cc9890 100644
--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -19,10 +19,17 @@ limitations under the License.
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using batch_norm_bwd = mkldnn::batch_normalization_backward; +using batch_norm_fwd = mkldnn::batch_normalization_forward; +using framework::DataLayout; +using framework::Tensor; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; +using mkldnn::stream; using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNMemDesc; -using mkldnn::memory; +using platform::to_void_cast; template using EigenArrayMap = @@ -64,21 +71,12 @@ void run_batch_norm_op(Args &&... args) { mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } -template -inline void *cast_const_to_void(const T *t) { - return static_cast(const_cast(t)); -} } // namespace template class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto data_layout_str = ctx.Attr("data_layout"); - auto data_layout = framework::StringToDataLayout(data_layout_str); - PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW, - "MKLDNN batch normalization handles only NCHW data layout"); - const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); @@ -99,41 +97,53 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { const auto *scale = ctx.Input("Scale"); const auto *shift = ctx.Input("Bias"); - y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && + x->format() != memory::format::format_undef, + "Wrong layout/format set for Input x tensor"); + + const T *x_data = x->data(); + const T *mean_data = mean->data(); + const T *variance_data = variance->data(); + T *y_data = y->mutable_data(ctx.GetPlace()); + T *mean_out_data = mean_out->mutable_data(ctx.GetPlace()); + T *variance_out_data = variance_out->mutable_data(ctx.GetPlace()); + T *batch_mean_data = nullptr; + T *batch_variance_data = nullptr; if (!is_test) { - batch_mean->mutable_data(ctx.GetPlace()); - batch_variance->mutable_data(ctx.GetPlace()); + batch_mean_data = batch_mean->mutable_data(ctx.GetPlace()); + batch_variance_data = batch_variance->mutable_data(ctx.GetPlace()); } auto propagation = is_test == true ? 
mkldnn::prop_kind::forward_scoring : mkldnn::prop_kind::forward_training; - auto dims = paddle::framework::vectorize2int(x->dims()); - - auto src_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto dst_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - - auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; - auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine}; - - auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data())}; - auto dst = mkldnn::memory{dst_pd, y->data()}; + auto src_tz = paddle::framework::vectorize2int(x->dims()); + auto scale_tz = paddle::framework::vectorize2int(scale->dims()); + PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); + const unsigned int ic = scale_tz[0]; unsigned flags = mkldnn::use_scale_shift; if (is_test) flags |= mkldnn::use_global_stats; + // create mkldnn memory from input x tensor + auto src_memory = + memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, + to_void_cast(x_data)); + + // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; - auto batch_norm_fwd_desc = - bn_fwd_types::op_desc{propagation, src_md, epsilon, flags}; - auto batch_norm_fwd_pd = - bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine}; + auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ + propagation, src_memory.get_primitive_desc().desc(), epsilon, flags}; + std::shared_ptr batch_norm_fwd_pd = + std::shared_ptr( + new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc, + mkldnn_engine)); - const unsigned int ic = dims[1]; + // Save the pd to be used in backward pass + const std::string key = ctx.op().Output("SavedMean"); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd); // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; @@ -143,73 +153,58 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { copy_to_weights(scale->data(), scale->data() + ic, shift->data(), shift->data() + ic, &scaleshift_data); - auto scaleshift_memory = mkldnn::memory{ - batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()}; + // create mkldnn memory for weights (scale/shift) + auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(), + scaleshift_data.data()); - if (is_test) { - auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), - cast_const_to_void(mean->data())}; + // create mkldnn memory for output y tensor + auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data); + if (is_test) { + // create mkldnn memory for stats (as input) + auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(), + to_void_cast(mean_data)); auto variance_memory = - mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(), - cast_const_to_void(variance->data())}; + memory(batch_norm_fwd_pd->variance_primitive_desc(), + to_void_cast(variance_data)); run_batch_norm_op( - batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory, + *batch_norm_fwd_pd, src_memory, + (const mkldnn::primitive::at &)mean_memory, (const mkldnn::primitive::at &)variance_memory, scaleshift_memory, - dst); + dst_memory); } else { + // create mkldnn memory for stats (as output) auto mean_memory = - mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), - cast_const_to_void(batch_mean->data())}; - - auto variance_memory = -
mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(), - cast_const_to_void(batch_variance->data())}; + memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data); + auto variance_memory = memory( + batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data); - run_batch_norm_op(batch_norm_fwd_pd, src, - scaleshift_memory, dst, + run_batch_norm_op(*batch_norm_fwd_pd, src_memory, + scaleshift_memory, dst_memory, mean_memory, variance_memory); } if (!is_test) { - const unsigned int in = dims[0]; - const unsigned int sample_size = x->numel() / in / ic; - - // saved_xx is use just in this batch of data - EigenVectorArrayMap saved_mean_e( - batch_mean->mutable_data(ctx.GetPlace()), ic); - EigenVectorArrayMap saved_variance_e( - batch_variance->mutable_data(ctx.GetPlace()), ic); - saved_mean_e.setZero(); - saved_variance_e.setZero(); - - const unsigned int x_arr_size = in * ic; - ConstEigenArrayMap x_arr(x->data(), sample_size, x_arr_size); - for (unsigned int nc = 0; nc < x_arr_size; ++nc) { - saved_mean_e(nc % ic) += x_arr.col(nc).sum(); - } - saved_mean_e /= in * sample_size; - for (unsigned int nc = 0; nc < x_arr_size; ++nc) { - saved_variance_e(nc % ic) += - (x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm(); - } - saved_variance_e /= in * sample_size; - - ConstEigenVectorArrayMap mean_arr{mean->data(), ic}; - ConstEigenVectorArrayMap variance_arr{variance->data(), ic}; - - EigenVectorArrayMap running_mean_arr( - mean_out->mutable_data(ctx.GetPlace()), ic); - EigenVectorArrayMap running_var_arr( - variance_out->mutable_data(ctx.GetPlace()), ic); + // mkldnn only computes stats for the current batch, + // so we need to compute the momentum (running) stats via the Eigen lib + EigenVectorArrayMap batch_mean_e(batch_mean_data, ic); + EigenVectorArrayMap batch_variance_e(batch_variance_data, ic); + ConstEigenVectorArrayMap mean_e(mean_data, ic); + ConstEigenVectorArrayMap variance_e{variance_data, ic}; + + EigenVectorArrayMap running_mean_e(mean_out_data, ic); + EigenVectorArrayMap running_variance_e(variance_out_data, ic); auto one_minus_momentum = 1.
- momentum; - running_mean_arr = - mean_arr * momentum + saved_mean_e * one_minus_momentum; - running_var_arr = - variance_arr * momentum + saved_variance_e * one_minus_momentum; + running_mean_e = mean_e * momentum + batch_mean_e * one_minus_momentum; + running_variance_e = + variance_e * momentum + batch_variance_e * one_minus_momentum; } + + y->set_layout(DataLayout::kMKLDNN); + y->set_format( + (memory::format)dst_memory.get_primitive_desc().desc().data.format); } }; @@ -217,11 +212,6 @@ template class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto data_layout_str = ctx.Attr("data_layout"); - auto data_layout = framework::StringToDataLayout(data_layout_str); - PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW, - "MKLDNN batch normalization handles only NCHW data layout"); - auto &dev_ctx = ctx.template device_context(); auto mkldnn_engine = dev_ctx.GetEngine(); @@ -238,88 +228,132 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto *diff_scale = ctx.Output(framework::GradVarName("Scale")); auto *diff_shift = ctx.Output(framework::GradVarName("Bias")); - diff_x->mutable_data(ctx.GetPlace()); - diff_scale->mutable_data(ctx.GetPlace()); - diff_shift->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN && + diff_y->format() != memory::format::format_undef, + "Wrong layout/format set for Input diff_y tensor"); + + const T *x_data = x->data(); + const T *diff_y_data = diff_y->data(); + const T *batch_mean_data = batch_mean->data(); + const T *batch_variance_data = batch_variance->data(); + const T *scale_data = scale->data(); + const T *shift_data = shift->data(); + T *diff_x_data = diff_x->mutable_data(ctx.GetPlace()); + T *diff_scale_data = diff_scale->mutable_data(ctx.GetPlace()); + T *diff_shift_data = diff_shift->mutable_data(ctx.GetPlace()); + + auto src_tz = paddle::framework::vectorize2int(x->dims()); + auto diff_src_tz = src_tz; + auto dst_tz = src_tz; + auto diff_dst_tz = dst_tz; + auto scale_tz = paddle::framework::vectorize2int(scale->dims()); + PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); + + const unsigned int ic = scale_tz[0]; + + // Retrieve bn_fwd_pd from device context + const std::string key = ctx.op().Input("SavedMean"); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + auto batch_norm_fwd_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_batch_norm_fwd_pd)); + PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr, + "Fail to find batch_norm_fwd_pd in device context"); - auto dims = paddle::framework::vectorize2int(x->dims()); - unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats; + using bn_bwd_types = bn_type_traits; - auto src_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto dst_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto diff_src_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto diff_dst_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + // create mkldnn memory from input diff_y tensor + auto user_diff_dst_memory = + memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()}, + mkldnn_engine}, + to_void_cast(diff_y_data)); - using bn_bwd_types = bn_type_traits; - using bn_fwd_types = bn_type_traits; + // create mkldnn memory from input x tensor + auto src_memory = + memory({{{src_tz}, 
memory::data_type::f32, x->format()}, mkldnn_engine}, + to_void_cast(x_data)); - auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ - mkldnn::prop_kind::forward_training, src_md, epsilon, flags}; - auto batch_norm_fwd_pd = - bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine}; + // for diff_dst, try to use same format as dst in forward pass + auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); + auto diff_dst_md = diff_dst_pd.desc(); + // create primitive descriptor for batch norm backward + unsigned flags = mkldnn::use_scale_shift; auto batch_norm_bwd_desc = bn_bwd_types::op_desc{ - mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags}; + mkldnn::prop_kind::backward, diff_dst_md, + src_memory.get_primitive_desc().desc(), epsilon, flags}; auto batch_norm_bwd_pd = bn_bwd_types::op_prim{ - batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd}; - - auto src = mkldnn::memory{{src_md, mkldnn_engine}, - cast_const_to_void(x->data())}; - - auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(), - cast_const_to_void(batch_mean->data())}; - - auto variance = - mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(), - cast_const_to_void(batch_variance->data())}; - - auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine}, - cast_const_to_void(diff_y->data())}; + batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd}; + + // reorder user_diff_dst if it's not in preferred format + auto diff_dst_memory = user_diff_dst_memory; + primitive reorder_diff_dst; + bool is_diff_dst_reordered = false; + if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = memory(diff_dst_pd); + reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory); + is_diff_dst_reordered = true; + } - const unsigned int ic = dims[1]; + // create mkldnn memory for input tensors (src/mean/variance) + auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(), + to_void_cast(batch_mean_data)); + auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(), + to_void_cast(batch_variance_data)); + // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; std::vector scaleshift_data; scaleshift_data.reserve(scaleshift_size); - copy_to_weights(scale->data(), scale->data() + ic, shift->data(), - shift->data() + ic, &scaleshift_data); + copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic, + &scaleshift_data); - auto scaleshift_memory = mkldnn::memory{ - batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()}; + // create mkldnn memory for input tensors (scale/shift) + auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(), + scaleshift_data.data()); + // create mkldnn memory for output diff weights (combined scale/shift) std::vector diff_scaleshift_data; diff_scaleshift_data.reserve(scaleshift_size); - copy_to_weights(diff_scale->data(), diff_scale->data() + ic, - diff_shift->data(), diff_shift->data() + ic, - &diff_scaleshift_data); - auto diff_scaleshift_memory = - mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(), - diff_scaleshift_data.data()}; - - auto diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine}, - static_cast(diff_x->data())}; - - run_batch_norm_op( - batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory, - diff_src, diff_scaleshift_memory); - + memory(batch_norm_bwd_pd.diff_weights_primitive_desc(), + diff_scaleshift_data.data()); + + // here assume diff_src is in the same format of src + 
auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data); + + // finally create batch_norm backward primitive + auto batch_norm_bwd_prim = + batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory, + variance_memory, diff_dst_memory, scaleshift_memory, + diff_src_memory, diff_scaleshift_memory); + + // execute optional reorder and batch_norm backward primitive + std::vector pipeline; + if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst); + pipeline.push_back(batch_norm_bwd_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + // copy back diff scale/shift to output tensors (diff scale/shift) + diff_scaleshift_data.resize(scaleshift_size); auto it = std::begin(diff_scaleshift_data); - std::copy(it, std::next(it, ic), diff_scale->data()); + std::copy(it, std::next(it, ic), diff_scale_data); std::copy(std::next(it, ic), std::end(diff_scaleshift_data), - diff_shift->data()); + diff_shift_data); + + // set layout/format of output tensors + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc() + .desc() + .data.format); } }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace, +REGISTER_OP_KERNEL(batch_norm, MKLDNN, ::paddle::platform::CPUPlace, ops::BatchNormMKLDNNOpKernel); -REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace, +REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::BatchNormMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 92fbb9adaf..625ca2d7c4 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -110,19 +110,19 @@ class BatchNormOp : public framework::OperatorWithKernel { ctx.Input("Variance")->type()), "Variance input should be of float type"); - framework::LibraryType library_{framework::LibraryType::kPlain}; // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; framework::DataLayout layout = framework::DataLayout::kAnyLayout; - #ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && + if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; + library = framework::LibraryType::kMKLDNN; layout = framework::DataLayout::kMKLDNN; } #endif + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, - library_); + library); } }; @@ -370,19 +370,21 @@ class BatchNormGradOp : public framework::OperatorWithKernel { PADDLE_THROW("can't find Y@GRAD"); } - framework::LibraryType library_{framework::LibraryType::kPlain}; // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + #ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && + if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; - layout_ = framework::DataLayout::kMKLDNN; + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif + return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); +
layout, library); } }; diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 63d371310d..6b06913d1c 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -18,6 +18,17 @@ namespace paddle { namespace operators { +using conv_bwd_data = mkldnn::convolution_backward_data; +using conv_bwd_weights = mkldnn::convolution_backward_weights; +using conv_fwd = mkldnn::convolution_forward; +using framework::DataLayout; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; +using mkldnn::stream; +using platform::to_void_cast; +using platform::GetMKLDNNFormat; + template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -25,6 +36,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + // Get unique name for index + const std::string key = ctx.op().Output("Output"); + const std::string key_conv_pd = key + "@conv_pd"; + auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -33,10 +48,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); - // Get an unique name from "argument" name of "Output" variable - // This name will be used as key when saving info into device context - const std::string key = ctx.op().Output("Output"); - const std::string key_conv_pd = key + "@conv_pd"; + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -63,60 +80,86 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(filter->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // TODO(pzelazko-intel): support more formats - auto src_md = platform::MKLDNNMemDesc( - src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto weights_md = - platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw); - auto dst_md = platform::MKLDNNMemDesc( - dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, - reinterpret_cast(const_cast(input_data))); - auto weights_memory = - mkldnn::memory({weights_md, mkldnn_engine}, - reinterpret_cast(const_cast(filter_data))); - auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data); - - std::shared_ptr conv_pd = - ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, - mkldnn_engine); - - // save conv_pd into global device context to be referred in backward path - dev_ctx.SetBlob(key_conv_pd, conv_pd); + // create mkldnn memory from input tensors (data/weights) + auto user_src_memory = memory( + {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, + to_void_cast(input_data)); + auto user_weights_memory = + memory({{{weights_tz}, memory::data_type::f32, filter->format()}, + mkldnn_engine}, + to_void_cast(filter_data)); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive 
(convolution in this case) choose + * the memory format preferred for best performance + */ + auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, + memory::format::any); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::f32, memory::format::any); + auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32, + memory::format::any); + + // create a conv primitive descriptor and save it for usage in backward + std::shared_ptr conv_pd = ConvFwdPrimitiveDesc( + src_md, weights_md, dst_md, strides, paddings, mkldnn_engine); + + // create reorder primitive if the input format is not the preferred one + auto src_memory = user_src_memory; + primitive reorder_src; + bool is_src_reordered = false; + if (memory::primitive_desc(conv_pd->src_primitive_desc()) != + user_src_memory.get_primitive_desc()) { + src_memory = memory(conv_pd->src_primitive_desc()); + reorder_src = reorder(user_src_memory, src_memory); + is_src_reordered = true; + } + auto weights_memory = user_weights_memory; + primitive reorder_weights; + bool is_weights_reordered = false; + if (memory::primitive_desc(conv_pd->weights_primitive_desc()) != + user_weights_memory.get_primitive_desc()) { + weights_memory = memory(conv_pd->weights_primitive_desc()); + reorder_weights = reorder(user_weights_memory, weights_memory); + is_weights_reordered = true; + } + + // create memory primitive for conv dst + auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data); // create convolution op primitive - auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory, - weights_memory, dst_memory); + auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory); // push primitive to stream and wait until it's executed - std::vector pipeline{conv_prim}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline; + if (is_src_reordered) pipeline.push_back(reorder_src); + if (is_weights_reordered) pipeline.push_back(reorder_weights); + pipeline.push_back(conv_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(dst_memory)); } private: - std::unique_ptr - ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src, - const mkldnn::memory::desc& weights, - const mkldnn::memory::desc& dst, - const std::vector& strides, - const std::vector& paddings, - const mkldnn::engine& engine) const { - mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; - mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; - - auto conv_desc = mkldnn::convolution_forward::desc( - mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, - dst, stride_dims, padding_dims, padding_dims, - mkldnn::padding_kind::zero); - - auto p_conv_pd = - new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); - - return std::unique_ptr( - p_conv_pd); + std::unique_ptr ConvFwdPrimitiveDesc( + const memory::desc& src, const memory::desc& weights, + const memory::desc& dst, const std::vector& strides, + const std::vector& paddings, const mkldnn::engine& engine) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto conv_desc = + conv_fwd::desc(mkldnn::prop_kind::forward, mkldnn::convolution_direct, + src, weights, dst, stride_dims, padding_dims, + padding_dims, 
mkldnn::padding_kind::zero); + + auto p_conv_pd = new conv_fwd::primitive_desc(conv_desc, engine); + + return std::unique_ptr(p_conv_pd); } }; @@ -139,6 +182,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(output->layout() == DataLayout::kMKLDNN && + output->format() != memory::format::format_undef, + "Wrong layout/format set for Output tensor"); + PADDLE_ENFORCE(output_grad->layout() == DataLayout::kMKLDNN && + output_grad->format() != memory::format::format_undef, + "Wrong layout/format set for output_grad tensor"); + if (!input_grad && !filter_grad) return; // Get an unique name from "argument" name of "Output" variable @@ -167,108 +223,147 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(filter->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // TODO(pzelazko-intel): support more formats - auto src_md = platform::MKLDNNMemDesc( - src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto diff_src_md = platform::MKLDNNMemDesc( - src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto weights_md = - platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw); - auto diff_weights_md = - platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw); - auto diff_dst_md = platform::MKLDNNMemDesc( - dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - // create memory - auto diff_dst_memory = mkldnn::memory( - {diff_weights_md, mkldnn_engine}, - reinterpret_cast(const_cast(output_grad_data))); + // create mkldnn memory from input tensors (input/weights/output_grad) + auto user_src_memory = memory( + {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, + to_void_cast(input_data)); + auto user_weights_memory = + memory({{{weights_tz}, memory::data_type::f32, filter->format()}, + mkldnn_engine}, + to_void_cast(filter_data)); + auto user_diff_dst_memory = + memory({{{dst_tz}, memory::data_type::f32, output_grad->format()}, + mkldnn_engine}, + to_void_cast(output_grad_data)); + + /* create memory descriptor for conv backward without specified format + * ('any') which lets a primitive (conv backward in this case) choose + * the memory format preferred for best performance + */ + auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, + memory::format::any); + auto diff_src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, + memory::format::any); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::f32, memory::format::any); + auto diff_weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::f32, memory::format::any); + auto diff_dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32, + memory::format::any); + // Retrieve conv_pd from device context - auto conv_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); + auto conv_pd = std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); 
PADDLE_ENFORCE(conv_pd != nullptr, "Fail to find conv_pd in device context"); // create backward conv primitive for weights if (filter_grad) { - // create primitive descriptor - mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd = - ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md, - strides, paddings, *conv_pd, - mkldnn_engine); - - // create memory + // create backward convolution primitive descriptor + auto conv_bwd_weights_desc = conv_bwd_weights::desc( + mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md, + strides, paddings, paddings, mkldnn::padding_kind::zero); + auto conv_bwd_weights_pd = conv_bwd_weights::primitive_desc( + conv_bwd_weights_desc, mkldnn_engine, *conv_pd); + + // create reorder primitive if the input format is not the preferred one + auto src_memory = user_src_memory; + primitive reorder_src; + bool is_src_reordered = false; + if (memory::primitive_desc(conv_bwd_weights_pd.src_primitive_desc()) != + user_src_memory.get_primitive_desc()) { + src_memory = memory(conv_bwd_weights_pd.src_primitive_desc()); + reorder_src = reorder(user_src_memory, src_memory); + is_src_reordered = true; + } + + auto diff_dst_memory_4filter = user_diff_dst_memory; + primitive reorder_diff_dst_4filter; + bool is_diff_dst_reordered_4filter = false; + if (memory::primitive_desc( + conv_bwd_weights_pd.diff_dst_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory_4filter = + memory(conv_bwd_weights_pd.diff_dst_primitive_desc()); + reorder_diff_dst_4filter = + reorder(user_diff_dst_memory, diff_dst_memory_4filter); + is_diff_dst_reordered_4filter = true; + } + + // create mkldnn memory for output (i.e. diff weights) auto diff_weights_memory = - mkldnn::memory({diff_weights_md, mkldnn_engine}, - reinterpret_cast(filter_grad_data)); - auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, - reinterpret_cast(const_cast(input_data))); + memory(conv_bwd_weights_pd.diff_weights_primitive_desc(), + reinterpret_cast(filter_grad_data)); // create backward conv primitive for weights - auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights( - conv_bwd_weights_pd, src_memory, diff_dst_memory, - diff_weights_memory); + auto conv_bwd_weights_prim = + conv_bwd_weights(conv_bwd_weights_pd, src_memory, + diff_dst_memory_4filter, diff_weights_memory); // push primitive and execute it - std::vector pipeline{conv_bwd_weights_prim}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline; + if (is_src_reordered) pipeline.push_back(reorder_src); + if (is_diff_dst_reordered_4filter) + pipeline.push_back(reorder_diff_dst_4filter); + pipeline.push_back(conv_bwd_weights_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + filter_grad->set_layout(DataLayout::kMKLDNN); + filter_grad->set_format(GetMKLDNNFormat(diff_weights_memory)); } if (input_grad) { - // create primitive descriptor - mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd = - ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md, - strides, paddings, *conv_pd, mkldnn_engine); - - // create memory - auto diff_src_memory = mkldnn::memory( - {diff_src_md, mkldnn_engine}, - reinterpret_cast(const_cast(input_grad_data))); - auto weights_memory = - mkldnn::memory({weights_md, mkldnn_engine}, - reinterpret_cast(const_cast(filter_data))); + // create backward convolution primitive descriptor + auto conv_bwd_data_desc = conv_bwd_data::desc( + mkldnn::convolution_direct, diff_src_md, 
weights_md, diff_dst_md, + strides, paddings, paddings, mkldnn::padding_kind::zero); + auto conv_bwd_data_pd = conv_bwd_data::primitive_desc( + conv_bwd_data_desc, mkldnn_engine, *conv_pd); + + // create reorder primitive if the input format is not the preferred one + auto weights_memory = user_weights_memory; + primitive reorder_weights; + bool is_weights_reordered = false; + if (memory::primitive_desc(conv_bwd_data_pd.weights_primitive_desc()) != + user_weights_memory.get_primitive_desc()) { + weights_memory = memory(conv_bwd_data_pd.weights_primitive_desc()); + reorder_weights = reorder(user_weights_memory, weights_memory); + is_weights_reordered = true; + } + + auto diff_dst_memory_4data = user_diff_dst_memory; + primitive reorder_diff_dst_4data; + bool is_diff_dst_reordered_4data = false; + if (memory::primitive_desc(conv_bwd_data_pd.diff_dst_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory_4data = + memory(conv_bwd_data_pd.diff_dst_primitive_desc()); + reorder_diff_dst_4data = + reorder(user_diff_dst_memory, diff_dst_memory_4data); + is_diff_dst_reordered_4data = true; + } + + // create mkldnn memory for output (i.e. diff src) + auto diff_src_memory = memory(conv_bwd_data_pd.diff_src_primitive_desc(), + reinterpret_cast(input_grad_data)); // create backward conv primitive for data - auto conv_bwd_data_prim = mkldnn::convolution_backward_data( - conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory); + auto conv_bwd_data_prim = + conv_bwd_data(conv_bwd_data_pd, diff_dst_memory_4data, weights_memory, + diff_src_memory); - // push primitive to stream and wait until it's executed - std::vector pipeline{conv_bwd_data_prim}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + // push primitive and execute it + std::vector pipeline; + if (is_weights_reordered) pipeline.push_back(reorder_weights); + if (is_diff_dst_reordered_4data) + pipeline.push_back(reorder_diff_dst_4data); + pipeline.push_back(conv_bwd_data_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + input_grad->set_layout(DataLayout::kMKLDNN); + input_grad->set_format(GetMKLDNNFormat(diff_src_memory)); } } // Compute() - - private: - mkldnn::convolution_backward_weights::primitive_desc - ConvBwdWeightsPrimitiveDesc( - const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights, - const mkldnn::memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const mkldnn::convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine) const { - auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc( - mkldnn::convolution_direct, src, diff_weights, diff_dst, strides, - paddings, paddings, mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_weights::primitive_desc( - conv_bwd_weights_desc, engine, conv_pd); - } - - mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc( - const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights, - const mkldnn::memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const mkldnn::convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine) const { - auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc( - mkldnn::convolution_direct, diff_src, weights, diff_dst, strides, - paddings, paddings, mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc, - engine, conv_pd); - } }; } // namespace 
operators diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 27f1313116..37153d5843 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -75,9 +75,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library{framework::LibraryType::kPlain}; - - std::string data_format = ctx.Attr("data_format"); // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index 6b8373b150..02ffe3651e 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -34,6 +34,12 @@ void GRPCClient::InitEventLoop() { client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this))); } +void GRPCClient::SendComplete() { + for (auto& it : channels_) { + this->AsyncSendComplete(it.first); + } +} + GRPCClient::~GRPCClient() { Wait(); cq_.Shutdown(); @@ -210,6 +216,19 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep, req_count_++; } +void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { + const auto ch = GetChannel(ep); + + BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); + s->Prepare(time_out); + + sendrecv::VariableMessage req; + req.set_varname(COMPLETE_MESSAGE); + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; +} + void GRPCClient::Wait() { std::unique_lock lk(sync_mutex_); sync_cond_.wait(lk, [this] { return req_count_ == 0; }); diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h index 8db73f875e..44000c028b 100644 --- a/paddle/fluid/operators/detail/grpc_client.h +++ b/paddle/fluid/operators/detail/grpc_client.h @@ -195,6 +195,8 @@ class GRPCClient : public RPCClient { void Wait() override; + void SendComplete() override; + protected: void InitImpl() override; @@ -204,6 +206,9 @@ class GRPCClient : public RPCClient { void Proceed(); + void AsyncSendComplete(const std::string& ep, + int64_t time_out = RPCClient::rpc_time_out); + std::shared_ptr GetChannel(const std::string& ep); private: diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 18651544a1..5a87258901 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -162,16 +162,19 @@ class RequestPrefetch final : public RequestBase { void Process() override { // prefetch process... - std::string varname = request_->OutVarname(); - VLOG(3) << "RequestPrefetch " << varname; + std::string in_var_name = request_->Varname(); + std::string out_var_name = request_->OutVarname(); + VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name + << " out_var_name: " << out_var_name; auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(varname); - framework::Variable* outvar = nullptr; + auto invar = scope->FindVar(in_var_name); + // out var must be created in local scope! 
+ framework::Variable* outvar = scope->Var(out_var_name); - request_handler_->Handle(varname, scope, invar, &outvar); + request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name); - SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), + SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), &reply_); Finish(reply_, &responder_); } @@ -287,7 +290,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, } else if (rpc_name == kRequestPrefetch) { b = new RequestPrefetch(&service_, cq.get(), handler, req_id); } else { - PADDLE_ENFORCE(false, "not surpported rpc"); + PADDLE_ENFORCE(false, "not supported rpc"); } reqs[req_id] = b; diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/detail/request_handler.h index fa979024e3..a2d08747d5 100644 --- a/paddle/fluid/operators/detail/request_handler.h +++ b/paddle/fluid/operators/detail/request_handler.h @@ -40,6 +40,7 @@ constexpr char kRequestPrefetch[] = "RequestPrefetch"; #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" +#define COMPLETE_MESSAGE "COMPLETE@RECV" class RPCServer; @@ -60,9 +61,12 @@ class RequestHandler { void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } void SetProgram(framework::ProgramDesc* program) { program_ = program; } void SetExecutor(framework::Executor* executor) { executor_ = executor; } + + // Used for dist lookup table prefetch void SetPrefetchPreparedCtx( - std::unique_ptr prepared) { - prefetch_ctx_.reset(prepared.release()); + std::unordered_map< + std::string, std::shared_ptr>* g) { + prefetch_var_name_to_prepared_ctx_ = g; } // Used for async. @@ -78,9 +82,6 @@ class RequestHandler { bool sync_mode() { return sync_mode_; } framework::Scope* scope() { return scope_; } const platform::DeviceContext* dev_ctx() { return dev_ctx_; } - framework::ExecutorPrepareContext* prefetch_ctx() { - return prefetch_ctx_.get(); - } framework::ProgramDesc* program() { return program_; } framework::Executor* executor() { return executor_; } @@ -99,8 +100,8 @@ class RequestHandler { // *request_handler_->dev_ctx(), &reply_); // } virtual bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, - framework::Variable** outvar) = 0; + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") = 0; protected: const bool sync_mode_; @@ -109,12 +110,17 @@ class RequestHandler { framework::Executor* executor_; framework::Scope* scope_; framework::ProgramDesc* program_; - std::unique_ptr prefetch_ctx_; + + // used for distribute lookup table prefetch + std::unordered_map>* + prefetch_var_name_to_prepared_ctx_; // Used for async. 
std::unordered_map>* grad_to_prepared_ctx_; + RPCServer* rpc_server_; }; diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc index 5f1a346e93..7425bee798 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.cc +++ b/paddle/fluid/operators/detail/request_handler_impl.cc @@ -30,7 +30,8 @@ namespace detail { bool RequestSendHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, - framework::Variable** outvar) { + framework::Variable** outvar, + const std::string& out_var_name) { VLOG(4) << "RequestSendHandler:" << varname; // Async @@ -49,6 +50,9 @@ bool RequestSendHandler::Handle(const std::string& varname, if (varname == BATCH_BARRIER_MESSAGE) { VLOG(3) << "sync: recv batch barrier message"; rpc_server_->IncreaseBatchBarrier(kRequestSend); + } else if (varname == COMPLETE_MESSAGE) { + VLOG(3) << "sync: recv complete message"; + rpc_server_->DecreaseClientNum(); } else { VLOG(3) << "sync: received var_name: " << varname; if (sync_mode_) { @@ -79,7 +83,8 @@ void RequestSendHandler::ResetSparseVarRecorder() { bool RequestGetHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, - framework::Variable** outvar) { + framework::Variable** outvar, + const std::string& out_var_name) { VLOG(4) << "RequestGetHandler:" << varname; if (varname != FETCH_BARRIER_MESSAGE) { @@ -102,13 +107,14 @@ bool RequestGetHandler::Handle(const std::string& varname, bool RequestPrefetchHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, - framework::Variable** outvar) { + framework::Variable** outvar, + const std::string& out_var_name) { VLOG(4) << "RequestPrefetchHandler " << varname; - auto var_desc = program_->Block(0).FindVar(varname); - *outvar = scope->FindVar(varname); + auto var_desc = program_->Block(0).FindVar(out_var_name); InitializeVariable(*outvar, var_desc->GetType()); - executor_->RunPreparedContext(prefetch_ctx_.get(), scope); + executor_->RunPreparedContext( + (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope); return true; } diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/detail/request_handler_impl.h index c392267cfa..3f77c09a95 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.h +++ b/paddle/fluid/operators/detail/request_handler_impl.h @@ -39,7 +39,8 @@ class RequestSendHandler final : public RequestHandler { explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {} virtual ~RequestSendHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar) override; + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; void ResetSparseVarRecorder(); private: @@ -52,7 +53,8 @@ class RequestGetHandler final : public RequestHandler { explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {} virtual ~RequestGetHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar) override; + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; }; class RequestPrefetchHandler final : public RequestHandler { @@ -60,7 +62,8 @@ class RequestPrefetchHandler final : public RequestHandler { explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {} 
virtual ~RequestPrefetchHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar) override; + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; }; } // namespace detail diff --git a/paddle/fluid/operators/detail/rpc_client.h b/paddle/fluid/operators/detail/rpc_client.h index 7e76ac0348..47c6ffb4fd 100644 --- a/paddle/fluid/operators/detail/rpc_client.h +++ b/paddle/fluid/operators/detail/rpc_client.h @@ -53,6 +53,11 @@ class RPCClient { virtual void AsyncSendFetchBarrier(const std::string& ep, int64_t time_out = rpc_time_out) = 0; + // SendComplete tells all the servers that the current trainer has no more + // data to train, so that the pserver can reduce its barrier count and + // continue training with the other trainers. + virtual void SendComplete() = 0; + virtual void Wait() = 0; static constexpr int64_t rpc_time_out = 120 * 1000; diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/detail/rpc_server.cc index 448763372a..cd0fe96e23 100644 --- a/paddle/fluid/operators/detail/rpc_server.cc +++ b/paddle/fluid/operators/detail/rpc_server.cc @@ -43,7 +43,7 @@ void RPCServer::SavePort() const { void RPCServer::WaitBarrier(const std::string& rpc_name) { std::unique_lock lock(this->mutex_); - barrier_cond_.wait(lock, [=] { + barrier_cond_.wait(lock, [this, &rpc_name] { return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load()); }); @@ -53,19 +53,23 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) { void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; int b = 0; - { - std::unique_lock lock(mutex_); - b = ++barrier_counter_[rpc_name]; - } - - VLOG(3) << "RPCServer IncreaseBatchBarrier " << rpc_name - << ", barrier_count:" << b << ", fan_in" << client_num_; - + std::unique_lock lock(mutex_); + b = ++barrier_counter_[rpc_name]; if (b >= client_num_) { + lock.unlock(); barrier_cond_.notify_all(); + lock.lock(); } } +void RPCServer::DecreaseClientNum() { + { + std::unique_lock lock(mutex_); + client_num_--; + } + barrier_cond_.notify_all(); +} + void RPCServer::ResetBarrierCounter() { VLOG(3) << "RPCServer ResetBarrierCounter "; std::unique_lock lock(mutex_); diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/detail/rpc_server.h index f809c13c72..2e3342428c 100644 --- a/paddle/fluid/operators/detail/rpc_server.h +++ b/paddle/fluid/operators/detail/rpc_server.h @@ -60,7 +60,7 @@ class RPCServer { void SetCond(const std::string& rpc_name); void WaitCond(const std::string& rpc_name); void IncreaseBatchBarrier(const std::string rpc_name); - + void DecreaseClientNum(); void ResetBarrierCounter(); protected: @@ -79,8 +79,7 @@ class RPCServer { std::string bind_address_; std::atomic exit_flag_; int selected_port_; - - const int client_num_; + int client_num_; std::unordered_map rpc_call_map_; std::unordered_map rpc_thread_num_; diff --git a/paddle/fluid/operators/detail/rpc_server_test.cc b/paddle/fluid/operators/detail/rpc_server_test.cc index f49274a7b5..463a7b80cf 100644 --- a/paddle/fluid/operators/detail/rpc_server_test.cc +++ b/paddle/fluid/operators/detail/rpc_server_test.cc @@ -98,11 +98,17 @@ void StartServer() { framework::Executor exe(place); platform::CPUDeviceContext ctx(place); auto* block = AppendPrefetchBlcok(&program); - auto prepared = exe.Prepare(program, block->ID()); + std::string in_var_name("ids");
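Why DecreaseClientNum must notify_all after decrementing: a trainer that sends COMPLETE_MESSAGE permanently lowers the fan-in, and any server thread already blocked in WaitBarrier has to re-check its predicate against the smaller client_num_. A rough Python sketch of the same condition-variable pattern (names hypothetical, single-process only):

```python
import threading

class Barrier:
    def __init__(self, client_num):
        self.client_num = client_num
        self.counter = {}          # rpc_name -> arrivals this round
        self.cond = threading.Condition()

    def increase(self, rpc_name):
        with self.cond:
            self.counter[rpc_name] = self.counter.get(rpc_name, 0) + 1
            if self.counter[rpc_name] >= self.client_num:
                self.cond.notify_all()

    def decrease_client_num(self):
        # A trainer finished its data: shrink the fan-in and wake all
        # waiters so they re-evaluate the predicate.
        with self.cond:
            self.client_num -= 1
            self.cond.notify_all()

    def wait(self, rpc_name):
        with self.cond:
            self.cond.wait_for(
                lambda: self.counter.get(rpc_name, 0) >= self.client_num)

b = Barrier(2)
b.increase("RequestSend")
b.decrease_client_num()  # second trainer sent COMPLETE_MESSAGE
b.wait("RequestSend")    # returns: 1 arrival >= 1 remaining client
print("barrier released")
```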
+ std::vector prefetch_block_ids{block->ID()}; + auto prepared = exe.Prepare(program, prefetch_block_ids); InitTensorsOnServer(&scope, &place, 10); + std::unordered_map> + prefetch_var_name_to_prepared; + prefetch_var_name_to_prepared[in_var_name] = prepared[0]; g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(std::move(prepared)); + g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); g_req_handler->SetDevCtx(&ctx); g_req_handler->SetScope(&scope); g_req_handler->SetExecutor(&exe); diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index 0803a6035d..12364fff96 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -66,40 +66,41 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(-1) .EqualGreaterThan(-1); AddComment(string::Sprintf(R"DOC( -Limited Elementwise %s Operator. +Limited Elementwise %s Operator The equation is: $$%s$$ -$X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be -smaller than or equal to the dimensions of $X$. +- $X$: a tensor of any dimension. +- $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$. There are two cases for this operator: -1. The shape of $Y$ is same with $X$; -2. The shape of $Y$ is a congiguous subsequencet of $X$. The trailing dimensions - of size 1 for $Y$ will be ignored for the consideration of subsequence. +1. The shape of $Y$ is the same with $X$. +2. The shape of $Y$ is a continuous subsequence of $X$. For case 2: -$Y$ will be broadcasted to match the shape of $X$ and axis should be -set to index of the start dimension to broadcast $Y$ onto $X$. +1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index + for broadcasting $Y$ onto $X$. +2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$. +3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of + subsequence, such as shape(Y) = (2, 1) => (2). -If axis is -1, it is treated as axis=rank(X)-rank(Y). +For example: -For example .. code-block:: python shape(X) = (2, 3, 4, 5), shape(Y) = (,) shape(X) = (2, 3, 4, 5), shape(Y) = (5,) - shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2 shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 -Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details) -information. However, the output only shares the LoD information with input $X$. +The inputs $X$ and $Y$ can carry the different LoD information. +But the output only shares the LoD information with the input $X$. 
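The rewritten elementwise doc is easy to check against a concrete axis computation. A small NumPy sketch of the stated rule, assuming the documented semantics (axis = rank(X) - rank(Y) when axis is -1, trailing size-1 dimensions of Y ignored); this is an illustration, not Paddle's implementation:

```python
import numpy as np

def broadcast_y_to_x(x, y, axis=-1):
    # Drop trailing dimensions of size 1 from Y, per case 2 in the doc.
    y_shape = list(y.shape)
    while y_shape and y_shape[-1] == 1:
        y_shape.pop()
    if axis == -1:
        axis = x.ndim - len(y_shape)
    # Pad Y's shape with 1s so NumPy broadcasting aligns it with X
    # starting at `axis`.
    full = [1] * axis + y_shape + [1] * (x.ndim - axis - len(y_shape))
    return x + y.reshape(full)

x = np.zeros((2, 3, 4, 5))
print(broadcast_y_to_x(x, np.ones((4, 5))).shape)          # axis=-1 -> axis=2
print(broadcast_y_to_x(x, np.ones((3, 4)), axis=1).shape)  # (2, 3, 4, 5)
print(broadcast_y_to_x(x, np.ones((2, 1)), axis=0).shape)  # trailing 1 dropped
```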
)DOC", GetName(), GetEquation())); diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc index 111e58844c..f824eee4e7 100644 --- a/paddle/fluid/operators/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/gen_nccl_id_op.cc @@ -67,6 +67,10 @@ class GenNCCLIdOp : public framework::OperatorBase { client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME); } client->Wait(); + for (auto& ep : endpoint_list) { + client->AsyncSendBatchBarrier(ep); + } + client->Wait(); VLOG(3) << "sending completed..."; } diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 7610600440..4d12278799 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -96,19 +96,22 @@ static int64_t GetTimestamp() { return tp.tv_sec * 1000 + tp.tv_usec / 1000; } -void ListenAndServOp::RunSyncLoop(framework::Executor *executor, - framework::ProgramDesc *program, - framework::Scope *recv_scope, - framework::BlockDesc *prefetch_block) const { +void ListenAndServOp::RunSyncLoop( + framework::Executor *executor, framework::ProgramDesc *program, + framework::Scope *recv_scope, + const std::vector &prefetch_block_id_list) const { size_t num_blocks = program->Size(); PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 blocks"); - std::vector block_list; - for (size_t blkid = 1; blkid < num_blocks; ++blkid) { - block_list.push_back(blkid); + std::vector optimize_block_id_list; + for (int blkid = 1; blkid < num_blocks; ++blkid) { + if (std::find(prefetch_block_id_list.begin(), prefetch_block_id_list.end(), + blkid) == prefetch_block_id_list.end()) { + optimize_block_id_list.push_back(blkid); + } } - auto optimize_prepared = executor->Prepare(*program, block_list); + auto optimize_prepared = executor->Prepare(*program, optimize_block_id_list); // Insert placeholder for block0 which holds current op itself. optimize_prepared.insert( optimize_prepared.begin(), @@ -135,16 +138,17 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, std::vector parallel_blkids; parallel_blkids.push_back(1); double ts = GetTimestamp(); - for (size_t blkid = 2; blkid < num_blocks; ++blkid) { - if (blkid != static_cast(prefetch_block->ID())) { - if (program->Block(blkid).Parent() != last_parent_blkid) { - ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, - program, recv_scope); - parallel_blkids.clear(); - last_parent_blkid = program->Block(blkid).Parent(); - } - parallel_blkids.push_back(blkid); + for (size_t i = 1; i < optimize_block_id_list.size(); ++i) { + // skip the first optimize block because it is already in the + // parallel_blkids. 
+ int blkid = optimize_block_id_list[i]; + if (program->Block(blkid).Parent() != last_parent_blkid) { + ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, + program, recv_scope); + parallel_blkids.clear(); + last_parent_blkid = program->Block(blkid).Parent(); } + parallel_blkids.push_back(blkid); } ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); @@ -210,18 +214,19 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, } // while(true) } -static void FillRequestCtx(detail::RequestHandler *h, framework::Scope *scope, - platform::DeviceContext *dev_ctx, - framework::Executor *executor, - framework::ProgramDesc *program, - framework::ExecutorPrepareContext *prefetch_ctx, - detail::RPCServer *rpc_server) { +static void FillRequestCtx( + detail::RequestHandler *h, framework::Scope *scope, + platform::DeviceContext *dev_ctx, framework::Executor *executor, + framework::ProgramDesc *program, + std::unordered_map> + *prefetch_ctx, + detail::RPCServer *rpc_server) { h->SetScope(scope); h->SetDevCtx(dev_ctx); h->SetExecutor(executor); h->SetProgram(program); - h->SetPrefetchPreparedCtx( - std::unique_ptr(prefetch_ctx)); + h->SetPrefetchPreparedCtx(prefetch_ctx); h->SetRPCServer(rpc_server); } @@ -255,17 +260,42 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, request_prefetch_handler_.get()); auto *optimize_block = Attr(kOptimizeBlock); - auto *prefetch_block = Attr(kPrefetchBlock); auto *program = optimize_block->Program(); framework::Executor executor(dev_place); // prepare for prefetch - VLOG(3) << "prefetch block id is " << prefetch_block->ID(); - auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID()); + std::vector prefetch_block_id_list; + std::unordered_map block_id_to_prefetch_var_name; + + auto prefetch_var_name_to_block_id_str = + Attr>(kPrefetchVarNameToBlockId); + for (const auto &prefetch_var_name_and_id : + prefetch_var_name_to_block_id_str) { + std::vector pieces; + split(prefetch_var_name_and_id, ':', &pieces); + VLOG(3) << "after split, prefetch_var = " << pieces[0] + << ", id=" << pieces[1]; + PADDLE_ENFORCE_EQ(pieces.size(), 2); + + int block_id = std::stoi(pieces[1]); + prefetch_block_id_list.push_back(block_id); + block_id_to_prefetch_var_name[block_id] = pieces[0]; + } + + auto prefetch_prepared = executor.Prepare(*program, prefetch_block_id_list); + + std::unordered_map> + prefetch_var_name_to_prepared_ctx; + for (size_t i = 0; i < prefetch_block_id_list.size(); ++i) { + auto block_id = prefetch_block_id_list[i]; + auto prefetch_var_name = block_id_to_prefetch_var_name[block_id]; + prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i]; + } auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, - &dev_ctx, &executor, program, prefetch_prepared.release(), - rpc_service_.get()); + &dev_ctx, &executor, program, + &prefetch_var_name_to_prepared_ctx, rpc_service_.get()); f(request_send_handler_.get()); f(request_get_handler_.get()); @@ -283,7 +313,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, // Write to a file of server selected port for python use. 
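The listen_and_serv changes above parse each "var_name:block_id" attribute string into a block-id list and a block-id-to-var-name map. The equivalent parsing step in Python (entry values hypothetical, mirroring the split and PADDLE_ENFORCE_EQ above):

```python
def parse_prefetch_attr(entries):
    """entries look like ["emb@shard0:2", "emb@shard1:3"]."""
    prefetch_block_ids = []
    block_id_to_var_name = {}
    for entry in entries:
        pieces = entry.split(":")
        assert len(pieces) == 2, entry  # mirrors PADDLE_ENFORCE_EQ
        block_id = int(pieces[1])
        prefetch_block_ids.append(block_id)
        block_id_to_var_name[block_id] = pieces[0]
    return prefetch_block_ids, block_id_to_var_name

ids, names = parse_prefetch_attr(["emb@shard0:2", "emb@shard1:3"])
print(ids, names)  # [2, 3] {2: 'emb@shard0', 3: 'emb@shard1'}
```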
SavePort(); if (sync_mode) { - RunSyncLoop(&executor, program, &recv_scope, prefetch_block); + RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list); } else { RunAsyncLoop(&executor, program); } @@ -309,8 +339,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("sync_mode", "if works at sync_mode or not").SetDefault(true); AddAttr(kOptimizeBlock, "BlockID to run on server side."); - AddAttr(kPrefetchBlock, - "prefetch block to run on server side."); + AddAttr>(kPrefetchVarNameToBlockId, + "prefetch blocks to run on server side.") + .SetDefault({}); AddAttr("Fanin", "How many clients send to this server.") .SetDefault(1); } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 87952cb0e6..46c3a19e20 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -30,7 +31,7 @@ namespace paddle { namespace operators { constexpr char kOptimizeBlock[] = "OptimizeBlock"; -constexpr char kPrefetchBlock[] = "PrefetchBlock"; +constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; void RunServer(std::shared_ptr service); @@ -46,7 +47,7 @@ class ListenAndServOp : public framework::OperatorBase { void RunSyncLoop(framework::Executor* executor, framework::ProgramDesc* program, framework::Scope* recv_scope, - framework::BlockDesc* prefetch_block) const; + const std::vector& prefetch_block_id_list) const; void RunAsyncLoop(framework::Executor* executor, framework::ProgramDesc* program) const; diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 1a37cb39d5..6207d14ecd 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -20,13 +20,16 @@ #ifdef PADDLE_WITH_MKLML #include #include +#include #include #endif #ifdef PADDLE_USE_OPENBLAS #include +#ifdef LAPACK_FOUND #include #endif +#endif #ifndef LAPACK_FOUND extern "C" { @@ -46,6 +49,18 @@ namespace paddle { namespace operators { namespace math { +static void SetNumThreads(int num_threads) { +#ifdef PADDLE_USE_OPENBLAS + int real_num_threads = num_threads > 1 ? num_threads : 1; + openblas_set_num_threads(real_num_threads); +#elif defined(PADDLE_WITH_MKLML) + int real_num_threads = num_threads > 1 ? num_threads : 1; + mkl_set_num_threads(real_num_threads); +#else + PADDLE_ENFORCE(false, "To be implemented."); +#endif +} + /** * Matrix Descriptor of a memory buffer. * diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index d4b0e17ed4..8b296b6a07 100644 --- a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -21,8 +21,10 @@ limitations under the License. */ #ifdef PADDLE_USE_OPENBLAS #include +#ifdef LAPACK_FOUND #include #endif +#endif #ifndef LAPACK_FOUND extern "C" { diff --git a/paddle/fluid/operators/mean_iou_op.cc b/paddle/fluid/operators/mean_iou_op.cc new file mode 100644 index 0000000000..a60f245f53 --- /dev/null +++ b/paddle/fluid/operators/mean_iou_op.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/mean_iou_op.h" + +namespace paddle { +namespace operators { + +class MeanIoUOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Predictions"), + "Input (Predictions) of MeanIoU op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input (Labels) of MeanIoU op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutMeanIou"), + "Output (OutMeanIou) of MeanIoU op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutWrong"), + "Output (OutWrong) of MeanIoU op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutCorrect"), + "Output (OutCorrect) of MeanIoU op should not be null."); + + int64_t num_classes = + static_cast(ctx->Attrs().Get("num_classes")); + + ctx->SetOutputDim("OutMeanIou", {1}); + ctx->SetOutputDim("OutWrong", {num_classes}); + ctx->SetOutputDim("OutCorrect", {num_classes}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Predictions")->type()), + ctx.GetPlace()); + } +}; + +class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Predictions", + "(Tensor), A Tensor of prediction results for semantic labels" + " with type int32 or int64. The rank should be greater than 1."); + AddInput( + "Labels", + "(Tensor), A Tensor of ground truth labels with type int32 or int64. " + "Its shape should be the same as Input(Predictions)."); + AddInput("InWrongs", + "(vector), A list of Tensor with shape " + "[num_classes]. They are used to collect wrong number among " + "batches. Empty list is also valid here.") + .AsDuplicable() + .AsDispensable(); + AddInput( + "InCorrects", + "(vector), A list of Tensor with shape " + "[num_classes]. They are used to collect correct number among batches. " + "Empty list is also valid here.") + .AsDuplicable() + .AsDispensable(); + AddInput("InMeanIou", + "(vector), A list of Tensor that Output(mean_iou) should " + "be added to. Empty list is also valid here.") + .AsDuplicable() + .AsDispensable(); + AddOutput("OutMeanIou", + "(vector), A Tensor representing the" + " mean intersection-over-union with shape [1]."); + AddOutput("OutWrong", "(Tensor), A Tensor with shape [num_classes]. "); + AddOutput("OutCorrect", "(Tensor), A Tensor with shape [num_classes]. "); + AddAttr("num_classes", "(int), The possible number of labels."); + + AddComment(R"DOC( +mean-IOU Operator. +Mean Intersection-Over-Union is a common evaluation metric for +semantic image segmentation, which first computes the IOU for each +semantic class and then computes the average over classes. +IOU is defined as follows: + IOU = true_positive / (true_positive + false_positive + false_negative). +It is based on pixel level area while "IOU Similarity Operator" +is based on area of rectangle.
+ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(mean_iou, ops::MeanIoUOp, ops::MeanIoUOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(mean_iou, ops::MeanIoUKernel, + ops::MeanIoUKernel, + ops::MeanIoUKernel); diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu new file mode 100644 index 0000000000..83bb4dde46 --- /dev/null +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -0,0 +1,164 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/mean_iou_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void CountCUDAKernel(const int num_classes, const int count, + const T* predictions, const T* labels, + int* wrong, int* correct) { + extern __shared__ int blcok_cache[]; + int* wrong_c = blcok_cache; + int* correct_c = blcok_cache + num_classes; + // init cache + for (int i = threadIdx.x; i < num_classes * 2; i += blockDim.x) { + blcok_cache[i] = 0; + } + __syncthreads(); + + T pred; + T label; + CUDA_1D_KERNEL_LOOP(i, count) { + pred = predictions[i]; + label = labels[i]; + if (pred == label) { + atomicAdd(correct_c + pred, 1); + } else { + atomicAdd(wrong_c + pred, 1); + atomicAdd(wrong_c + label, 1); + } + } + + __syncthreads(); + + for (int i = threadIdx.x; i < num_classes; i += blockDim.x) { + atomicAdd(wrong + i, wrong_c[i]); + atomicAdd(correct + i, correct_c[i]); + } +} + +__global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong, + int* correct, float* ious, float* iou) { + __shared__ int valid_count_c; + if (threadIdx.x == 0) { + valid_count_c = 0; + } + __syncthreads(); + CUDA_1D_KERNEL_LOOP(i, num_classes) { + int wrong_n = wrong[i]; + int correct_n = correct[i]; + int denominator = wrong_n + correct_n; + if (denominator > 0) { + atomicAdd(&valid_count_c, 1); + ious[i] = static_cast(correct_n) / denominator; + } else { + ious[i] = 0; + } + } + __syncthreads(); + if (threadIdx.x == 0) { + float iou_sum = 0; + for (int i = 0; i < num_classes; ++i) { + iou_sum += ious[i]; + } + iou[0] += iou_sum / valid_count_c; + } +} + +template +class MeanIoUCUDAOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& place = *ctx.template device_context() + .eigen_device(); + // get input and output tensor + auto* predictions = ctx.Input("Predictions"); + auto* labels = ctx.Input("Labels"); + auto* out_mean_iou = ctx.Output("OutMeanIou"); + auto* out_wrong = ctx.Output("OutWrong"); + auto* out_correct = ctx.Output("OutCorrect"); + int num_classes = 
static_cast(ctx.Attr("num_classes")); + + // Get data ptr + const T* predictions_data = predictions->data(); + const T* labels_data = labels->data(); + int* out_wrong_data = out_wrong->mutable_data(ctx.GetPlace()); + int* out_correct_data = out_correct->mutable_data(ctx.GetPlace()); + float* out_mean_iou_data = + out_mean_iou->mutable_data(ctx.GetPlace()); + + // Get Eigen tensor + auto out_mean_iou_t = EigenTensor::From(*out_mean_iou); + auto out_wrong_t = EigenTensor::From(*out_wrong); + auto out_correct_t = EigenTensor::From(*out_correct); + + // Temporary tensor + Tensor ious; + float* ious_data = ious.mutable_data( + {static_cast(num_classes)}, ctx.GetPlace()); + auto ious_t = EigenTensor::From(ious); + + // Init out_wrong, out_correct and out_mean_iou + out_wrong_t.device(place) = out_wrong_t.constant(0); + out_correct_t.device(place) = out_correct_t.constant(0); + out_mean_iou_t.device(place) = out_mean_iou_t.constant(0.0f); + + // collect pre wrong, correct and mean_iou + auto in_mean_ious = ctx.MultiInput("InMeanIou"); + for (int i = 0; i < in_mean_ious.size(); ++i) { + out_mean_iou_t.device(place) += + EigenTensor::From(*in_mean_ious[i]); + } + auto in_wrongs = ctx.MultiInput("InWrongs"); + for (int i = 0; i < in_wrongs.size(); ++i) { + out_wrong_t.device(place) += EigenTensor::From(*in_wrongs[i]); + } + auto in_corrects = ctx.MultiInput("InCorrects"); + for (int i = 0; i < in_corrects.size(); ++i) { + out_correct_t.device(place) += EigenTensor::From(*in_corrects[i]); + } + // compute + auto stream = ctx.cuda_device_context().stream(); + int block = PADDLE_CUDA_NUM_THREADS; + int grid = (predictions->numel() + block - 1) / block; + int cache_size = (num_classes * 2 + 1) * sizeof(int); + CountCUDAKernel<<>>( + num_classes, predictions->numel(), predictions_data, labels_data, + out_wrong_data, out_correct_data); + ctx.device_context().Wait(); + ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data, + out_correct_data, ious_data, + out_mean_iou_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(mean_iou, ops::MeanIoUCUDAOpKernel, + ops::MeanIoUCUDAOpKernel, + ops::MeanIoUCUDAOpKernel); diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h new file mode 100644 index 0000000000..9fa00e60e0 --- /dev/null +++ b/paddle/fluid/operators/mean_iou_op.h @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +using EigenTensor = framework::EigenTensor; + +template +class MeanIoUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& place = *ctx.template device_context() + .eigen_device(); + // get input and output tensor + auto* predictions = ctx.Input("Predictions"); + auto* labels = ctx.Input("Labels"); + auto* out_mean_iou = ctx.Output("OutMeanIou"); + auto* out_wrong = ctx.Output("OutWrong"); + auto* out_correct = ctx.Output("OutCorrect"); + int num_classes = static_cast(ctx.Attr("num_classes")); + + // get data ptr + const T* predictions_data = predictions->data(); + const T* labels_data = labels->data(); + float* out_mean_iou_data = + out_mean_iou->mutable_data(ctx.GetPlace()); + int* out_wrong_data = out_wrong->mutable_data(ctx.GetPlace()); + int* out_correct_data = out_correct->mutable_data(ctx.GetPlace()); + + // get eigen tensor + auto out_mean_iou_t = EigenTensor::From(*out_mean_iou); + auto out_wrong_t = EigenTensor::From(*out_wrong); + auto out_correct_t = EigenTensor::From(*out_correct); + + // Tmp tensor + Tensor denominator; + Tensor valid_count; + Tensor iou_sum; + + // get data ptr of tmp tensor + int* denominator_data = denominator.mutable_data( + {static_cast(num_classes)}, ctx.GetPlace()); + int* valid_count_data = valid_count.mutable_data({1}, ctx.GetPlace()); + float* iou_sum_data = iou_sum.mutable_data({1}, ctx.GetPlace()); + + // get eigen tensor of tmp tensor + auto denominator_t = EigenTensor::From(denominator); + auto valid_count_t = EigenTensor::From(valid_count); + auto iou_sum_t = EigenTensor::From(iou_sum); + + // init out_wrong, out_correct and out_mean_iou + out_wrong_t = out_wrong_t.constant(0); + out_correct_t = out_correct_t.constant(0); + out_mean_iou_t = out_mean_iou_t.constant(0); + + // collect pre wrong, correct and mean_iou + auto in_mean_ious = ctx.MultiInput("InMeanIou"); + for (size_t i = 0; i < in_mean_ious.size(); ++i) { + out_mean_iou_t.device(place) += + EigenTensor::From(*in_mean_ious[i]); + } + auto in_wrongs = ctx.MultiInput("InWrongs"); + for (size_t i = 0; i < in_wrongs.size(); ++i) { + out_wrong_t.device(place) += EigenTensor::From(*in_wrongs[i]); + } + auto in_corrects = ctx.MultiInput("InCorrects"); + for (size_t i = 0; i < in_corrects.size(); ++i) { + out_correct_t.device(place) += EigenTensor::From(*in_corrects[i]); + } + + // compute + for (int64_t i = 0; i < predictions->numel(); ++i) { + if (predictions_data[i] == labels_data[i]) { + out_correct_data[predictions_data[i]] += 1; + } else { + out_wrong_data[labels_data[i]] += 1; + out_wrong_data[predictions_data[i]] += 1; + } + } + + denominator_t = out_wrong_t + out_correct_t; + valid_count_t = + (denominator_t > denominator_t.constant(0.0f)).cast().sum(); + + for (int i = 0; i < num_classes; ++i) { + if (denominator_data[i] == 0) { + denominator_data[i] = 1; + } + } + + iou_sum_t = + (out_correct_t.cast() / denominator_t.cast()).sum(); + out_mean_iou_data[0] += (iou_sum_data[0] / valid_count_data[0]); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/merge_ids_op.cc b/paddle/fluid/operators/merge_ids_op.cc new file mode 100644 index 0000000000..c6ec4ab047 --- /dev/null +++ b/paddle/fluid/operators/merge_ids_op.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
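The two mean_iou kernels above share one piece of arithmetic: per-class wrong/correct counts (optionally seeded from the InWrongs/InCorrects tensors of earlier batches), per-class IoU = correct / (wrong + correct), and a mean over the classes whose denominator is non-zero. A NumPy sketch of that computation, assuming it matches the kernels (an illustration, not the kernel itself):

```python
import numpy as np

def mean_iou(preds, labels, num_classes, wrong=None, correct=None):
    wrong = np.zeros(num_classes, np.int64) if wrong is None else wrong.copy()
    correct = np.zeros(num_classes, np.int64) if correct is None else correct.copy()
    for p, l in zip(preds.ravel(), labels.ravel()):
        if p == l:
            correct[p] += 1
        else:
            wrong[p] += 1   # false positive for class p
            wrong[l] += 1   # false negative for class l
    denom = wrong + correct  # == TP + FP + FN per class
    valid = denom > 0
    iou = np.where(valid, correct / np.maximum(denom, 1), 0.0)
    return iou[valid].mean(), wrong, correct

preds = np.array([0, 1, 1, 2])
labels = np.array([0, 1, 2, 2])
miou, w, c = mean_iou(preds, labels, num_classes=3)
print(miou)  # (1.0 + 0.5 + 0.5) / 3
```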
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/merge_ids_op.h" + +namespace paddle { +namespace operators { + +class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Ids", "(LoDTensor) the input ids with shape {batch_num, 1}"); + AddInput( + "X", + "(LoDTensors) multiple input tensors with shape {batch_num, N}, where " + "N is the size of the embedding table") + .AsDuplicable(); + AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors."); + + AddComment(R"DOC( +Merge multiple LoDTensors into one according to the shard number of Ids. + + +split_ids_op -> prefetch_op -> merge_ids_op + + +merge_ids_op should be used after split_ids_op and prefetch_op. split_ids_op +will split the input Ids into multiple tensors according to each Id's shard +number. prefetch_op will send them to the parameter server to prefetch the +embedding values back. During the split, the order of the ids is shuffled. In +merge_ids_op we use the original Ids to restore the order of the fetched +embedding values and also pass the LoD information to the merged output. + + +Example: + + Ids = [1,2,3,4,5,6] # 3 shards + +split_ids_op -> + + Id0 = [3, 6] # id % 3 == 0 + Id1 = [1, 4] # id % 3 == 1 + Id2 = [2, 5] # id % 3 == 2 + +prefetch_op -> + + X0 = [[0.3 0.3] # 3 + [0.6 0.6]] # 6 + X1 = [[0.1 0.1] # 1 + [0.4 0.4]] # 4 + X2 = [[0.2 0.2] # 2 + [0.5 0.5]] # 5 + +merge_ids_op -> + + Out = [[0.1 0.1] # 1 + [0.2 0.2] # 2 + [0.3 0.3] # 3 + [0.4 0.4] # 4 + [0.5 0.5] # 5 + [0.6 0.6]] # 6 +)DOC"); + } +}; + +class MergeIdsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Ids"), "MergeIdsOp must have input Ids."); + PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must have input X."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "MergeIdsOp must have output Out."); + + auto ids_var_type = ctx->GetInputsVarType("Ids").front(); + auto ids_dims = ctx->GetInputDim("Ids"); + if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); + } + auto x_var_type = ctx->GetInputsVarType("X"); + for (auto &var_type : x_var_type) { + PADDLE_ENFORCE_EQ(var_type, framework::proto::VarType::LOD_TENSOR, + "input X only supports LoDTensors"); + } + ctx->ShareLoD("Ids", "Out"); + } + + private: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.MultiInput("X").front()->type()), + ctx.GetPlace()); + } +}; + +class MergeIdsOpInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto *input_var = block->Var(op_desc.Input("Ids")[0]); + for (auto &out_var : op_desc.Output("Out")) { + block->Var(out_var)->SetType(input_var->GetType()); + } + } +}; + +} //
namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(merge_ids, ops::MergeIdsOp, ops::MergeIdsOpMaker, + ops::MergeIdsOpInferVarType); +REGISTER_OP_CPU_KERNEL( + merge_ids, ops::MergeIdsOpKernel); diff --git a/paddle/fluid/operators/merge_ids_op.h b/paddle/fluid/operators/merge_ids_op.h new file mode 100644 index 0000000000..83712a8519 --- /dev/null +++ b/paddle/fluid/operators/merge_ids_op.h @@ -0,0 +1,92 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +template +class MergeIdsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + if (!platform::is_cpu_place(place)) { + PADDLE_THROW("MergeIds do not support GPU kernel"); + } + VLOG(3) << "run in MergeIdsOpKernel"; + + const auto *ids_var = ctx.InputVar("Ids"); + PADDLE_ENFORCE(ids_var->IsType(), + "only support to merge Ids of LoDTensor"); + + const auto &ids_tensor = ids_var->Get(); + const auto &ids_dims = ids_tensor.dims(); + const int64_t *ids = ids_tensor.data(); + + auto x_tensors = ctx.MultiInput("X"); + + auto *out = ctx.Output("Out"); + + int batch_size = 0; + int embedding_size = 0; + for (auto &input : x_tensors) { + if (framework::product(input->dims()) != 0) { + if (embedding_size == 0) { + embedding_size = input->dims()[1]; + } + PADDLE_ENFORCE_EQ(embedding_size, input->dims()[1], + "embedding size of all input should be the same"); + batch_size += input->dims()[0]; + } + } + PADDLE_ENFORCE_EQ( + batch_size, ids_dims[0], + "the batch size of ids and merged embedding value should be the same"); + + const size_t shard_num = x_tensors.size(); + + if (shard_num == 1) { + VLOG(3) << "only one shard, we can copy the data directly"; + TensorCopy(*x_tensors[0], place, out); + } else { + std::vector in_indexs(shard_num, 0); + auto *out_data = out->mutable_data( + framework::make_ddim({batch_size, embedding_size}), place); + // copy data from ins[shard_num] to out. 
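The copy loop that follows walks the original Ids once; each id pulls the next unread row from its shard's tensor, which restores the original order. The same logic in a few lines of Python (illustrative, mirroring the DOC example above, not the kernel itself):

```python
def merge_ids(ids, shards):
    # shards[k] holds the rows prefetched for ids with id % len(shards) == k,
    # in the order they appeared inside that shard.
    next_row = [0] * len(shards)   # per-shard read cursor (in_indexs)
    out = []
    for id_ in ids:
        k = id_ % len(shards)
        out.append(shards[k][next_row[k]])
        next_row[k] += 1
    # mirrors the final PADDLE_ENFORCE_EQ: every prefetched row is consumed
    assert all(next_row[k] == len(shards[k]) for k in range(len(shards)))
    return out

shards = [[0.3, 0.6], [0.1, 0.4], [0.2, 0.5]]  # ids 3,6 / 1,4 / 2,5
print(merge_ids([1, 2, 3, 4, 5, 6], shards))   # [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
```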
+ for (int i = 0; i < ids_dims[0]; ++i) { + int64_t id = ids[i]; + size_t shard_id = static_cast(id) % shard_num; + int index = in_indexs[shard_id]; + memcpy(out_data + embedding_size * i, + x_tensors[shard_id]->data() + index * embedding_size, + sizeof(T) * embedding_size); + in_indexs[shard_id] += 1; + } + + for (size_t i = 0; i < shard_num; ++i) { + PADDLE_ENFORCE_EQ(in_indexs[i], x_tensors[i]->dims()[0], + "after merge, all data in x_tensor should be used"); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc index 4cc7cbc6e8..ecbae3894d 100644 --- a/paddle/fluid/operators/reader/create_batch_reader_op.cc +++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc @@ -20,7 +20,7 @@ namespace reader { class BatchReader : public framework::DecoratedReader { public: - BatchReader(ReaderBase* reader, int batch_size) + BatchReader(const std::shared_ptr& reader, int batch_size) : DecoratedReader(reader), batch_size_(batch_size) { buffer_.reserve(batch_size_); } diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc index 331224a598..0a02fcdeaa 100644 --- a/paddle/fluid/operators/reader/create_custom_reader_op.cc +++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc @@ -22,7 +22,8 @@ namespace reader { class CustomReader : public framework::DecoratedReader { public: - CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block, + CustomReader(const std::shared_ptr& reader, + const framework::BlockDesc& sub_block, const std::vector& source_var_names, const std::vector& sink_var_names) : DecoratedReader(reader), diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index bc830a2b72..5f35b9b3ea 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -34,7 +34,8 @@ static constexpr size_t kChannelSize = 1; // kCacheSize - 2 class DoubleBufferReader : public framework::DecoratedReader { public: explicit DoubleBufferReader( - ReaderBase* reader, platform::Place target_place = platform::CPUPlace()) + const std::shared_ptr& reader, + platform::Place target_place = platform::CPUPlace()) : DecoratedReader(reader), place_(target_place) { cpu_tensor_cache_.resize(kCacheSize); gpu_tensor_cache_.resize(kCacheSize); diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc index 249b0b7c6d..19b54110b9 100644 --- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc +++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc @@ -21,7 +21,7 @@ namespace reader { class MultiPassReader : public framework::DecoratedReader { public: - MultiPassReader(ReaderBase* reader, int pass_num) + MultiPassReader(const std::shared_ptr& reader, int pass_num) : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {} void ReadNext(std::vector* out) override { diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc index fd233be945..57e8e21214 100644 --- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc +++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc @@ -23,7 +23,8 @@ namespace reader { class ShuffleReader : public 
framework::DecoratedReader { public: - ShuffleReader(ReaderBase* reader, size_t buffer_size, size_t seed = 0) + ShuffleReader(const std::shared_ptr& reader, size_t buffer_size, + size_t seed = 0) : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) { VLOG(10) << "Create shuffle reader of " << reader_; if (seed_ == 0) { diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc index 1db70f3e96..3798015146 100644 --- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc +++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc @@ -21,7 +21,8 @@ namespace reader { class ThreadedReader : public framework::DecoratedReader { public: - explicit ThreadedReader(ReaderBase* reader) : DecoratedReader(reader) {} + explicit ThreadedReader(const std::shared_ptr& reader) + : DecoratedReader(reader) {} void ReadNext(std::vector* out) override { std::lock_guard lock(mutex_); diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 4fc9aae8e3..40dc7c9a0b 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -21,12 +21,17 @@ limitations under the License. */ #include #endif +#include #include "gflags/gflags.h" DEFINE_double(fraction_of_cpu_memory_to_use, 1, "Default use 100% of CPU memory for PaddlePaddle," "reserve the rest for page tables, etc"); +DEFINE_uint64( + initial_cpu_memory_in_mb, 500, + "Default initial 500MB of CPU memory for PaddlePaddle, in MB unit."); + DEFINE_double( fraction_of_cuda_pinned_memory_to_use, 0.5, "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," @@ -54,7 +59,10 @@ inline size_t CpuTotalPhysicalMemory() { size_t CpuMaxAllocSize() { // For distributed systems, it requires configuring and limiting // the fraction of memory to use.
- return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); + return std::min( + static_cast(FLAGS_fraction_of_cpu_memory_to_use * + CpuTotalPhysicalMemory()), + static_cast(FLAGS_initial_cpu_memory_in_mb * 1 << 20)); } size_t CpuMinChunkSize() { diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 1a9be044e0..d9e2afadaf 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -322,7 +322,6 @@ class DeviceTracerImpl : public DeviceTracer { DisableActivity(); dynload::cuptiUnsubscribe(subscriber_); CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_)); - PADDLE_ENFORCE(dynload::cuptiFinalize()); enabled_ = false; } diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index 2ad52bc7d3..e8f4a82ef1 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -72,7 +72,6 @@ extern void *cupti_dso_handle; __macro(cuptiGetResultString); \ __macro(cuptiActivityGetNumDroppedRecords); \ __macro(cuptiActivityFlushAll); \ - __macro(cuptiFinalize); \ __macro(cuptiSubscribe); \ __macro(cuptiUnsubscribe); \ __macro(cuptiEnableCallback); \ diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 6f8e3f22db..cc46c88fd1 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) { } } +// NOTE(minqiyang): according to the ncclGroupEnd documentation: +// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, +// ncclGroupEnd will wait for all communicators to be initialized, which can +// cause a blocking problem when a runtime_error is thrown, so only guard +// NCCL actions with it when necessary. class NCCLGroupGuard { public: static std::mutex &NCCLMutex() { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c88fbef63c..bd5c613f8c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -413,6 +413,9 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Executor") .def(py::init()) +#ifdef PADDLE_WITH_DISTRIBUTE + .def("complete", &Executor::Complete) +#endif .def("run", (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) & Executor::Run); diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index f3d8b1a39e..854e4baa39 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef MATHFUNCTIONS_H_ -#define MATHFUNCTIONS_H_ +#pragma once #ifdef PADDLE_WITH_MKLML #include @@ -21,7 +20,7 @@ limitations under the License.
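CpuMaxAllocSize now caps the CPU chunk pool at the smaller of the fraction-based limit and the new initial_cpu_memory_in_mb flag, so small jobs no longer reserve a fraction of the whole machine up front. The arithmetic, spelled out as a sketch (machine size and flag values hypothetical):

```python
def cpu_max_alloc_size(total_bytes, fraction, initial_mb):
    # min(fraction of physical memory, initial_cpu_memory_in_mb << 20)
    return min(int(fraction * total_bytes), initial_mb << 20)

total = 64 << 30  # a hypothetical 64 GiB machine
print(cpu_max_alloc_size(total, 1.0, 500) / (1 << 20))    # 500.0 MiB cap
print(cpu_max_alloc_size(total, 0.001, 500) / (1 << 20))  # ~65.5 MiB
```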
*/ #include #endif -#if defined(PADDLE_USE_VECLIB) +#ifdef PADDLE_USE_VECLIB extern "C" { #include #include @@ -30,8 +29,10 @@ extern "C" { #ifdef PADDLE_USE_OPENBLAS #include +#ifdef LAPACK_FOUND #include #endif +#endif #ifndef LAPACK_FOUND extern "C" { @@ -126,5 +127,3 @@ template void vTanh(const int n, const T* a, T* r); } // namespace paddle - -#endif // MATHFUNCTIONS_H_ diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index c6eef8683d..e8b3053267 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -132,7 +132,8 @@ EOF -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ - -DWITH_CONTRIB=${WITH_CONTRIB:-ON} + -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ + -DWITH_ANAKIN=ON } function abort(){ diff --git a/python/paddle/batch.py b/python/paddle/batch.py index d48c54fcbb..3c6a53db3c 100644 --- a/python/paddle/batch.py +++ b/python/paddle/batch.py @@ -15,7 +15,7 @@ __all__ = ['batch'] -def batch(reader, batch_size, drop_last=False): +def batch(reader, batch_size, drop_last=True): """ Create a batched reader. diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index bbd35aaecb..f6438c82ac 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -382,7 +382,7 @@ class Operator(object): 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'channel_create', 'channel_close', 'channel_send', - 'channel_recv', 'select' + 'channel_recv', 'select', 'gen_nccl_id' } def __init__(self, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9e2c06d26f..2c1f988828 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -All layers just related to the neural network. +All layers just related to the neural network. 
""" from ..layer_helper import LayerHelper @@ -25,68 +25,20 @@ import utils import random __all__ = [ - 'fc', - 'embedding', - 'dynamic_lstm', - 'dynamic_lstmp', - 'dynamic_gru', - 'gru_unit', - 'linear_chain_crf', - 'crf_decoding', - 'cos_sim', - 'cross_entropy', - 'square_error_cost', - 'chunk_eval', - 'sequence_conv', - 'conv2d', - 'sequence_pool', - 'sequence_softmax', - 'softmax', - 'pool2d', - 'batch_norm', - 'beam_search_decode', - 'conv2d_transpose', - 'sequence_expand', - 'lstm_unit', - 'reduce_sum', - 'reduce_mean', - 'reduce_max', - 'reduce_min', - 'reduce_prod', - 'sequence_first_step', - 'sequence_last_step', - 'dropout', - 'split', - 'ctc_greedy_decoder', - 'edit_distance', - 'l2_normalize', - 'matmul', - 'topk', - 'warpctc', - 'sequence_reshape', - 'transpose', - 'im2sequence', - 'nce', - 'beam_search', - 'row_conv', - 'multiplex', - 'layer_norm', - 'softmax_with_cross_entropy', - 'smooth_l1', - 'one_hot', - 'autoincreased_step_counter', - 'reshape', - 'lod_reset', - 'lrn', - 'pad', - 'label_smooth', - 'roi_pool', - 'dice_loss', - 'image_resize', - 'image_resize_short', - 'resize_bilinear', - 'gather', - 'random_crop', + 'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru', + 'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy', + 'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d', + 'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'batch_norm', + 'beam_search_decode', 'conv2d_transpose', 'sequence_expand', 'lstm_unit', + 'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod', + 'sequence_first_step', 'sequence_last_step', 'dropout', 'split', + 'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk', + 'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce', + 'beam_search', 'row_conv', 'multiplex', 'layer_norm', + 'softmax_with_cross_entropy', 'smooth_l1', 'one_hot', + 'autoincreased_step_counter', 'reshape', 'lod_reset', 'lrn', 'pad', + 'label_smooth', 'roi_pool', 'dice_loss', 'image_resize', + 'image_resize_short', 'resize_bilinear', 'gather', 'random_crop', 'mean_iou' ] @@ -95,7 +47,6 @@ def fc(input, num_flatten_dims=1, param_attr=None, bias_attr=None, - use_cudnn=False, use_mkldnn=False, act=None, is_test=False, @@ -222,6 +173,7 @@ def embedding(input, have two elements which indicate the size of the dictionary of embeddings and the size of each embedding vector respectively. is_sparse(bool): The flag indicating whether to use sparse update. + is_distributed (bool): Whether to run lookup table from remote parameter server. padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup. Otherwise the given :attr:`padding_idx` indicates padding the output with zeros whenever lookup encounters it in :attr:`input`. If @@ -261,9 +213,10 @@ def embedding(input, return tmp -# TODO(qijun): expose H0 and C0 def dynamic_lstm(input, size, + h_0=None, + c_0=None, param_attr=None, bias_attr=None, use_peepholes=True, @@ -324,6 +277,13 @@ def dynamic_lstm(input, (T X 4D), where T is the total time steps in this mini-batch, D is the hidden size. size(int): 4 * hidden size. + h_0(Variable): The initial hidden state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size and D is the hidden size. + c_0(Variable): The initial cell state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size. `h_0` and `c_0` can be NULL but only at the same time. 
+ param_attr(ParamAttr|None): The parameter attribute for the learnable hidden-hidden weights. @@ -387,12 +347,20 @@ def dynamic_lstm(input, cell = helper.create_tmp_variable(dtype) batch_gate = helper.create_tmp_variable(dtype) batch_cell_pre_act = helper.create_tmp_variable(dtype) + inputs = {'Input': input, 'Weight': weight, 'Bias': bias} + batch_size = input.shape[0] + if h_0: + assert h_0.shape == (batch_size, size), \ + 'The shape of h0 should be (batch_size, %d)' % size + inputs['H0'] = h_0 + if c_0: + assert c_0.shape == (batch_size, size), \ + 'The shape of c0 should be (batch_size, %d)' % size + inputs['C0'] = c_0 helper.append_op( type='lstm', - inputs={'Input': input, - 'Weight': weight, - 'Bias': bias}, + inputs=inputs, outputs={ 'Hidden': hidden, 'Cell': cell, @@ -654,8 +622,9 @@ def dynamic_gru(input, :attr:`False`. gate_activation(str): The activation for update gate and reset gate. Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid". - activation(str): The activation for candidate hidden state. + candidate_activation(str): The activation for candidate hidden state. Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". + h_0 (Variable): The initial hidden state, an optional input with shape (batch_size, size). Returns: Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \ @@ -676,11 +645,13 @@ def dynamic_gru(input, attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) bias = helper.create_parameter( attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True) + batch_size = input.shape[0] inputs = {'Input': input, 'Weight': weight, 'Bias': bias} if h_0 != None: assert h_0.shape == ( - size, size), 'The shape of h0 should be(%d, %d)' % (size, size) - inputs['h0'] = h_0 + batch_size, size + ), 'The shape of h0 should be (batch_size, %d)' % size + inputs['H0'] = h_0 hidden = helper.create_tmp_variable(dtype) batch_gate = helper.create_tmp_variable(dtype) @@ -873,6 +844,13 @@ def cos_sim(X, Y): """ This function performs the cosine similarity between two tensors X and Y and returns that as the output. + + Args: + X (Variable): The input X. + Y (Variable): The input Y. + + Returns: + Variable: the output of cosine(X, Y). """ helper = LayerHelper('cos_sim', **locals()) out = helper.create_tmp_variable(dtype=X.dtype) @@ -899,15 +877,15 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): unchanged. Args: - x(variable): The input tensor. - dropout_prob(float): Probability of setting units to zero. - is_test(bool): A flag indicating whether it is in test phrase or not. - seed(int): A Python integer used to create random seeds. If this - parameter is set to None, a random seed is used. - NOTE: If an integer seed is given, always the same output - units will be dropped. DO NOT use a fixed seed in training. - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + x (Variable): The input tensor. + dropout_prob (float): Probability of setting units to zero. + is_test (bool): A flag indicating whether it is in test phase or not. + seed (int): A Python integer used to create random seeds. If this + parameter is set to None, a random seed is used. + NOTE: If an integer seed is given, always the same output + units will be dropped. DO NOT use a fixed seed in training. + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: Variable: A tensor variable.
@@ -1029,8 +1007,8 @@ def square_error_cost(input, label): * :math:`Out`: Output value, same shape with :math:`X`. Args: - input(Variable): Input tensor, has predictions. - label(Variable): Label tensor, has target labels. + input (Variable): Input tensor, has predictions. + label (Variable): Label tensor, has target labels. Returns: Variable: The tensor variable storing the element-wise squared error \ @@ -1059,6 +1037,7 @@ def square_error_cost(input, label): return square_out +@templatedoc() def chunk_eval(input, label, chunk_scheme, @@ -1067,6 +1046,18 @@ def chunk_eval(input, """ This function computes and outputs the precision, recall and F1-score of chunk detection. + + Args: + input (Variable): prediction output of the network. + label (Variable): label of the test data set. + chunk_scheme (str): ${chunk_scheme_comment} + num_chunk_types (int): ${num_chunk_types_comment} + excluded_chunk_types (list): ${excluded_chunk_types_comment} + + Returns: + tuple: tuple containing: (precision, recall, f1_score, + num_infer_chunks, num_label_chunks, + num_correct_chunks) """ helper = LayerHelper("chunk_eval", **locals()) @@ -1099,6 +1090,7 @@ def chunk_eval(input, num_correct_chunks) +@templatedoc() def sequence_conv(input, num_filters, filter_size=3, @@ -1111,6 +1103,19 @@ def sequence_conv(input, This function creates the op for sequence_conv, using the inputs and other convolutional configurations for the filters and stride as given in the input parameters to the function. + + Args: + input (Variable): ${x_comment} + num_filters (int): number of filters. + filter_size (int): the filter size (H and W). + filter_stride (int): stride of the filter. + padding (bool): if True, add paddings. + bias_attr (ParamAttr|None): attributes for bias + param_attr (ParamAttr|None): attributes for parameter + act (str): the activation type + + Returns: + Variable: output of sequence_conv """ # FIXME(dzh) : want to unify the argument of python layer @@ -1225,33 +1230,34 @@ def conv2d(input, W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 Args: - input(Variable): The input image with [N, C, H, W] format. - num_filters(int): The number of filter. It is as same as the output - image channel. - filter_size(int|tuple|None): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_H, filter_size_W). - Otherwise, the filter will be a square. - stride(int|tuple): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. Default: stride = 1. - padding(int|tuple): The padding size. If padding is a tuple, it must - contain two integers, (padding_H, padding_W). Otherwise, the - padding_H = padding_W = padding. Default: padding = 0. - dilation(int|tuple): The dilation size. If dilation is a tuple, it must - contain two integers, (dilation_H, dilation_W). Otherwise, the - dilation_H = dilation_W = dilation. Default: dilation = 1. - groups(int): The groups number of the Conv2d Layer. According to grouped - convolution in Alex Krizhevsky's Deep CNN paper: when group=2, - the first half of the filters is only connected to the first half - of the input channels, while the second half of the filters is only - connected to the second half of the input channels. Default: groups=1 - param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None - bias_attr(ParamAttr): Bias parameter for the Conv2d layer. 
Default: None - use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True - act(str): Activation type. Default: None - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + input (Variable): The input image with [N, C, H, W] format. + num_filters(int): The number of filter. It is as same as the output + image channel. + filter_size (int|tuple|None): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. + stride (int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: stride = 1. + padding (int|tuple): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + dilation (int|tuple): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. Default: dilation = 1. + groups (int): The groups number of the Conv2d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None + bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + use_mkldnn (bool): Use mkldnn kernels or not. + act (str): Activation type. Default: None + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: Variable: The tensor variable storing the convolution and \ @@ -1409,7 +1415,7 @@ def sequence_pool(input, pool_type): def sequence_first_step(input): """ - This funciton get the first step of sequence. + This function gets the first step of sequence. .. code-block:: text @@ -1442,7 +1448,7 @@ def sequence_first_step(input): def sequence_last_step(input): """ - This funciton get the last step of sequence. + This function gets the last step of sequence. .. code-block:: text @@ -1486,6 +1492,22 @@ def pool2d(input, """ This function adds the operator for pooling in 2 dimensions, using the pooling configurations mentioned in input parameters. + + Args: + input (Variable): ${input_comment} + pool_size (int): ${ksize_comment} + pool_type (str): ${pooling_type_comment} + pool_stride (int): stride of the pooling layer. + pool_padding (int): padding size. + global_pooling (bool): ${global_pooling_comment} + use_cudnn (bool): ${use_cudnn_comment} + ceil_mode (bool): ${ceil_mode_comment} + use_mkldnn (bool): ${use_mkldnn_comment} + name (str): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: output of pool2d layer. """ if pool_type not in ["max", "avg"]: raise ValueError( @@ -1543,6 +1565,25 @@ def batch_norm(input, """ This function helps create an operator to implement the BatchNorm layer using the configurations from the input parameters. + + Args: + input (Variable): the input variable. 
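The conv2d output-size formula quoted earlier in this docstring is easy to sanity-check numerically. A helper that evaluates it (plain arithmetic matching the documented formula, not Paddle code):

```python
def conv2d_out_dim(in_dim, filter_dim, padding, stride, dilation=1):
    # out = (in + 2*padding - (dilation*(filter-1) + 1)) / stride + 1
    return (in_dim + 2 * padding
            - (dilation * (filter_dim - 1) + 1)) // stride + 1

# A 224x224 input with a 3x3 filter, padding 1, stride 1 keeps its size:
print(conv2d_out_dim(224, 3, padding=1, stride=1))  # 224
# Stride 2 halves it:
print(conv2d_out_dim(224, 3, padding=1, stride=2))  # 112
```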
@@ -1543,6 +1565,26 @@ def batch_norm(input,
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
+
+    Args:
+        input (Variable): the input variable.
+        act (str): activation type.
+        is_test (bool): whether to run batch_norm as test mode.
+        momentum (float): momentum for the moving mean and moving variance.
+        epsilon (float): epsilon, default 1e-05.
+        param_attr (ParamAttr|None): attributes for parameter.
+        bias_attr (ParamAttr|None): attributes for bias.
+        data_layout (str): data layout, default NCHW.
+        in_place (bool): if True, do not create tmp variable.
+        use_mkldnn (bool): ${use_mkldnn_comment}
+        name (str): The name of this layer. It is optional.
+        moving_mean_name (str): The name of the moving mean variable, optional.
+        moving_variance_name (str): The name of the moving variance variable, optional.
+        do_model_average_for_mean_and_var (bool): whether to do model average
+            for mean and variance.
+
+    Returns:
+        Variable: output of batch_norm layer.
     """
     helper = LayerHelper('batch_norm', **locals())
     dtype = helper.input_dtype()
@@ -1670,6 +1711,7 @@ def layer_norm(input,
         bias_attr(ParamAttr|None): The parameter attribute for the learnable
             bias :math:`b`.
         act(str): Activation to be applied to the output of layer normalization.
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: A tensor variable with the same shape as the input.
@@ -1721,6 +1763,17 @@ def layer_norm(input,
 
 
 def beam_search_decode(ids, scores, name=None):
+    """
+    ${beam_search_decode}
+
+    Args:
+        ids (Variable): ${ids_comment}
+        scores (Variable): ${scores_comment}
+        name (str): The name of this layer. It is optional.
+
+    Returns:
+        tuple: a tuple of two output variables: sentence_ids, sentence_scores
+    """
     helper = LayerHelper('beam_search_decode', **locals())
     sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
     sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
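A hedged usage sketch of batch_norm with the arguments documented above; the image input is hypothetical:

.. code-block:: python

    # batch-normalize conv activations, then apply ReLU
    conv = fluid.layers.conv2d(input=img, num_filters=64, filter_size=3)
    hidden = fluid.layers.batch_norm(input=conv, act='relu')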
@@ -1796,46 +1849,46 @@ def conv2d_transpose(input,
         W_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1
 
     Args:
-        input(Variable): The input image with [N, C, H, W] format.
-        num_filters(int): The number of the filter. It is as same as the output
-            image channel.
-        output_size(int|tuple|None): The output image size. If output size is a
-            tuple, it must contain two integers, (image_H, image_W). This
-            parameter only works when filter_size is None.
-        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square. None if use output size to
-            calculate filter_size.
-        padding(int|tuple): The padding size. If padding is a tuple, it must
-            contain two integers, (padding_H, padding_W). Otherwise, the
-            padding_H = padding_W = padding. Default: padding = 0.
-        stride(int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: stride = 1.
-        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-            contain two integers, (dilation_H, dilation_W). Otherwise, the
-            dilation_H = dilation_W = dilation. Default: dilation = 1.
-        groups(int): The groups number of the Conv2d transpose layer. Inspired by
-            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
-            when group=2, the first half of the filters is only connected to the
-            first half of the input channels, while the second half of the
-            filters is only connected to the second half of the input channels.
-            Default: groups=1
-        param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
-            Default: None
-        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
-        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act(str): Activation type. Default: None
-        name(str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
+        input (Variable): The input image with [N, C, H, W] format.
+        num_filters (int): The number of filters. It is the same as the output
+            image channel.
+        output_size (int|tuple|None): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square. None if output_size is used
+            to calculate filter_size.
+        padding (int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        stride (int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups (int): The groups number of the Conv2d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups=1
+        param_attr (ParamAttr): The parameters to the Conv2d_transpose Layer.
+            Default: None
+        bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act (str): Activation type. Default: None
+        name (str|None): A name for this layer (optional). If set None, the layer
+            will be named automatically.
 
     Returns:
-        Variable: The tensor variable storing the convolution transpose result.
+        Variable: The tensor variable storing the convolution transpose result.
 
     Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
 
     Examples:
        .. code-block:: python
@@ -1972,6 +2025,17 @@ def sequence_expand(x, y, ref_level=-1, name=None):
 
 def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
     '''
     This function implements the beam search algorithm.
+
+    Args:
+        pre_ids (Variable): ${pre_ids_comment}
+        ids (Variable): ${ids_comment}
+        scores (Variable): ${scores_comment}
+        beam_size (int): ${beam_size_comment}
+        end_id (int): ${end_id_comment}
+        level (int): ${level_comment}
+
+    Returns:
+        tuple: a tuple of beam_search output variables: selected_ids, selected_scores
     '''
     helper = LayerHelper('beam_search', **locals())
     score_type = scores.dtype
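The beam_search arguments documented above, in a minimal hedged sketch; the pre_ids/ids/scores tensors and the beam settings are hypothetical:

.. code-block:: python

    # one decoding step: keep the top-4 candidates per source sequence,
    # finishing hypotheses that emit the assumed end token (id 1)
    selected_ids, selected_scores = fluid.layers.beam_search(
        pre_ids, ids, scores, beam_size=4, end_id=1, level=0)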
@@ -2474,14 +2538,14 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
         slice along dimension `axis`.
 
     Args:
-        x(Variable|list): The input tensor to l2_normalize layer.
-        axis(int): The axis on which to apply normalization. If `axis < 0`,
-            the dimension to normalization is rank(X) + axis. -1 is the
-            last dimension.
-        epsilon(float): The epsilon value is used to avoid division by zero,
-            the defalut value is 1e-10.
-        name(str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
+        x (Variable|list): The input tensor to l2_normalize layer.
+        axis (int): The axis on which to apply normalization. If `axis < 0`,
+            the dimension to normalization is rank(X) + axis. -1 is the
+            last dimension.
+        epsilon (float): The epsilon value is used to avoid division by zero,
+            the default value is 1e-12.
+        name (str|None): A name for this layer (optional). If set None, the layer
+            will be named automatically.
 
     Returns:
@@ -2694,16 +2758,17 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
         the edit distance will be divided by the length of reference string.
 
     Args:
-        input(Variable): The indices for hypothesis strings.
-        label(Variable): The indices for reference strings.
-        normalized(bool): Indicated whether to normalize the edit distance by the length of reference string.
-        ignored_tokens(list of int): Tokens that should be removed before calculating edit distance.
+        input (Variable): The indices for hypothesis strings.
+        label (Variable): The indices for reference strings.
+        normalized (bool): Indicated whether to normalize the edit distance by the length of reference string.
+        ignored_tokens (list of int): Tokens that should be removed before calculating edit distance.
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: sequence-to-sequence edit distance in shape [batch_size, 1].
@@ -2793,10 +2854,13 @@ def ctc_greedy_decoder(input, blank, name=None):
         where Lp is the sum of all input sequences' length and
         num_classes is the true number of classes. (not including the blank label).
 
-        blank(int): the blank label index of Connectionist Temporal Classification (CTC) loss, which is in thehalf-opened interval [0, num_classes + 1).
+        blank (int): the blank label index of Connectionist Temporal
+            Classification (CTC) loss, which is in the half-opened interval
+            [0, num_classes + 1).
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: CTC greedy decode result. If all the sequences in result were
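A hedged sketch of ctc_greedy_decoder as documented right above; probs is a hypothetical (Lp, num_classes + 1) probability LoDTensor and 10 an assumed blank index:

.. code-block:: python

    # greedy (best-path) CTC decoding: merge repeats, then drop blanks
    decoded = fluid.layers.ctc_greedy_decoder(input=probs, blank=10)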
@@ -2833,23 +2894,23 @@ def warpctc(input, label, blank=0, norm_by_times=False):
         input tensor.
 
     Args:
-        input(Variable): (LodTensor, default: LoDTensor),
-            the unscaled probabilities of variable-length sequences,
-            which is a 2-D Tensor with LoD information.
-            It's shape is [Lp, num_classes + 1], where Lp is the sum of all input
-            sequences' length and num_classes is the true number of classes.
-            (not including the blank label).
-        label(Variable): (LodTensor, default: LoDTensor), the ground truth
-            of variable-length sequence, which is a 2-D Tensor with LoD
-            information. It is of the shape [Lg, 1], where Lg is th sum of
-            all labels' length.
-        blank: (int, default: 0), the blank label index of Connectionist
-            Temporal Classification (CTC) loss, which is in the
-            half-opened interval [0, num_classes + 1).
-        norm_by_times: (bool, default: false), whether to normalize
-            the gradients by the number of time-step, which is also the
-            sequence's length. There is no need to normalize the gradients
-            if warpctc layer was follewed by a mean_op.
+        input (Variable): (LodTensor, default: LoDTensor),
+            the unscaled probabilities of variable-length sequences,
+            which is a 2-D Tensor with LoD information.
+            Its shape is [Lp, num_classes + 1], where Lp is the sum of all input
+            sequences' length and num_classes is the true number of classes.
+            (not including the blank label).
+        label (Variable): (LodTensor, default: LoDTensor), the ground truth
+            of variable-length sequence, which is a 2-D Tensor with LoD
+            information. It is of the shape [Lg, 1], where Lg is the sum of
+            all labels' length.
+        blank (int): default 0, the blank label index of Connectionist
+            Temporal Classification (CTC) loss, which is in the
+            half-opened interval [0, num_classes + 1).
+        norm_by_times (bool): default false, whether to normalize
+            the gradients by the number of time-step, which is also the
+            sequence's length. There is no need to normalize the gradients
+            if the warpctc layer was followed by a mean_op.
 
     Returns:
         Variable: The Connectionist Temporal Classification (CTC) loss,
@@ -2908,9 +2969,9 @@ def sequence_reshape(input, new_dim):
     no remainder for each sequence.
 
     Args:
-       input (Variable): (LodTensor, default: LoDTensor), a 2-D LoDTensor
-                with shape being [N, M] where M for dimension.
-       new_dim (int): New dimension which the input LoDTensor is reshaped to.
+        input (Variable): (LodTensor, default: LoDTensor), a 2-D LoDTensor
+            with shape being [N, M] where M is the dimension.
+        new_dim (int): New dimension which the input LoDTensor is reshaped to.
 
     Returns:
         Variable: Reshaped LoDTensor according to new dimension.
@@ -2932,7 +2993,10 @@ def sequence_reshape(input, new_dim):
     return out
 
 
-@autodoc()
+# FIXME(wuyi): let docstring_checker.py understand @autodoc.
+# For now, the comments in c++ use types like Tensor, but in python side
+# the type is often "Variable", and arguments may vary.
+@templatedoc(op_type="nce")
 def nce(input,
         label,
         num_total_classes,
@@ -2940,6 +3004,21 @@ def nce(input,
         param_attr=None,
         bias_attr=None,
         num_neg_samples=None):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): input variable.
+        label (Variable): label.
+        num_total_classes (int): ${num_total_classes_comment}
+        sample_weight (int): ${sample_weight_comment}
+        param_attr (ParamAttr|None): attributes for parameter.
+        bias_attr (ParamAttr|None): attributes for bias.
+        num_neg_samples (int): ${num_neg_samples_comment}
+
+    Returns:
+        Variable: output of nce layer.
+    """
     helper = LayerHelper('nce', **locals())
     assert isinstance(input, Variable)
     dim = input.shape[1]
@@ -2997,8 +3076,9 @@ def transpose(x, perm, name=None):
         perm[i]-th dimension of `input`.
 
     Args:
-        input (Variable): (Tensor), A Tensor.
-        perm (list): A permutation of the dimensions of `input`.
+        x (Variable): The input Tensor.
+        perm (list): A permutation of the dimensions of `input`.
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: A transposed Tensor.
@@ -3231,9 +3311,9 @@ def multiplex(inputs, index):
     row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
 
     Args:
-       inputs (list): A list of variables to gather from. All variables have the
+        inputs (list): A list of variables to gather from. All variables have the
             same shape and the rank is at least 2.
-       index (Variable): Tensor, index variable which is a 2-D tensor
+        index (Variable): Tensor, index variable which is a 2-D tensor
             with shape [M, 1] where M is the batch size.
 
     Returns:
@@ -3432,7 +3512,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
         begin(int): The first value of this counter.
         step(int): The increment step between each execution.
 
-    Returns(Variable): The global run counter.
+    Returns:
+        Variable: The global run counter.
     """
     helper = LayerHelper('global_step_counter')
     if counter_name is None:
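As a concrete reading of the transpose Args documented above (shapes hypothetical):

.. code-block:: python

    # permute the dims of a [5, 10, 15] tensor to [10, 5, 15]
    x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32')
    x_transposed = fluid.layers.transpose(x, perm=[1, 0, 2])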
@@ -3493,7 +3574,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
         the corresponding dimension of x.
 
     Args:
-        input(variable): The input tensor.
+        x (Variable): The input tensor.
         shape(list): The new shape. At most one dimension of the new shape can
                      be -1.
         actual_shape(variable): An optional input. If provided, reshape
@@ -3505,8 +3586,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
         inplace(bool): If this flag is set true, a new output tensor is created
                        whose data is copied from input x, otherwise the output
                        shares data with input without copying.
+        name (str): The name of this layer. It is optional.
 
-    Returns(variable): The output tensor.
+    Returns:
+        Variable: The output tensor.
 
     Examples:
         .. code-block:: python
@@ -4027,7 +4110,7 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
         name(str|None): The output variable name.
 
     Returns:
-        ${out_comment}.
+        out (Variable): ${out_comment}
 
     """
@@ -4046,6 +4128,7 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'):
             This is a 4-D tensor of the shape
             (num_batches, channels, in_h, in_w).
         out_short_len(int): The length of output images' short edge.
+        resample (str): resample method, default: BILINEAR.
 
     Returns:
         out (Variable): The output is a 4-D tensor of the shape
@@ -4100,6 +4183,7 @@ def gather(input, index):
         output (Variable): The output is a tensor with the same rank as input.
 
     Examples:
+
         .. code-block:: python
 
             output = fluid.layers.gather(x, index)
@@ -4158,9 +4242,60 @@ def random_crop(x, shape, seed=None):
     seed_out = helper.create_tmp_variable(dtype="int64")
     helper.append_op(
         type="random_crop",
-        inputs={"X": input,
+        inputs={"X": x,
                 "Seed": seed},
         outputs={"Out": out,
                  "SeedOut": seed_out},
         attrs={"shape": shape})
     return out
+
+
+def mean_iou(input, label, num_classes):
+    """
+    Mean Intersection-Over-Union is a common evaluation metric for
+    semantic image segmentation, which first computes the IOU for each
+    semantic class and then computes the average over classes.
+    IOU is defined as follows:
+
+    .. math::
+
+        IOU = \\frac{true\_positive}{true\_positive + false\_positive + false\_negative}
+
+    The predictions are accumulated in a confusion matrix and mean-IOU
+    is then calculated from it.
+
+
+    Args:
+        input (Variable): A Tensor of prediction results for semantic labels with type int32 or int64.
+        label (Variable): A Tensor of ground truth labels with type int32 or int64.
+            Its shape should be the same as input.
+        num_classes (int): The possible number of labels.
+
+    Returns:
+        mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1].
+        out_wrong (Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
+        out_correct (Variable): A Tensor with shape [num_classes]. The correct numbers of each class.
+
+
+    Examples:
+
+        .. code-block:: python
+
+            iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes)
+    """
+    helper = LayerHelper('mean_iou', **locals())
+    dtype = helper.input_dtype()
+    out_mean_iou = helper.create_tmp_variable(dtype='float32')
+    out_wrong = helper.create_tmp_variable(dtype='int32')
+    out_correct = helper.create_tmp_variable(dtype='int32')
+    helper.append_op(
+        type="mean_iou",
+        inputs={"predictions": input,
+                "labels": label},
+        outputs={
+            "out_mean_iou": out_mean_iou,
+            "out_wrong": out_wrong,
+            "out_correct": out_correct
+        },
+        attrs={"num_classes": num_classes})
+    return out_mean_iou, out_wrong, out_correct
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
index 2df3da9cca..8e222d2690 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -96,10 +96,11 @@ def train(use_cuda, train_program, params_dirname):
     train_reader = paddle.batch(
         paddle.reader.shuffle(
             cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
 
     test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
 
     def event_handler(event):
         if isinstance(event, fluid.EndStepEvent):
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
index 224cca417e..dbc7bc06c9 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -73,10 +73,11 @@ def train(use_cuda, train_program, params_dirname):
     train_reader = paddle.batch(
         paddle.reader.shuffle(
             cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
 
     test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
 
     def event_handler(event):
         if isinstance(event, fluid.EndStepEvent):
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
index 113dda88ca..8c74be0f08 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -87,7 +87,9 @@ def train(use_cuda, train_program, params_dirname):
     def event_handler(event):
         if isinstance(event, fluid.EndEpochEvent):
             test_reader = paddle.batch(
-                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+                paddle.dataset.imdb.test(word_dict),
+                batch_size=BATCH_SIZE,
+                drop_last=False)
             avg_cost, acc = trainer.test(
                 reader=test_reader, feed_order=['words', 'label'])
@@ -113,7 +115,8 @@ def train(use_cuda, train_program, params_dirname):
     train_reader = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.imdb.train(word_dict), buf_size=25000),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
 
     trainer.train(
         num_epochs=1,
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index 8818cf96fa..be347cd531 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -56,7 +56,7 @@ BATCH_SIZE = 200
 
 # fix the order of training data
 train_reader = paddle.batch(
-    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
+    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False)
 
 # train_reader = paddle.batch(
 #     paddle.reader.shuffle(
diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py
new file mode 100644
index 0000000000..64d42b693b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def compute_mean_iou(predictions, labels, num_classes, in_wrongs, in_corrects,
+                     in_mean_ious):
+    assert predictions.shape == labels.shape
+    predictions = predictions.flatten()
+    labels = labels.flatten()
+
+    out_wrong = np.zeros([num_classes]).astype("int32")
+    for _, wrong in in_wrongs:
+        out_wrong += wrong
+    out_correct = np.zeros([num_classes]).astype("int32")
+    for _, correct in in_corrects:
+        out_correct += correct
+
+    for pred, label in zip(predictions, labels):
+        if pred == label:
+            out_correct[pred] += 1
+        else:
+            out_wrong[pred] += 1
+            out_wrong[label] += 1
+
+    denominator = out_wrong + out_correct
+    valid_count = (denominator != 0).sum()
+    denominator = np.where(denominator > 0, denominator,
+                           np.ones(denominator.shape))
+    mean_iou = (out_correct / denominator).sum() / valid_count
+
+    for _, in_mean_iou in in_mean_ious:
+        mean_iou += in_mean_iou
+    return mean_iou, out_wrong, out_correct
+
+
+class TestMeanIOUOp(OpTest):
+    def setUp(self):
+        self.config()
+        self.op_type = "mean_iou"
+        predictions = np.random.randint(0, self.num_classes,
+                                        self.image_size).astype("int32")
+        labels = np.random.randint(0, self.num_classes,
+                                   self.image_size).astype("int32")
+
+        in_wrongs = []
+        for i in range(self.in_wrong_num):
+            in_wrongs.append(("in_wrong_%d" % i, np.random.randint(
+                0, 10, [self.num_classes]).astype("int32")))
+
+        in_corrects = []
+        for i in range(self.in_correct_num):
+            in_corrects.append(("in_correct_%d" % i, np.random.randint(
+                0, 10, [self.num_classes]).astype("int32")))
+
+        in_mean_ious = []
+        for i in range(self.in_mean_iou_num):
+            in_mean_ious.append(("in_mean_iou_%d" % i, np.random.uniform(
+                0, 1, [1]).astype("float32")))
+
+        self.inputs = {
+            'Predictions': predictions,
+            'Labels': labels,
+            'InWrongs': in_wrongs,
+            'InCorrects': in_corrects,
+            'InMeanIou': in_mean_ious
+        }
+        self.attrs = {'num_classes': long(self.num_classes)}
+        mean_iou, out_wrong, out_correct = compute_mean_iou(
+            predictions, labels, self.num_classes, in_wrongs, in_corrects,
+            in_mean_ious)
+        self.outputs = {
+            'OutMeanIou': mean_iou,
+            'OutWrong': out_wrong,
+            'OutCorrect': out_correct
+        }
+
+    def config(self):
+        self.num_classes = 10
+        self.image_size = [128, 128]
+        self.in_wrong_num = 0
+        self.in_correct_num = 0
+        self.in_mean_iou_num = 0
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestCase1(TestMeanIOUOp):
+    def config(self):
+        self.num_classes = 5
+        self.image_size = [100, 128]
+        self.in_wrong_num = 2
+        self.in_correct_num = 2
+        self.in_mean_iou_num = 2
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
new file mode 100644
index 0000000000..f209bdf30f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestMergeIdsOp(OpTest):
+    def setUp(self):
+        self.op_type = "merge_ids"
+        ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
+        x0 = np.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.4]]).astype('float32')
+        x1 = np.array([]).astype('float32')
+        x2 = np.array([[0.4, 0.5], [0.4, 0.5], [0.5, 0.6],
+                       [0.5, 0.6]]).astype('float32')
+        out = np.array([[0.1, 0.2], [0.4, 0.5], [0.4, 0.5], [0.2, 0.3],
+                        [0.5, 0.6], [0.5, 0.6], [0.3, 0.4]]).astype('float32')
+        self.inputs = {'Ids': ids, "X": [('x0', x0), ('x1', x1), ('x2', x2)]}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 5b299a0f29..9c604170b8 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -515,35 +515,36 @@ class DistributeTranspiler:
             grad_to_block_id, None)
 
         # process distributed lookup_table
-        prefetch_block = None
+        prefetch_var_name_to_block_id = []
        if self.has_distributed_lookup_table:
             pserver_index = self.pserver_endpoints.index(endpoint)
             table_opt_block = self._create_table_optimize_block(
                 pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
-            prefetch_block = self._create_prefetch_block(
+            prefetch_var_name_to_block_id = self._create_prefetch_block(
                 pserver_index, pserver_program, table_opt_block)
 
-        # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
-        # not be executed, so it's safe to use optimize_block to hold the place
         if self.has_distributed_lookup_table:
-            assert prefetch_block is not None
+            assert len(prefetch_var_name_to_block_id) > 0
         else:
-            assert prefetch_block is None
-            prefetch_block = pserver_program.global_block()
+            assert len(prefetch_var_name_to_block_id) == 0
+
+        attrs = {
+            "OptimizeBlock": pserver_program.block(1),
+            "endpoint": endpoint,
+            "Fanin": self.trainer_num,
+            "sync_mode": self.sync_mode,
+            "grad_to_block_id": grad_to_block_id
+        }
+        if len(prefetch_var_name_to_block_id) > 0:
+            attrs['prefetch_var_name_to_block_id'] \
+                = prefetch_var_name_to_block_id
 
         # step5 append the listen_and_serv op
         pserver_program.global_block().append_op(
             type="listen_and_serv",
             inputs={'X': recv_inputs},
             outputs={},
-            attrs={
-                "OptimizeBlock": pserver_program.block(1),
-                "endpoint": endpoint,
-                "Fanin": self.trainer_num,
-                "PrefetchBlock": prefetch_block,
-                "sync_mode": self.sync_mode,
-                "grad_to_block_id": grad_to_block_id
-            })
+            attrs=attrs)
 
         pserver_program.sync_with_cpp()
 
         return pserver_program
@@ -608,8 +611,15 @@ class DistributeTranspiler:
     def _replace_lookup_table_op_with_prefetch(self, program,
                                                pserver_endpoints):
         # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
-        self.prefetch_input_vars = None
-        self.prefetch_output_vars = None
+        # self.all_prefetch_input_vars =
+        #       [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
+        #        [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
+        self.all_prefetch_input_vars = []
+
+        # self.all_prefetch_output_vars =
+        #       [[var0_prefetch_out_pserver0, var0_prefetch_out_pserver1]
+        #        [var1_prefetch_out_pserver0, var1_prefetch_out_pserver1]]
+        self.all_prefetch_output_vars = []
 
         continue_search_lookup_table_op = True
         while continue_search_lookup_table_op:
@@ -619,26 +629,27 @@ class DistributeTranspiler:
                 if op.type == LOOKUP_TABLE_TYPE:
                     continue_search_lookup_table_op = True
 
-                    op_index = list(all_ops).index(op)
+                    lookup_table_op_index = list(all_ops).index(op)
                     ids_name = op.input("Ids")
                     out_name = op.output("Out")
 
-                    if self.prefetch_input_vars is None:
-                        ids_var = program.global_block().vars[ids_name[0]]
-                        self.prefetch_input_vars = self.create_splited_vars(
-                            source_var=ids_var,
-                            block=program.global_block(),
-                            tag="_prefetch_in_")
-                    if self.prefetch_output_vars is None:
-                        out_var = program.global_block().vars[out_name[0]]
-                        self.prefetch_output_vars = self.create_splited_vars(
-                            source_var=out_var,
-                            block=program.global_block(),
-                            tag="_prefetch_out_")
+                    ids_var = program.global_block().vars[ids_name[0]]
+                    prefetch_input_vars = self.create_splited_vars(
+                        source_var=ids_var,
+                        block=program.global_block(),
+                        tag="_prefetch_in_")
+                    self.all_prefetch_input_vars.append(prefetch_input_vars)
+
+                    out_var = program.global_block().vars[out_name[0]]
+                    prefetch_output_vars = self.create_splited_vars(
+                        source_var=out_var,
+                        block=program.global_block(),
+                        tag="_prefetch_out_")
+                    self.all_prefetch_output_vars.append(prefetch_output_vars)
 
                     # insert split_ids_op
                     program.global_block().insert_op(
-                        index=op_index,
+                        index=lookup_table_op_index,
                         type="split_ids",
                         inputs={
                             'Ids': [
@@ -646,14 +657,14 @@ class DistributeTranspiler:
                                 for varname in ids_name
                             ]
                         },
-                        outputs={"Out": self.prefetch_input_vars})
+                        outputs={"Out": prefetch_input_vars})
 
                     # insert prefetch_op
                     program.global_block().insert_op(
-                        index=op_index + 1,
+                        index=lookup_table_op_index + 1,
                         type="prefetch",
-                        inputs={'X': self.prefetch_input_vars},
-                        outputs={"Out": self.prefetch_output_vars},
+                        inputs={'X': prefetch_input_vars},
+                        outputs={"Out": prefetch_output_vars},
                         attrs={
                             "epmap": pserver_endpoints,
                             RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
@@ -661,16 +672,21 @@ class DistributeTranspiler:
 
                     # insert concat_op
                     program.global_block().insert_op(
-                        index=op_index + 2,
-                        type="concat",
-                        inputs={'X': self.prefetch_output_vars},
+                        index=lookup_table_op_index + 2,
+                        type="merge_ids",
+                        inputs={
+                            'Ids': [
+                                program.global_block().vars[varname]
+                                for varname in ids_name
+                            ],
+                            'X': prefetch_output_vars
+                        },
                         outputs={
                             "Out": [
                                 program.global_block().vars[varname]
                                 for varname in out_name
                             ]
-                        },
-                        attrs={"axis": 0})
+                        })
 
                     # delete lookup_table_op
                     delete_ops(program.global_block(), [op])
@@ -709,30 +725,34 @@ class DistributeTranspiler:
                                 optimize_block):
         # STEP: create prefetch block
         table_var = pserver_program.global_block().vars[self.table_name]
-        prefetch_block = pserver_program.create_block(optimize_block.idx)
-        trainer_ids = self.prefetch_input_vars[pserver_index]
-        pserver_ids = pserver_program.global_block().create_var(
-            name=trainer_ids.name,
-            type=trainer_ids.type,
-            shape=trainer_ids.shape,
-            dtype=trainer_ids.dtype)
-        trainer_out = self.prefetch_output_vars[pserver_index]
-        pserver_out = pserver_program.global_block().create_var(
-            name=trainer_out.name,
-            type=trainer_out.type,
-            shape=trainer_out.shape,
-            dtype=trainer_out.dtype)
-        prefetch_block.append_op(
-            type="lookup_sparse_table",
-            inputs={'Ids': pserver_ids,
-                    "W": table_var},
-            outputs={"Out": pserver_out},
-            attrs={
-                "is_sparse": True,  # has no effect on lookup_table op
-                "is_distributed": True,
-                "padding_idx": -1
-            })
-        return prefetch_block
+        prefetch_var_name_to_block_id = []
+        for index in range(len(self.all_prefetch_input_vars)):
+            prefetch_block = pserver_program.create_block(optimize_block.idx)
+            trainer_ids = self.all_prefetch_input_vars[index][pserver_index]
+            pserver_ids = pserver_program.global_block().create_var(
+                name=trainer_ids.name,
+                type=trainer_ids.type,
+                shape=trainer_ids.shape,
+                dtype=trainer_ids.dtype)
+            trainer_out = self.all_prefetch_output_vars[index][pserver_index]
+            pserver_out = pserver_program.global_block().create_var(
+                name=trainer_out.name,
+                type=trainer_out.type,
+                shape=trainer_out.shape,
+                dtype=trainer_out.dtype)
+            prefetch_block.append_op(
+                type="lookup_sparse_table",
+                inputs={'Ids': pserver_ids,
+                        "W": table_var},
+                outputs={"Out": pserver_out},
+                attrs={
+                    "is_sparse": True,  # has no effect on lookup_table op
+                    "is_distributed": True,
+                    "padding_idx": -1
+                })
+            prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str(
+                prefetch_block.idx))
+        return prefetch_var_name_to_block_id
 
     def _create_table_optimize_block(self, pserver_index, pserver_program,
                                      pre_block_idx, grad_to_block_id):
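A hedged illustration of what _create_prefetch_block now returns: one "<var_name>:<block_id>" string per distributed lookup table, consumed by listen_and_serv through the 'prefetch_var_name_to_block_id' attribute. The variable names and block ids below are hypothetical:

.. code-block:: python

    # each entry pairs a prefetch input variable with the pserver
    # block that serves its lookups
    prefetch_var_name_to_block_id = [
        "ids0_prefetch_in_0:2",
        "ids1_prefetch_in_0:3",
    ]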
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index e6f87ce61b..4e3beaf639 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -240,14 +240,15 @@ class ExtraLayerAttribute(object):
     :type error_clipping_threshold: float
     :param drop_rate: Dropout rate. Dropout will create a mask on layer output.
                       The dropout rate is the zero rate of this mask. For the
-                      details of what dropout is please refer to `here
-                      `_.
+                      details of what dropout is, please refer to the `JMLR dropout
+                      paper `_.
     :type drop_rate: float
     :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU.
-                   The details allocation in parallel_nn please refer to `here
-                   `_.
+                   For the details of allocation in parallel_nn, please refer
+                   to the `use_case
+                   `_ document.
     :type device: int
     """
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index ebc31b23e0..e6a03759ef 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2556,7 +2556,7 @@ def img_conv_layer(input,
     the output will be obtained by concatenating the two results.
 
     The details of grouped convolution, please refer to:
-    `ImageNet Classification with Deep Convolutional Neural Networks
+    `ImageNet Classification With Deep Convolutional Neural Networks
     `_
 
     The example usage is:
@@ -5678,8 +5678,8 @@ def warp_ctc_layer(input,
     `_ library, which is used in
     `Deep Speech 2: End-to-End Speech Recognition in English and Mandarin
     `_, to compute Connectionist Temporal
-    Classification (CTC) loss. Besides, another `warp-ctc
-    `_ repository, which is forked from
+    Classification (CTC) loss. Besides, another `warp-ctc repository
+    `_, which is forked from
     the official one, is maintained to enable more compiling options.
     During the building process, PaddlePaddle will clone the source codes, build and
     install it to :code:`third_party/install/warpctc` directory.
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py
index d48c54fcbb..3c6a53db3c 100644
--- a/python/paddle/v2/minibatch.py
+++ b/python/paddle/v2/minibatch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
 
 
-def batch(reader, batch_size, drop_last=False):
+def batch(reader, batch_size, drop_last=True):
     """
     Create a batched reader.
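Because the drop_last default above flips to True, v2 readers now drop the final short batch unless told otherwise; callers that need every sample must opt out explicitly. A minimal sketch using a dataset that appears elsewhere in this patch:

.. code-block:: python

    import paddle

    # keep the final (possibly smaller) batch despite the new default
    train_reader = paddle.batch(
        paddle.dataset.uci_housing.train(), batch_size=200, drop_last=False)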
diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py
index 48100e5bf9..54a6904626 100644
--- a/tools/codestyle/docstring_checker.py
+++ b/tools/codestyle/docstring_checker.py
@@ -126,9 +126,10 @@ class DocstringChecker(BaseChecker):
         'W9002': ('Doc string does not end with "." period', symbol + "-end-with",
                   'Used when a doc string does not end with a period'),
-        'W9003': ('All args with their types must be mentioned in doc string',
-                  symbol + "-with-all-args",
-                  'Used when not all arguments are in the doc string '),
+        'W9003':
+        ('All args with their types must be mentioned in doc string %s',
+         symbol + "-with-all-args",
+         'Used when not all arguments are in the doc string '),
         'W9005': ('Missing docstring or docstring is too short',
                   symbol + "-missing", 'Add docstring longer >=10'),
         'W9006': ('Docstring indent error, use 4 space for indent',
@@ -178,6 +179,8 @@ class DocstringChecker(BaseChecker):
             self.indent_style(node)
 
     def missing_doc_string(self, node):
+        if node.name.startswith("_"):
+            return True
         if node.tolineno - node.fromlineno <= 10:
             return True
@@ -199,12 +202,17 @@ class DocstringChecker(BaseChecker):
         doc = node.doc
         lines = doc.splitlines()
 
+        line_num = 0
         for l in lines:
+            if line_num == 0:
+                line_num += 1
+                continue
             cur_indent = len(l) - len(l.lstrip())
             if cur_indent % indent != 0:
                 self.add_message('W9006', node=node, line=node.fromlineno)
                 return False
+            line_num += 1
 
         return True
@@ -320,15 +327,19 @@ class DocstringChecker(BaseChecker):
             return True
 
         parsed_args = doc.args
+        args_not_documented = set(args) - set(parsed_args)
         if len(args) > 0 and len(parsed_args) <= 0:
-            print "debug:parsed args: ", parsed_args
-            self.add_message('W9003', node=node, line=node.fromlineno)
+            self.add_message(
+                'W9003',
+                node=node,
+                line=node.fromlineno,
+                args=list(args_not_documented))
             return False
 
         for t in args:
             if t not in parsed_args:
-                print t, " with (type) not in ", parsed_args
-                self.add_message('W9003', node=node, line=node.fromlineno)
+                self.add_message(
+                    'W9003', node=node, line=node.fromlineno, args=[t, ])
             return False
 
         return True
diff --git a/tools/codestyle/pylint_pre_commit.hook b/tools/codestyle/pylint_pre_commit.hook
index e7c92ba671..150a3f5666 100755
--- a/tools/codestyle/pylint_pre_commit.hook
+++ b/tools/codestyle/pylint_pre_commit.hook
@@ -7,13 +7,13 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 export PYTHONPATH=$DIR:$PYTHONPATH
 
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
-for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
+for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do
     pylint --disable=all --load-plugins=docstring_checker \
         --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
     TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
done
 
-#exit $TOTAL_ERRORS
+exit $TOTAL_ERRORS
 #For now, just warning:
-exit 0
+#exit 0
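For reference, a hypothetical docstring that would satisfy the checks strengthened above (W9002: ends with a period, W9003: every argument listed with its type, W9006: 4-space indents); the function itself is made up for illustration:

.. code-block:: python

    def scale(x, factor=2):
        """
        Scale the input by a constant factor.

        Args:
            x (Variable): the value to scale.
            factor (int): the multiplier, default 2.

        Returns:
            Variable: the scaled value.
        """
        return x * factor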